From beeca08738c4c4024c81a591812bfe38f8c436c0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 28 Sep 2005 20:29:44 +1000 Subject: Don't call a NULL ack function in the generic IRQ code. Some IRQ controllers don't need an ack function (e.g. OpenPIC on PPC platforms) and for them we'd rather not have the overhead of doing an indirect call to a function that does nothing. Signed-off-by: Paul Mackerras --- kernel/irq/handle.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3ff7b925c387..51df337b37db 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -117,14 +117,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) /* * No locking required for CPU-local interrupts: */ - desc->handler->ack(irq); + if (desc->handler->ack) + desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); desc->handler->end(irq); return 1; } spin_lock(&desc->lock); - desc->handler->ack(irq); + if (desc->handler->ack) + desc->handler->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested -- cgit v1.2.3 From 9465bee863bc4c6cf1566c12d6f92a8133e3da5c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 21 Oct 2005 15:36:00 -0700 Subject: Revert "Fix cpu timers exit deadlock and races" Revert commit e03d13e985d48ac4885382c9e3b1510c78bd047f, to be replaced by a much nicer fix from Roland. --- kernel/posix-cpu-timers.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index b3f3edc475de..7a51a5597c33 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -387,19 +387,25 @@ int posix_cpu_timer_del(struct k_itimer *timer) if (unlikely(p == NULL)) return 0; - spin_lock(&p->sighand->siglock); if (!list_empty(&timer->it.cpu.entry)) { - /* - * Take us off the task's timer list. We don't need to - * take tasklist_lock and check for the task being reaped. - * If it was reaped, it already called posix_cpu_timers_exit - * and posix_cpu_timers_exit_group to clear all the timers - * that pointed to it. - */ - list_del(&timer->it.cpu.entry); - put_task_struct(p); + read_lock(&tasklist_lock); + if (unlikely(p->signal == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + BUG_ON(!list_empty(&timer->it.cpu.entry)); + } else { + /* + * Take us off the task's timer list. + */ + spin_lock(&p->sighand->siglock); + list_del(&timer->it.cpu.entry); + spin_unlock(&p->sighand->siglock); + } + read_unlock(&tasklist_lock); } - spin_unlock(&p->sighand->siglock); + put_task_struct(p); return 0; } -- cgit v1.2.3 From 25f407f0b668f5e4ebd5d13e1fb4306ba6427ead Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 21 Oct 2005 15:03:29 -0700 Subject: [PATCH] Call exit_itimers from do_exit, not __exit_signal When I originally moved exit_itimers into __exit_signal, that was the only place where we could reliably know it was the last thread in the group dying, without races. Since then we've gotten the signal_struct.live counter, and do_exit can reliably do group-wide cleanup work. This patch moves the call to do_exit, where it's made without locks. This avoids the deadlock issues that the old __exit_signal code's comment talks about, and the one that Oleg found recently with process CPU timers. 
[ This replaces e03d13e985d48ac4885382c9e3b1510c78bd047f, which is why it was just reverted. ] Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + kernel/posix-timers.c | 2 +- kernel/signal.c | 14 +------------- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 43077732619b..3b25b182d2be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -843,6 +843,7 @@ fastcall NORET_TYPE void do_exit(long code) group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { del_timer_sync(&tsk->signal->real_timer); + exit_itimers(tsk->signal); acct_process(code); } exit_mm(tsk); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index b7b532acd9fc..dda3cda73c77 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1157,7 +1157,7 @@ retry_delete: } /* - * This is called by __exit_signal, only when there are no more + * This is called by do_exit or de_thread, only when there are no more * references to the shared signal_struct. */ void exit_itimers(struct signal_struct *sig) diff --git a/kernel/signal.c b/kernel/signal.c index 50c992643771..f2b96b08fb44 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -397,20 +397,8 @@ void __exit_signal(struct task_struct *tsk) flush_sigqueue(&tsk->pending); if (sig) { /* - * We are cleaning up the signal_struct here. We delayed - * calling exit_itimers until after flush_sigqueue, just in - * case our thread-local pending queue contained a queued - * timer signal that would have been cleared in - * exit_itimers. When that called sigqueue_free, it would - * attempt to re-take the tasklist_lock and deadlock. This - * can never happen if we ensure that all queues the - * timer's signal might be queued on have been flushed - * first. The shared_pending queue, and our own pending - * queue are the only queues the timer could be on, since - * there are no other threads left in the group and timer - * signals are constrained to threads inside the group. + * We are cleaning up the signal_struct here. */ - exit_itimers(sig); exit_thread_group_keys(sig); kmem_cache_free(signal_cachep, sig); } -- cgit v1.2.3 From e80eda94d3eaf1d12cfc97878eff77cd679dabc9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Oct 2005 10:02:50 -0700 Subject: Posix timers: limit number of timers firing at once Bursty timers aren't good for anybody, very much including latency for other programs when we trigger lots of timers in interrupt context. So set a random limit, after which we'll handle the rest on the next timer tick. 
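The pattern being applied is simple work-bounding: each invocation handles at most a fixed number of expired timers (the patch picks 20), and whatever remains stays queued for the next tick. A minimal stand-alone sketch of that idea in C — the names (MAXFIRE, struct entry, collect_expired) are illustrative, not the kernel's:

	#define MAXFIRE 20	/* arbitrary cap, mirroring the patch's choice */

	struct entry {
		struct entry *next;
		unsigned long expires;	/* list kept sorted by this field */
	};

	/* Detach at most MAXFIRE expired entries; the rest wait for the next tick. */
	static struct entry *collect_expired(struct entry **head, unsigned long now)
	{
		struct entry *fired = NULL;
		int budget = MAXFIRE;

		while (*head && (*head)->expires <= now && budget-- > 0) {
			struct entry *e = *head;
			*head = e->next;
			e->next = fired;
			fired = e;
		}
		return fired;	/* caller processes these after dropping hot locks */
	}
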
Noted by Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 7a51a5597c33..d30b304a3384 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -961,14 +961,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { + int maxfire; struct list_head *timers = tsk->cpu_timers; + maxfire = 20; tsk->it_prof_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { tsk->it_prof_expires = t->expires.cpu; break; } @@ -977,12 +979,13 @@ static void check_thread_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; tsk->it_virt_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { tsk->it_virt_expires = t->expires.cpu; break; } @@ -991,12 +994,13 @@ static void check_thread_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; tsk->it_sched_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (tsk->sched_time < t->expires.sched) { + if (!--maxfire || tsk->sched_time < t->expires.sched) { tsk->it_sched_expires = t->expires.sched; break; } @@ -1013,6 +1017,7 @@ static void check_thread_timers(struct task_struct *tsk, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { + int maxfire; struct signal_struct *const sig = tsk->signal; cputime_t utime, stime, ptime, virt_expires, prof_expires; unsigned long long sched_time, sched_expires; @@ -1045,12 +1050,13 @@ static void check_process_timers(struct task_struct *tsk, } while (t != tsk); ptime = cputime_add(utime, stime); + maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(ptime, t->expires.cpu)) { + if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { prof_expires = t->expires.cpu; break; } @@ -1059,12 +1065,13 @@ static void check_process_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(utime, t->expires.cpu)) { + if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { virt_expires = t->expires.cpu; break; } @@ -1073,12 +1080,13 @@ static void check_process_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (sched_time < t->expires.sched) { + if (!--maxfire || sched_time < t->expires.sched) { sched_expires = t->expires.sched; break; } -- cgit v1.2.3 From 108150ea78003044e41150c75259447b2c0953b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 23 Oct 2005 20:25:39 +0400 Subject: [PATCH] posix-timers: fix cleanup_timers() and run_posix_cpu_timers() races 1. 
cleanup_timers() sets timer->task = NULL under tasklist + ->sighand locks. That means that this code in posix_cpu_timer_del() and posix_cpu_timer_set():

	lock_timer(timer);
	if (timer->task == NULL)
		return;
	read_lock(tasklist);
	put_task_struct(timer->task)

is racy. With this patch, timer->task is modified and accounted only under timer->it_lock. Sadly, this means that a dead task_struct won't be freed until the timer is deleted or armed. 2. run_posix_cpu_timers() collects expired timers into a local list under tasklist + ->sighand again. That means that posix_cpu_timer_del() should check timer->it.cpu.firing under these locks too. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index d30b304a3384..30ab39a27736 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; + int ret = 0; - if (timer->it.cpu.firing) - return TIMER_RETRY; - - if (unlikely(p == NULL)) - return 0; - - if (!list_empty(&timer->it.cpu.entry)) { + if (likely(p != NULL)) { read_lock(&tasklist_lock); if (unlikely(p->signal == NULL)) { /* @@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer) */ BUG_ON(!list_empty(&timer->it.cpu.entry)); } else { - /* - * Take us off the task's timer list. - */ spin_lock(&p->sighand->siglock); - list_del(&timer->it.cpu.entry); + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); spin_unlock(&p->sighand->siglock); } read_unlock(&tasklist_lock); + + if (!ret) + put_task_struct(p); } - put_task_struct(p); - return 0; + return ret; } /* @@ -424,8 +421,6 @@ static void cleanup_timers(struct list_head *head, cputime_t ptime = cputime_add(utime, stime); list_for_each_entry_safe(timer, next, head, entry) { - put_task_struct(timer->task); - timer->task = NULL; list_del_init(&timer->entry); if (cputime_lt(timer->expires.cpu, ptime)) { timer->expires.cpu = cputime_zero; @@ -437,8 +432,6 @@ static void cleanup_timers(struct list_head *head, ++head; list_for_each_entry_safe(timer, next, head, entry) { - put_task_struct(timer->task); - timer->task = NULL; list_del_init(&timer->entry); if (cputime_lt(timer->expires.cpu, utime)) { timer->expires.cpu = cputime_zero; @@ -450,8 +443,6 @@ static void cleanup_timers(struct list_head *head, ++head; list_for_each_entry_safe(timer, next, head, entry) { - put_task_struct(timer->task); - timer->task = NULL; list_del_init(&timer->entry); if (timer->expires.sched < sched_time) { timer->expires.sched = 0; -- cgit v1.2.3 From 3de463c7d9d58f8cf3395268230cb20a4c15bffa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 24 Oct 2005 14:34:03 +0400 Subject: [PATCH] posix-timers: remove false BUG_ON() from run_posix_cpu_timers() do_exit() clears ->it_##clock##_expires, but nothing prevents another cpu from attaching the timer to the exiting process after that. After exit_notify() does 'write_unlock_irq(&tasklist_lock)' and before do_exit() calls schedule(), a local timer interrupt can find tsk->exit_state != 0. If that state was EXIT_DEAD (or another cpu does sys_wait4) the interrupted task has ->signal == NULL.
At this moment exiting task has no pending cpu timers, they were cleaned up in __exit_signal()->posix_cpu_timers_exit{,_group}(), so we can just return from irq. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 -------- kernel/posix-cpu-timers.c | 36 ++++++++++++++++++------------------ 2 files changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3b25b182d2be..4897977a1f4b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -825,14 +825,6 @@ fastcall NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITING; - /* - * Make sure we don't try to process any timer firings - * while we are already exiting. - */ - tsk->it_virt_expires = cputime_zero; - tsk->it_prof_expires = cputime_zero; - tsk->it_sched_expires = 0; - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, current->pid, diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 30ab39a27736..ccb04683bf18 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1285,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk) #undef UNEXPIRED - BUG_ON(tsk->exit_state); - /* * Double-check with locks held. */ read_lock(&tasklist_lock); - spin_lock(&tsk->sighand->siglock); + if (likely(tsk->signal != NULL)) { + spin_lock(&tsk->sighand->siglock); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. 
+ */ + spin_unlock(&tsk->sighand->siglock); + } read_unlock(&tasklist_lock); /* -- cgit v1.2.3 From ca531a0a5e01e5122f67cb6aca8fcbfc70e18e0b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 24 Oct 2005 14:36:28 +0400 Subject: [PATCH] posix-timers: exit path cleanup No need to rebalance when task exited Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ccb04683bf18..92a038064628 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -486,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p, struct task_struct *t = p; unsigned int nthreads = atomic_read(&p->signal->live); + if (!nthreads) + return; + switch (clock_idx) { default: BUG(); @@ -1160,6 +1163,9 @@ static void check_process_timers(struct task_struct *tsk, unsigned long long sched_left, sched; const unsigned int nthreads = atomic_read(&sig->live); + if (!nthreads) + return; + prof_left = cputime_sub(prof_expires, utime); prof_left = cputime_sub(prof_left, stime); prof_left = cputime_div(prof_left, nthreads); -- cgit v1.2.3 From a69ac4a78d8bd9e1ec478bd7297d4f047fcd44a8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 24 Oct 2005 18:29:58 +0400 Subject: [PATCH] posix-timers: fix posix_cpu_timer_set() vs run_posix_cpu_timers() race This might be harmless, but looks like a race from code inspection (I was unable to trigger it). I must admit, I don't understand why we can't return TIMER_RETRY after 'spin_unlock(&p->sighand->siglock)' without doing bump_cpu_timer(), but this is what the original code does. posix_cpu_timer_set:

	read_lock(&tasklist_lock);
	spin_lock(&p->sighand->siglock);
	list_del_init(&timer->it.cpu.entry);
	spin_unlock(&p->sighand->siglock);

We are probably deleting the timer from run_posix_cpu_timers's 'firing' local list_head while run_posix_cpu_timers() does list_for_each_safe. Various bad things can happen, for example we can just delete this timer so that list_for_each() will not notice it and run_posix_cpu_timers() will not reset the '->firing' flag. In that case, ....

	if (timer->it.cpu.firing) {
		read_unlock(&tasklist_lock);
		timer->it.cpu.firing = -1;
		return TIMER_RETRY;
	}

sys_timer_settime() goes to 'retry:', calls posix_cpu_timer_set() again, it returns TIMER_RETRY ... Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 92a038064628..b15462b17a58 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -730,9 +730,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * Disarm any old timer after extracting its expiry time. */ BUG_ON(!irqs_disabled()); + + ret = 0; spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; - list_del_init(&timer->it.cpu.entry); + if (unlikely(timer->it.cpu.firing)) { + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + } else + list_del_init(&timer->it.cpu.entry); spin_unlock(&p->sighand->siglock); /* @@ -780,7 +786,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, } } - if (unlikely(timer->it.cpu.firing)) { + if (unlikely(ret)) { /* * We are colliding with the timer actually firing.
* Punt after filling in the timer's old value, and @@ -788,8 +794,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * it as an overrun (thanks to bump_cpu_timer above). */ read_unlock(&tasklist_lock); - timer->it.cpu.firing = -1; - ret = TIMER_RETRY; goto out; } -- cgit v1.2.3 From bb32051532fed727de0d513a9a578b54c0b7ea5a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 26 Oct 2005 01:59:01 -0700 Subject: [PATCH] export cpu_online_map With CONFIG_SMP=n: *** Warning: "cpu_online_map" [drivers/firmware/dcdbas.ko] undefined! due to set_cpus_allowed(). Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1f31a528fdba..1e5cafdf4e27 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3879,6 +3879,7 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map = CPU_MASK_ALL; +EXPORT_SYMBOL_GPL(cpu_online_map); cpumask_t cpu_possible_map = CPU_MASK_ALL; #endif -- cgit v1.2.3 From 70ab81c2ed3d1323e7d6805bf59cbb570dff7937 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 26 Oct 2005 11:23:06 -0700 Subject: posix cpu timers: fix timer ordering Pointed out by Oleg Nesterov, who has been walking over the code forwards and backwards. Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index b15462b17a58..2f86424fa515 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -576,17 +576,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { - if (next->expires.sched > nt->expires.sched) { - listpos = &next->entry; + if (next->expires.sched > nt->expires.sched) break; - } + listpos = &next->entry; } } else { list_for_each_entry(next, head, entry) { - if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { - listpos = &next->entry; + if (cputime_gt(next->expires.cpu, nt->expires.cpu)) break; - } + listpos = &next->entry; } } list_add(&nt->entry, listpos); -- cgit v1.2.3 From 7a4ed937aa44acdeb8c6ba671509dc7b54b09d3a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 Oct 2005 20:26:53 +0400 Subject: [PATCH] Fix cpu timers expiration time There's a silly off-by-one error in the code that updates the expiration of posix CPU timers, causing them to not be properly updated when they hit exactly on their expiration time (which should be the normal case). This causes them to then fire immediately again, and only _then_ get properly updated. Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2f86424fa515..383ba22f0b62 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, * Update expiry time from increment, and increase overrun count, * given the current clock sample. 
*/ -static inline void bump_cpu_timer(struct k_itimer *timer, +static void bump_cpu_timer(struct k_itimer *timer, union cpu_time_count now) { int i; @@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, for (i = 0; incr < delta - incr; i++) incr = incr << 1; for (; i >= 0; incr >>= 1, i--) { - if (delta <= incr) + if (delta < incr) continue; timer->it.cpu.expires.sched += incr; timer->it_overrun += 1 << i; @@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) incr = cputime_add(incr, incr); for (; i >= 0; incr = cputime_halve(incr), i--) { - if (cputime_le(delta, incr)) + if (cputime_lt(delta, incr)) continue; timer->it.cpu.expires.cpu = cputime_add(timer->it.cpu.expires.cpu, incr); -- cgit v1.2.3 From a362f463a6d316d14daed0f817e151835ce97ff7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 27 Oct 2005 09:07:33 -0700 Subject: Revert "remove false BUG_ON() from run_posix_cpu_timers()" This reverts commit 3de463c7d9d58f8cf3395268230cb20a4c15bffa. Roland has another patch that allows us to leave the BUG_ON() in place by just making sure that the condition it tests for really is always true. That goes in next. Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++++++ kernel/posix-cpu-timers.c | 36 ++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4897977a1f4b..3b25b182d2be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -825,6 +825,14 @@ fastcall NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITING; + /* + * Make sure we don't try to process any timer firings + * while we are already exiting. + */ + tsk->it_virt_expires = cputime_zero; + tsk->it_prof_expires = cputime_zero; + tsk->it_sched_expires = 0; + if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, current->pid, diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 383ba22f0b62..ea1aca5e7c2b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1293,30 +1293,30 @@ void run_posix_cpu_timers(struct task_struct *tsk) #undef UNEXPIRED + BUG_ON(tsk->exit_state); + /* * Double-check with locks held. */ read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); + spin_lock(&tsk->sighand->siglock); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); - } + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. 
We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); read_unlock(&tasklist_lock); /* -- cgit v1.2.3 From 72ab373a5688a78cbdaf3bf96012e597d5399bb7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 27 Oct 2005 03:16:42 -0700 Subject: [PATCH] Yet more posix-cpu-timer fixes This just makes sure that a thread's expiry times can't get reset after it clears them in do_exit. This is what allowed us to re-introduce the stricter BUG_ON() check in a362f463a6d316d14daed0f817e151835ce97ff7. Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ea1aca5e7c2b..bf374fceb39c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -497,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p, left = cputime_div(cputime_sub(expires.cpu, val.cpu), nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ticks = cputime_add(prof_ticks(t), left); if (cputime_eq(t->it_prof_expires, cputime_zero) || @@ -512,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p, left = cputime_div(cputime_sub(expires.cpu, val.cpu), nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ticks = cputime_add(virt_ticks(t), left); if (cputime_eq(t->it_virt_expires, cputime_zero) || @@ -527,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p, nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ns = t->sched_time + nsleft; if (t->it_sched_expires == 0 || t->it_sched_expires > ns) { @@ -566,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) struct cpu_timer_list *next; unsigned long i; + if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) + return; + head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); @@ -1204,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk, do { t = next_thread(t); - } while (unlikely(t->exit_state)); + } while (unlikely(t->flags & PF_EXITING)); } while (t != tsk); } } -- cgit v1.2.3 From 9796fdd829da626374458e8706daedcc0e432ddd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 21 Oct 2005 03:22:03 -0400 Subject: [PATCH] gfp_t: kernel/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/audit.h | 4 ++-- include/linux/suspend.h | 2 +- kernel/audit.c | 6 +++--- kernel/auditsc.c | 2 +- kernel/kexec.c | 7 +++---- kernel/power/swsusp.c | 2 +- 6 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index b2a2509bd7ea..da3c01955f3d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -260,11 +260,11 @@ extern int audit_filter_user(struct netlink_skb_parms *cb, int type); #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ -extern void audit_log(struct audit_context *ctx, int gfp_mask, +extern void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) 
__attribute__((format(printf,4,5))); -extern struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, int type); +extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) __attribute__((format(printf,2,3))); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index ad15a54806d8..ba448c760168 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -71,7 +71,7 @@ void restore_processor_state(void); struct saved_context; void __save_processor_state(struct saved_context *ctxt); void __restore_processor_state(struct saved_context *ctxt); -extern unsigned long get_usable_page(unsigned gfp_mask); +extern unsigned long get_usable_page(gfp_t gfp_mask); extern void free_eaten_memory(void); #endif /* _LINUX_SWSUSP_H */ diff --git a/kernel/audit.c b/kernel/audit.c index aefa73a8a586..0c56320d38dc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -133,7 +133,7 @@ struct audit_buffer { struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ - int gfp_mask; + gfp_t gfp_mask; }; static void audit_set_pid(struct audit_buffer *ab, pid_t pid) @@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ -struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, +struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab = NULL; @@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab) /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, int gfp_mask, int type, +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) 
{ struct audit_buffer *ab; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 88696f639aab..d8a68509e729 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab) up_read(&mm->mmap_sem); } -static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) +static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) { int i; struct audit_buffer *ab; diff --git a/kernel/kexec.c b/kernel/kexec.c index cdd4dcd8fb63..36c5d9cd4cc1 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p) static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, @@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image, return 0; } -static struct page *kimage_alloc_pages(unsigned int gfp_mask, - unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; @@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image, } static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long destination) { /* diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 2d5c45676442..10bc5ec496d7 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1095,7 +1095,7 @@ static inline void eat_page(void *page) *eaten_memory = c; } -unsigned long get_usable_page(unsigned gfp_mask) +unsigned long get_usable_page(gfp_t gfp_mask) { unsigned long m; -- cgit v1.2.3 From 8d027de54c77d38eedc9b331c7a2a39807d34691 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 29 Oct 2005 19:37:40 +0400 Subject: [PATCH] fix ->signal->live leak in copy_process() exit_signal() (called from copy_process's error path) should decrement ->signal->live, otherwise forking process will miss 'group_dead' in do_exit(). Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f2b96b08fb44..6904bbbfe116 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -406,6 +406,8 @@ void __exit_signal(struct task_struct *tsk) void exit_signal(struct task_struct *tsk) { + atomic_dec(&tsk->signal->live); + write_lock_irq(&tasklist_lock); __exit_signal(tsk); write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 943eae03143790c71cf42fe13529f1b74ceb0266 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 29 Oct 2005 07:32:07 +0100 Subject: [PATCH] missing exports of do_settimeofday() variants frv, sh64, ia64 and sparc64 do not have do_settimeofday() exported (the last two are using variant in kernel/time.c). Exports added to match the rest of architectures. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/frv/kernel/time.c | 1 + arch/sh64/kernel/time.c | 1 + kernel/time.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index 8d6558b00e44..f43b734482e3 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -221,6 +221,7 @@ int do_settimeofday(struct timespec *tv) clock_was_set(); return 0; } +EXPORT_SYMBOL(do_settimeofday); /* * Scheduler clock - returns current time in nanosec units. 
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c index f4a62a10053c..43e395a14f49 100644 --- a/arch/sh64/kernel/time.c +++ b/arch/sh64/kernel/time.c @@ -253,6 +253,7 @@ int do_settimeofday(struct timespec *tv) return 0; } +EXPORT_SYMBOL(do_settimeofday); static int set_rtc_time(unsigned long nowtime) { diff --git a/kernel/time.c b/kernel/time.c index 40c2410ac99a..a3c2100470e1 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -532,6 +532,7 @@ int do_settimeofday (struct timespec *tv) clock_was_set(); return 0; } +EXPORT_SYMBOL(do_settimeofday); void do_gettimeofday (struct timeval *tv) { -- cgit v1.2.3 From 4b8f573b5db02a3017afbba49026a6aef480174f Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sat, 29 Oct 2005 18:15:42 -0700 Subject: [PATCH] TIMERS: add missing compensation for HZ == 250 Add missing compensation for (HZ == 250) != (1 << SHIFT_HZ) in second_overflow(). Signed-off-by: YOSHIFUJI Hideaki Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 3ba10fa35b60..6a2e5f8dc725 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -752,6 +752,15 @@ static void second_overflow(void) else time_adj += (time_adj >> 2) + (time_adj >> 5); #endif +#if HZ == 250 + /* Compensate for (HZ==250) != (1 << SHIFT_HZ). + * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + if (time_adj < 0) + time_adj -= (-time_adj >> 6) + (-time_adj >> 7); + else + time_adj += (time_adj >> 6) + (time_adj >> 7); +#endif #if HZ == 1000 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) -- cgit v1.2.3 From ab50b8ed818016cfecd747d6d4bb9139986bc029 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:56 -0700 Subject: [PATCH] mm: vm_stat_account unshackled The original vm_stat_account has fallen into disuse, with only one user, and only one user of vm_stat_unaccount. It's easier to keep track if we convert them all to __vm_stat_account, then free it from its __shackles. 
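The shape of this cleanup: the two thin wrappers (account and unaccount) collapse into the one underlying function that already takes a signed page count, so mapping paths pass +pages and unmapping paths pass -pages. A rough sketch of the resulting API, with simplified stand-in types — mm_stats and VMF_EXEC are hypothetical, not the kernel's:

	#define VMF_EXEC 0x1	/* hypothetical flag bit standing in for VM_EXEC */

	struct mm_stats { long exec_vm; long stack_vm; };

	/* One signed-count entry point replaces the account/unaccount pair. */
	static void stat_account(struct mm_stats *st, unsigned long flags, long pages)
	{
		if (flags & VMF_EXEC)
			st->exec_vm += pages;	/* a negative 'pages' undoes a mapping */
	}

Callers then write stat_account(st, flags, npages) on map and stat_account(st, flags, -npages) on unmap, and the wrapper names disappear.
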
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 3 ++- arch/ia64/mm/fault.c | 2 +- include/linux/mm.h | 16 ++-------------- kernel/fork.c | 2 +- mm/mmap.c | 20 ++++++++++---------- mm/mprotect.c | 4 ++-- mm/mremap.c | 4 ++-- 7 files changed, 20 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d71731ee5b61..f7dfc107cb7b 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon insert_vm_struct(mm, vma); mm->total_vm += size >> PAGE_SHIFT; - vm_stat_account(vma); + vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, + vma_pages(vma)); up_write(&task->mm->mmap_sem); /* diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 3c32af910d60..f21b55549787 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -41,7 +41,7 @@ expand_backing_store (struct vm_area_struct *vma, unsigned long address) vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); + vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index e1649578fb0c..376a466743bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -928,26 +928,14 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, unsigned long, pgprot_t); #ifdef CONFIG_PROC_FS -void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); +void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); #else -static inline void __vm_stat_account(struct mm_struct *mm, +static inline void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { } #endif /* CONFIG_PROC_FS */ -static inline void vm_stat_account(struct vm_area_struct *vma) -{ - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); -} - -static inline void vm_stat_unaccount(struct vm_area_struct *vma) -{ - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - -vma_pages(vma)); -} - /* update per process rss and vm hiwater data */ extern void update_mem_hiwater(struct task_struct *tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 280bd44ac441..e2ff11f8c1b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -212,7 +212,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; - __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, + vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } diff --git a/mm/mmap.c b/mm/mmap.c index fa11d91242e8..e1780266ac7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -832,7 +832,7 @@ none: } #ifdef CONFIG_PROC_FS -void __vm_stat_account(struct mm_struct *mm, unsigned long flags, +void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { const unsigned long stack_flags @@ -1110,7 +1110,7 @@ munmap_back: } out: mm->total_vm += len >> PAGE_SHIFT; - __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); @@ -1475,7 +1475,7 @@ static int acct_stack_growth(struct 
vm_area_struct * vma, unsigned long size, un mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } @@ -1610,15 +1610,15 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) * By the time this function is called, the area struct has been * removed from the process mapping list. */ -static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *vma) { - size_t len = area->vm_end - area->vm_start; + long nrpages = vma_pages(vma); - area->vm_mm->total_vm -= len >> PAGE_SHIFT; - if (area->vm_flags & VM_LOCKED) - area->vm_mm->locked_vm -= len >> PAGE_SHIFT; - vm_stat_unaccount(area); - remove_vm_struct(area); + mm->total_vm -= nrpages; + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= nrpages; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + remove_vm_struct(vma); } /* diff --git a/mm/mprotect.c b/mm/mprotect.c index 57577f63b305..b426f01c5e9c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -168,8 +168,8 @@ success: vma->vm_flags = newflags; vma->vm_page_prot = newprot; change_protection(vma, start, end, newprot); - __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - __vm_stat_account(mm, newflags, vma->vm_file, nrpages); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; fail: diff --git a/mm/mremap.c b/mm/mremap.c index f343fc73a8bd..55df8f53e84d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -233,7 +233,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * since do_munmap() will decrement it by old_len == new_len */ mm->total_vm += new_len >> PAGE_SHIFT; - __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ @@ -384,7 +384,7 @@ unsigned long do_mremap(unsigned long addr, addr + new_len, vma->vm_pgoff, NULL); current->mm->total_vm += pages; - __vm_stat_account(vma->vm_mm, vma->vm_flags, + vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { current->mm->locked_vm += pages; -- cgit v1.2.3 From 404351e67a9facb475abf1492245374a28d13e90 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:04 -0700 Subject: [PATCH] mm: mm_init set_mm_counters How is anon_rss initialized? In dup_mmap, and by mm_alloc's memset; but that's not so good if an mm_counter_t is a special type. And how is rss initialized? By set_mm_counter, all over the place. Come on, we just need to initialize them both at once by set_mm_counter in mm_init (which follows the memcpy when forking). 
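The reasoning step worth making explicit: fork builds the child mm by copying the parent's struct and then calls mm_init on it, so a reset done in mm_init is guaranteed to run on every path after the copy — exactly the property needed if mm_counter_t later becomes an atomic or otherwise non-trivially-copyable type. A toy sketch of the pattern (this struct mm is hypothetical, not the kernel's):

	struct mm { long rss; long anon_rss; /* ...dozens of other fields... */ };

	static void mm_init_counters(struct mm *mm)
	{
		/* Runs after the wholesale struct copy in fork: reset the
		 * counters once here, instead of in every binfmt loader. */
		mm->rss = 0;
		mm->anon_rss = 0;
	}
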
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/irixelf.c | 1 - arch/sparc64/kernel/binfmt_aout32.c | 1 - arch/x86_64/ia32/ia32_aout.c | 1 - fs/binfmt_aout.c | 1 - fs/binfmt_elf.c | 1 - fs/binfmt_elf_fdpic.c | 7 ------- fs/binfmt_flat.c | 1 - fs/binfmt_som.c | 1 - kernel/fork.c | 4 ++-- 9 files changed, 2 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index 99262fe64560..7ce34d4aa220 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c @@ -697,7 +697,6 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Do this so that we can load the interpreter, if need be. We will * change some of these later. */ - set_mm_counter(current->mm, rss, 0); setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); current->mm->start_stack = bprm->p; diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c index b2854ef221d0..edf52d06b280 100644 --- a/arch/sparc64/kernel/binfmt_aout32.c +++ b/arch/sparc64/kernel/binfmt_aout32.c @@ -241,7 +241,6 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index 3e6780fa0186..93c60f4aa47a 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = TASK_UNMAPPED_BASE; current->mm->cached_hole_size = 0; - set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index dd9baabaf016..72011826f0cb 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; - set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d4b15576e584..918ccc267e41 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Do this so that we can load the interpreter, if need be. 
We will change some of these later */ - set_mm_counter(current->mm, rss, 0); current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 134c9c0d1f54..dda87c4c82a3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs &interp_params, ¤t->mm->start_stack, ¤t->mm->start_brk); -#endif - - /* do this so that we can load the interpreter, if need be - * - we will change some of these later - */ - set_mm_counter(current->mm, rss, 0); -#ifdef CONFIG_MMU retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); if (retval < 0) { send_sig(SIGKILL, current, 0); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 7974efa107bc..9d6625829b99 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm, current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; - set_mm_counter(current->mm, rss, 0); } if (flags & FLAT_FLAG_KTRACE) diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 227a2682d2bf..00a91dc25d16 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) create_som_tables(bprm); current->mm->start_stack = bprm->p; - set_mm_counter(current->mm, rss, 0); #if 0 printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); diff --git a/kernel/fork.c b/kernel/fork.c index e2ff11f8c1b0..25caa02e2eac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -198,8 +198,6 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; - set_mm_counter(mm, rss, 0); - set_mm_counter(mm, anon_rss, 0); cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; @@ -323,6 +321,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; + set_mm_counter(mm, rss, 0); + set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; -- cgit v1.2.3 From 4294621f41a85497019fae64341aa5351a1921b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:05 -0700 Subject: [PATCH] mm: rss = file_rss + anon_rss I was lazy when we added anon_rss, and chose to change as few places as possible. So currently each anonymous page has to be counted twice, in rss and in anon_rss. Which won't be so good if those are atomic counts in some configurations. Change that around: keep file_rss and anon_rss separately, and add them together (with get_mm_rss macro) when the total is needed - reading two atomics is much cheaper than updating two atomics. And update anon_rss upfront, typically in memory.c, not tucked away in page_add_anon_rmap. 
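The invariant after this patch: the total rss is never stored, only derived, so each page event touches exactly one counter and the rarer readers pay for the addition via the new get_mm_rss(). A self-contained sketch of that split-counter trade-off using C11 atomics (the names here are illustrative):

	#include <stdatomic.h>

	struct mm_counters {
		atomic_long file_rss;	/* file-backed pages */
		atomic_long anon_rss;	/* anonymous pages */
	};

	/* Writers update exactly one counter per mapped/unmapped page... */
	static void add_anon(struct mm_counters *c, long n) { atomic_fetch_add(&c->anon_rss, n); }
	static void add_file(struct mm_counters *c, long n) { atomic_fetch_add(&c->file_rss, n); }

	/* ...and readers sum on demand: two atomic loads are much cheaper
	 * than two atomic read-modify-writes on every page fault. */
	static long get_rss(struct mm_counters *c)
	{
		return atomic_load(&c->file_rss) + atomic_load(&c->anon_rss);
	}
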
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- fs/proc/array.c | 2 +- fs/proc/task_mmu.c | 8 +++----- include/linux/sched.h | 4 +++- kernel/acct.c | 2 +- kernel/fork.c | 4 ++-- mm/fremap.c | 4 ++-- mm/hugetlb.c | 6 +++--- mm/memory.c | 31 +++++++++++++++++-------------- mm/nommu.c | 2 +- mm/rmap.c | 8 +++----- mm/swapfile.c | 2 +- 12 files changed, 38 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index d2208f7c87db..cefadf5ab83b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -330,7 +330,7 @@ void install_arg_page(struct vm_area_struct *vma, pte_unmap(pte); goto out; } - inc_mm_counter(mm, rss); + inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); diff --git a/fs/proc/array.c b/fs/proc/array.c index d84eecacbeaf..3e1239e4b303 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) jiffies_to_clock_t(it_real_value), start_time, vsize, - mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */ + mm ? get_mm_rss(mm) : 0, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 994612bc72d0..bccee7cf9ccd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -29,7 +29,7 @@ char *task_mem(struct mm_struct *mm, char *buffer) "VmPTE:\t%8lu kB\n", (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), - get_mm_counter(mm, rss) << (PAGE_SHIFT-10), + get_mm_rss(mm) << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); @@ -44,13 +44,11 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - int rss = get_mm_counter(mm, rss); - - *shared = rss - get_mm_counter(mm, anon_rss); + *shared = get_mm_counter(mm, file_rss); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; - *resident = rss; + *resident = *shared + get_mm_counter(mm, anon_rss); return mm->total_vm; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 27519df0f987..afcaac66cbd5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -254,6 +254,8 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define add_mm_counter(mm, member, value) (mm)->_##member += (value) #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- +#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) + typedef unsigned long mm_counter_t; struct mm_struct { @@ -286,7 +288,7 @@ struct mm_struct { unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; /* Special counters protected by the page_table_lock */ - mm_counter_t _rss; + mm_counter_t _file_rss; mm_counter_t _anon_rss; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ diff --git a/kernel/acct.c b/kernel/acct.c index b756f527497e..2e3f4a47e7d0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk) if (delta == 0) return; tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * 
tsk->mm->total_vm; } } diff --git a/kernel/fork.c b/kernel/fork.c index 25caa02e2eac..2048ed7b5872 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -321,7 +321,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; - set_mm_counter(mm, rss, 0); + set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); @@ -499,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (retval) goto free_pt; - mm->hiwater_rss = get_mm_counter(mm,rss); + mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; good_mm: diff --git a/mm/fremap.c b/mm/fremap.c index ab23a0673c35..fd7f2a17ff3e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -39,7 +39,7 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_page_dirty(page); page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, rss); + dec_mm_counter(mm, file_rss); } } } else { @@ -95,7 +95,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, zap_pte(mm, vma, addr, pte); - inc_mm_counter(mm,rss); + inc_mm_counter(mm, file_rss); flush_icache_page(vma, page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); page_add_file_rmap(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61d380678030..094455bcbbf7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -286,7 +286,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -324,7 +324,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, page = pte_page(pte); put_page(page); - add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE)); + add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); } flush_tlb_range(vma, start, end); } @@ -386,7 +386,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) goto out; } } - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); } out: diff --git a/mm/memory.c b/mm/memory.c index 51eb38574830..59d42e50fa53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -397,9 +397,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - inc_mm_counter(dst_mm, rss); if (PageAnon(page)) inc_mm_counter(dst_mm, anon_rss); + else + inc_mm_counter(dst_mm, file_rss); set_pte_at(dst_mm, addr, dst_pte, pte); page_dup_rmap(page); } @@ -581,8 +582,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, set_page_dirty(page); if (pte_young(ptent)) mark_page_accessed(page); + dec_mm_counter(tlb->mm, file_rss); } - dec_mm_counter(tlb->mm, rss); page_remove_rmap(page); tlb_remove_page(tlb, page); continue; @@ -1290,13 +1291,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, orig_pte))) { - if (PageAnon(old_page)) - dec_mm_counter(mm, anon_rss); if (PageReserved(old_page)) - inc_mm_counter(mm, rss); - else + inc_mm_counter(mm, anon_rss); + else { page_remove_rmap(old_page); - + if (!PageAnon(old_page)) { + 
inc_mm_counter(mm, anon_rss); + dec_mm_counter(mm, file_rss); + } + } flush_cache_page(vma, address, pfn); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -1701,7 +1704,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* The page isn't present yet, go ahead with the fault. */ - inc_mm_counter(mm, rss); + inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1774,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(page); goto unlock; } - inc_mm_counter(mm, rss); + inc_mm_counter(mm, anon_rss); entry = mk_pte(page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); lru_cache_add_active(page); @@ -1887,19 +1890,19 @@ retry: */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { - if (!PageReserved(new_page)) - inc_mm_counter(mm, rss); - flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); if (anon) { + inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else + } else if (!PageReserved(new_page)) { + inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); + } } else { /* One of our sibling threads was faster, back out. */ page_cache_release(new_page); @@ -2192,7 +2195,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); void update_mem_hiwater(struct task_struct *tsk) { if (tsk->mm) { - unsigned long rss = get_mm_counter(tsk->mm, rss); + unsigned long rss = get_mm_rss(tsk->mm); if (tsk->mm->hiwater_rss < rss) tsk->mm->hiwater_rss = rss; diff --git a/mm/nommu.c b/mm/nommu.c index 0ef241ae3763..599924886eb5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1083,7 +1083,7 @@ void update_mem_hiwater(struct task_struct *tsk) unsigned long rss; if (likely(tsk->mm)) { - rss = get_mm_counter(tsk->mm, rss); + rss = get_mm_rss(tsk->mm); if (tsk->mm->hiwater_rss < rss) tsk->mm->hiwater_rss = rss; if (tsk->mm->hiwater_vm < tsk->mm->total_vm) diff --git a/mm/rmap.c b/mm/rmap.c index 1fc559e09ca8..504757624cce 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -445,8 +445,6 @@ void page_add_anon_rmap(struct page *page, { BUG_ON(PageReserved(page)); - inc_mm_counter(vma->vm_mm, anon_rss); - if (atomic_inc_and_test(&page->_mapcount)) { struct anon_vma *anon_vma = vma->anon_vma; @@ -561,9 +559,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); dec_mm_counter(mm, anon_rss); - } + } else + dec_mm_counter(mm, file_rss); - dec_mm_counter(mm, rss); page_remove_rmap(page); page_cache_release(page); @@ -667,7 +665,7 @@ static void try_to_unmap_cluster(unsigned long cursor, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, rss); + dec_mm_counter(mm, file_rss); (*mapcount)--; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 05c851291241..296e0bbf7836 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -407,7 +407,7 @@ void free_swap_and_cache(swp_entry_t entry) static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, swp_entry_t entry, struct page *page) { - inc_mm_counter(vma->vm_mm, rss); + inc_mm_counter(vma->vm_mm, anon_rss); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, 
vma->vm_page_prot))); -- cgit v1.2.3 From fd3e42fcc888a773572282575d2fdbf5cfd6216e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:06 -0700 Subject: [PATCH] mm: dup_mmap use oldmm more Use the parent's oldmm throughout dup_mmap, instead of perversely going back to current->mm. (Can you hear the sigh of relief from those mpnts? Usually I squash them, but not today.) Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2048ed7b5872..0e7fe4a8a8df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -182,16 +182,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } #ifdef CONFIG_MMU -static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) +static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct * mpnt, *tmp, **pprev; + struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(current->mm); + flush_cache_mm(oldmm); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; @@ -204,7 +204,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = NULL; pprev = &mm->mmap; - for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -265,7 +265,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, current->mm, tmp); + retval = copy_page_range(mm, oldmm, tmp); spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) @@ -277,7 +277,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) retval = 0; out: - flush_tlb_mm(current->mm); + flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_policy: -- cgit v1.2.3 From 7ee78232501ea9de2b6c8f10d32c9a0fee541357 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:08 -0700 Subject: [PATCH] mm: dup_mmap down new mmap_sem One anomaly remains from when Andrea rationalized the responsibilities of mmap_sem and page_table_lock: in dup_mmap we add vmas to the child holding its page_table_lock, but not the mmap_sem which normally guards the vma list and rbtree. Which could be an issue for unuse_mm: though since it just walks down the list (today with page_table_lock, tomorrow not), it's probably okay. Will need a memory barrier? Oh, keep it simple, Nick and I agreed, no harm in taking child's mmap_sem here. 
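In outline, the lock nesting this patch gives dup_mmap is the following (a condensed sketch of the change below, with error paths and the vma copy loop omitted; not the complete function):

	down_write(&oldmm->mmap_sem);	/* parent's mmap_sem, as before */
	flush_cache_mm(oldmm);
	down_write(&mm->mmap_sem);	/* child's mmap_sem: the new mm is not
					 * yet reachable by anyone else, so the
					 * nested write lock cannot deadlock */
	/* ... copy vmas and page tables from oldmm into mm ... */
	up_write(&mm->mmap_sem);	/* release in reverse order */
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);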
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0e7fe4a8a8df..2a587b3224e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -192,6 +192,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) down_write(&oldmm->mmap_sem); flush_cache_mm(oldmm); + down_write(&mm->mmap_sem); + mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; @@ -251,10 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } /* - * Link in the new vma and copy the page table entries: - * link in first so that swapoff can see swap entries. - * Note that, exceptionally, here the vma is inserted - * without holding mm->mmap_sem. + * Link in the new vma and copy the page table entries. */ spin_lock(&mm->page_table_lock); *pprev = tmp; @@ -275,8 +274,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto out; } retval = 0; - out: + up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; -- cgit v1.2.3 From b5810039a54e5babf428e9a1e89fc1940fabff11 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 29 Oct 2005 18:16:12 -0700 Subject: [PATCH] core remove PageReserved Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearing of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and it can be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed (a generic page_is_ram() should work). A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and thus mapcounted and counted towards shared rss). These writes to the struct page could cause excessive cacheline bouncing on big systems. There are a number of ways this could be addressed if it is an issue.
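The flavour of the change is easiest to see in put_page, where the reserved-page test disappears from the refcounting fast path (a before/after sketch condensed from the include/linux/mm.h hunk below):

	/* before: reserved pages were exempt from refcount-based freeing */
	static inline void put_page(struct page *page)
	{
		if (!PageReserved(page) && put_page_testzero(page))
			__page_cache_release(page);
	}

	/* after: any page the core VM sees is plainly refcounted */
	static inline void put_page(struct page *page)
	{
		if (put_page_testzero(page))
			__page_cache_release(page);
	}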
Signed-off-by: Nick Piggin Refcount bug fix for filemap_xip.c Signed-off-by: Carsten Otte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/vdso.c | 12 +++-- arch/sparc/mm/generic.c | 3 ++ arch/sparc64/mm/generic.c | 3 ++ drivers/scsi/sg.c | 12 +++-- drivers/scsi/st.c | 10 ++-- fs/direct-io.c | 4 +- include/linux/mm.h | 5 +- kernel/power/swsusp.c | 25 +++++---- mm/bootmem.c | 1 + mm/filemap_xip.c | 11 ++-- mm/fremap.c | 23 ++++---- mm/madvise.c | 2 +- mm/memory.c | 131 ++++++++++++++++++++++++++++------------------ mm/mempolicy.c | 29 +++++----- mm/mmap.c | 11 ++++ mm/mprotect.c | 8 +++ mm/msync.c | 17 +++--- mm/page_alloc.c | 14 ++--- mm/rmap.c | 14 ++--- mm/shmem.c | 4 +- mm/swap.c | 4 +- sound/core/pcm_native.c | 9 ++-- 22 files changed, 218 insertions(+), 134 deletions(-) (limited to 'kernel') diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c index efa985f05aca..4aacf521e3e4 100644 --- a/arch/ppc64/kernel/vdso.c +++ b/arch/ppc64/kernel/vdso.c @@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma, return NOPAGE_SIGBUS; /* - * Last page is systemcfg, special handling here, no get_page() a - * this is a reserved page + * Last page is systemcfg. */ if ((vma->vm_end - address) <= PAGE_SIZE) - return virt_to_page(systemcfg); + pg = virt_to_page(systemcfg); + else + pg = virt_to_page(vbase + offset); - pg = virt_to_page(vbase + offset); get_page(pg); DBG(" ->page count: %d\n", page_count(pg)); @@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack) * gettimeofday will be totally dead. It's fine to use that for setting * breakpoints in the vDSO code pages though */ - vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED; vma->vm_flags |= mm->def_flags; vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; vma->vm_ops = &vdso_vmops; @@ -603,6 +603,8 @@ void __init vdso_init(void) ClearPageReserved(pg); get_page(pg); } + + get_page(virt_to_page(systemcfg)); } int in_gate_area_no_task(unsigned long addr) diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c index 20ccb957fb77..659c9a71f867 100644 --- a/arch/sparc/mm/generic.c +++ b/arch/sparc/mm/generic.c @@ -73,6 +73,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, int space = GET_IOSPACE(pfn); unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; + /* See comment in mm/memory.c remap_pfn_range */ + vma->vm_flags |= VM_IO | VM_RESERVED; + prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c index c954d91f01d0..afc01cec701f 100644 --- a/arch/sparc64/mm/generic.c +++ b/arch/sparc64/mm/generic.c @@ -127,6 +127,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, int space = GET_IOSPACE(pfn); unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; + /* See comment in mm/memory.c remap_pfn_range */ + vma->vm_flags |= VM_IO | VM_RESERVED; + prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 861e51375d70..2d30b46806bf 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1886,13 +1886,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages, int i; for (i=0; i < nr_pages; i++) { - if (dirtied && !PageReserved(sgl[i].page)) - SetPageDirty(sgl[i].page); - /* unlock_page(sgl[i].page); 
*/ + struct page *page = sgl[i].page; + + /* XXX: just for debug. Remove when PageReserved is removed */ + BUG_ON(PageReserved(page)); + if (dirtied) + SetPageDirty(page); + /* unlock_page(page); */ /* FIXME: cache flush missing for rw==READ * FIXME: call the correct reference counting function */ - page_cache_release(sgl[i].page); + page_cache_release(page); } return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 5eb54d8019b4..da9766283bd7 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p int i; for (i=0; i < nr_pages; i++) { - if (dirtied && !PageReserved(sgl[i].page)) - SetPageDirty(sgl[i].page); + struct page *page = sgl[i].page; + + /* XXX: just for debug. Remove when PageReserved is removed */ + BUG_ON(PageReserved(page)); + if (dirtied) + SetPageDirty(page); /* FIXME: cache flush missing for rw==READ * FIXME: call the correct reference counting function */ - page_cache_release(sgl[i].page); + page_cache_release(page); } return 0; diff --git a/fs/direct-io.c b/fs/direct-io.c index 0d06097bc995..3931e7f1e6bf 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio) up_read(¤t->mm->mmap_sem); if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { + struct page *page = ZERO_PAGE(dio->curr_user_address); /* * A memory fault, but the filesystem has some outstanding * mapped blocks. We need to use those blocks up to avoid @@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio) */ if (dio->page_errors == 0) dio->page_errors = ret; - dio->pages[0] = ZERO_PAGE(dio->curr_user_address); + page_cache_get(page); + dio->pages[0] = page; dio->head = 0; dio->tail = 1; ret = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index 0c64484d8ae0..da42093250c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_RESERVED 0x00080000 /* Pages managed in a special way */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ @@ -338,7 +338,7 @@ static inline void get_page(struct page *page) static inline void put_page(struct page *page) { - if (!PageReserved(page) && put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } @@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 10bc5ec496d7..016504ccfccf 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone) continue; page = pfn_to_page(pfn); /* - * This condition results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. 
This should be - * corrected eventually when the cases giving rise to this - * are better understood. + * PageReserved results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. + * + * rvmalloc should not cause this, because all implementations + * appear to always be using vmalloc_32 on architectures with + * highmem. This is a good thing, because we would like to save + * rvmalloc pages. + * + * It appears to be triggered by pages which do not point to + * valid memory (see arch/i386/mm/init.c:one_highpage_init(), + * which sets PageReserved if the page does not point to valid + * RAM. + * + * XXX: must remove usage of PageReserved! */ - if (PageReserved(page)) { - printk("highmem reserved page?!\n"); + if (PageReserved(page)) continue; - } BUG_ON(PageNosave(page)); if (PageNosaveFree(page)) continue; @@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn) return 0; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; - if (PageReserved(page) && pfn_is_nosave(pfn)) { + if (pfn_is_nosave(pfn)) { pr_debug("[nosave pfn 0x%lx]", pfn); return 0; } diff --git a/mm/bootmem.c b/mm/bootmem.c index a58699b6579e..e8c567177dcf 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) if (j + 16 < BITS_PER_LONG) prefetchw(page + j + 16); __ClearPageReserved(page + j); + set_page_count(page + j, 0); } __free_pages(page, order); i += BITS_PER_LONG; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 8c199f537732..9354ee279b13 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping, unsigned long address; pte_t *pte; pte_t pteval; + struct page *page = ZERO_PAGE(address); spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { @@ -185,15 +186,17 @@ __xip_unmap (struct address_space * mapping, * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... */ - pte = page_check_address(ZERO_PAGE(address), mm, - address); + pte = page_check_address(page, mm, address); if (!IS_ERR(pte)) { /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); + page_remove_rmap(page); + dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap(pte); spin_unlock(&mm->page_table_lock); + page_cache_release(page); } } spin_unlock(&mapping->i_mmap_lock); @@ -228,7 +231,7 @@ xip_file_nopage(struct vm_area_struct * area, page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); if (!IS_ERR(page)) { - return page; + goto out; } if (PTR_ERR(page) != -ENODATA) return NULL; @@ -249,6 +252,8 @@ xip_file_nopage(struct vm_area_struct * area, page = ZERO_PAGE(address); } +out: + page_cache_get(page); return page; } diff --git a/mm/fremap.c b/mm/fremap.c index fd7f2a17ff3e..224cc1598b35 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -29,19 +29,20 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, return; if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); + struct page *page; flush_cache_page(vma, addr, pfn); pte = ptep_clear_flush(vma, addr, ptep); - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - if (!PageReserved(page)) { - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); - dec_mm_counter(mm, file_rss); - } + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return; } + page = pfn_to_page(pfn); + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); + dec_mm_counter(mm, file_rss); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); @@ -65,6 +66,8 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pgd_t *pgd; pte_t pte_val; + BUG_ON(vma->vm_flags & VM_RESERVED); + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -125,6 +128,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pgd_t *pgd; pte_t pte_val; + BUG_ON(vma->vm_flags & VM_RESERVED); + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); diff --git a/mm/madvise.c b/mm/madvise.c index 20e075d1c64c..17aaf3e16449 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/mm/memory.c b/mm/memory.c index da642b5528fa..e83f9440bb66 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -342,6 +342,23 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) #define NO_RSS 2 /* Increment neither file_rss nor anon_rss */ +/* + * This function is called to print an error when a pte in a + * !VM_RESERVED region is found pointing to an invalid pfn (which + * is an error. + * + * The calling function must still handle the error. + */ +void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +{ + printk(KERN_ERR "Bad pte = %08llx, process = %s, " + "vm_flags = %lx, vaddr = %lx\n", + (long long)pte_val(pte), + (vma->vm_mm == current->mm ? current->comm : "???"), + vma->vm_flags, vaddr); + dump_stack(); +} + /* * copy one vm_area from one task to the other. 
Assumes the page tables * already present in the new task to be cleared in the whole range @@ -353,9 +370,10 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) static inline int copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { + unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; unsigned long pfn; @@ -375,18 +393,22 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } + /* If the region is VM_RESERVED, the mapping is not + * mapped via rmap - duplicate the pte as is. + */ + if (vm_flags & VM_RESERVED) + goto out_set_pte; + pfn = pte_pfn(pte); - /* the pte points outside of valid memory, the - * mapping is assumed to be good, meaningful - * and not mapped via rmap - duplicate the - * mapping as is. + /* If the pte points outside of valid memory but + * the region is not VM_RESERVED, we have a problem. */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + goto out_set_pte; /* try to do something sane */ + } - if (!page || PageReserved(page)) - goto out_set_pte; + page = pfn_to_page(pfn); /* * If it's a COW mapping, write protect it both @@ -418,7 +440,6 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; - unsigned long vm_flags = vma->vm_flags; int progress = 0; int rss[NO_RSS+1], anon; @@ -446,8 +467,7 @@ again: progress++; continue; } - anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, - vm_flags, addr); + anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr); rss[anon]++; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -541,10 +561,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, return 0; } -static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, +static void zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { + struct mm_struct *mm = tlb->mm; pte_t *pte; int file_rss = 0; int anon_rss = 0; @@ -556,11 +578,12 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, continue; if (pte_present(ptent)) { struct page *page = NULL; - unsigned long pfn = pte_pfn(ptent); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; + if (!(vma->vm_flags & VM_RESERVED)) { + unsigned long pfn = pte_pfn(ptent); + if (unlikely(!pfn_valid(pfn))) + print_bad_pte(vma, ptent, addr); + else + page = pfn_to_page(pfn); } if (unlikely(details) && page) { /* @@ -580,7 +603,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) @@ -588,7 +611,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, addr) != page->index) - set_pte_at(tlb->mm, addr, pte, + set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) anon_rss++; @@ -611,14 +634,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, continue; if 
(!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); + pte_clear_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); - add_mm_rss(tlb->mm, -file_rss, -anon_rss); + add_mm_rss(mm, -file_rss, -anon_rss); pte_unmap(pte - 1); } -static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, +static inline void zap_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { @@ -630,11 +654,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - zap_pte_range(tlb, pmd, addr, next, details); + zap_pte_range(tlb, vma, pmd, addr, next, details); } while (pmd++, addr = next, addr != end); } -static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, +static inline void zap_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { @@ -646,7 +671,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - zap_pmd_range(tlb, pud, addr, next, details); + zap_pmd_range(tlb, vma, pud, addr, next, details); } while (pud++, addr = next, addr != end); } @@ -667,7 +692,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - zap_pud_range(tlb, pgd, addr, next, details); + zap_pud_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } @@ -967,7 +992,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & VM_IO) + if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) || !(flags & vma->vm_flags)) return i ? : -EFAULT; @@ -1027,8 +1052,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) { pages[i] = page; flush_dcache_page(page); - if (!PageReserved(page)) - page_cache_get(page); + page_cache_get(page); } if (vmas) vmas[i] = vma; @@ -1051,7 +1075,11 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, if (!pte) return -ENOMEM; do { - pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); + struct page *page = ZERO_PAGE(addr); + pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); + page_cache_get(page); + page_add_file_rmap(page); + inc_mm_counter(mm, file_rss); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1132,8 +1160,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; do { BUG_ON(!pte_none(*pte)); - if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); @@ -1195,8 +1222,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED tells swapout not to try to touch - * this region. + * VM_RESERVED tells the core MM not to "manage" these pages + * (e.g. refcount, mapcount, try to swap them out). 
*/ vma->vm_flags |= VM_IO | VM_RESERVED; @@ -1256,11 +1283,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret = VM_FAULT_MINOR; + BUG_ON(vma->vm_flags & VM_RESERVED); + if (unlikely(!pfn_valid(pfn))) { /* * Page table corrupted: show pte and kill process. */ - pte_ERROR(orig_pte); + print_bad_pte(vma, orig_pte, address); ret = VM_FAULT_OOM; goto unlock; } @@ -1284,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Ok, we need to copy. Oh, well.. */ - if (!PageReserved(old_page)) - page_cache_get(old_page); + page_cache_get(old_page); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1308,14 +1336,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, orig_pte))) { - if (PageReserved(old_page)) + page_remove_rmap(old_page); + if (!PageAnon(old_page)) { inc_mm_counter(mm, anon_rss); - else { - page_remove_rmap(old_page); - if (!PageAnon(old_page)) { - inc_mm_counter(mm, anon_rss); - dec_mm_counter(mm, file_rss); - } + dec_mm_counter(mm, file_rss); } flush_cache_page(vma, address, pfn); entry = mk_pte(new_page, vma->vm_page_prot); @@ -1769,14 +1793,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, int write_access) { + struct page *page = ZERO_PAGE(addr); pte_t entry; /* Mapping of ZERO_PAGE - vm_page_prot is readonly */ - entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot); + entry = mk_pte(page, vma->vm_page_prot); if (write_access) { - struct page *page; - /* Allocate our own private page. */ pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1800,6 +1823,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, lru_cache_add_active(page); SetPageReferenced(page); page_add_anon_rmap(page, vma, address); + } else { + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + page_cache_get(page); } set_pte_at(mm, address, page_table, entry); @@ -1916,7 +1943,7 @@ retry: inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else if (!PageReserved(new_page)) { + } else if (!(vma->vm_flags & VM_RESERVED)) { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); } @@ -1957,7 +1984,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Page table corrupted: show pte and kill process. */ - pte_ERROR(orig_pte); + print_bad_pte(vma, orig_pte, address); return VM_FAULT_OOM; } /* We can then assume vm->vm_ops && vma->vm_ops->populate */ @@ -2232,7 +2259,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = 0; + gate_vma.vm_flags = VM_RESERVED; return 0; } __initcall(gate_vma_init); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 43b1199af591..11d824f282f1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -223,13 +223,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) } /* Ensure all existing pages follow the policy. 
*/ -static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, +static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, nodemask_t *nodes) { pte_t *orig_pte; pte_t *pte; - spin_lock(&mm->page_table_lock); + spin_lock(&vma->vm_mm->page_table_lock); orig_pte = pte = pte_offset_map(pmd, addr); do { unsigned long pfn; @@ -238,18 +238,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, if (!pte_present(*pte)) continue; pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) + if (!pfn_valid(pfn)) { + print_bad_pte(vma, *pte, addr); continue; + } nid = pfn_to_nid(pfn); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(orig_pte); - spin_unlock(&mm->page_table_lock); + spin_unlock(&vma->vm_mm->page_table_lock); return addr != end; } -static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, nodemask_t *nodes) { pmd_t *pmd; @@ -260,13 +262,13 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(mm, pmd, addr, next, nodes)) + if (check_pte_range(vma, pmd, addr, next, nodes)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, nodemask_t *nodes) { pud_t *pud; @@ -277,24 +279,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(mm, pud, addr, next, nodes)) + if (check_pmd_range(vma, pud, addr, next, nodes)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct mm_struct *mm, +static inline int check_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, nodemask_t *nodes) { pgd_t *pgd; unsigned long next; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(mm, pgd, addr, next, nodes)) + if (check_pud_range(vma, pgd, addr, next, nodes)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -311,6 +313,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); + if (first->vm_flags & VM_RESERVED) + return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) @@ -323,8 +327,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma->vm_mm, - start, endvma, nodes); + err = check_pgd_range(vma, start, endvma, nodes); if (err) { first = ERR_PTR(err); break; diff --git a/mm/mmap.c b/mm/mmap.c index 459b9f068ad7..8a111792b8db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1088,6 +1088,17 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; + if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED)) + == (VM_WRITE | VM_RESERVED)) { + printk(KERN_WARNING "program %s is using MAP_PRIVATE, " + "PROT_WRITE mmap of VM_RESERVED memory, which " + "is 
deprecated. Please report this to " + "linux-kernel@vger.kernel.org\n",current->comm); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + error = -EACCES; + goto unmap_and_free_vma; + } } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) diff --git a/mm/mprotect.c b/mm/mprotect.c index b426f01c5e9c..672a76fddd5e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -125,6 +125,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { + if (oldflags & VM_RESERVED) { + BUG_ON(oldflags & VM_WRITE); + printk(KERN_WARNING "program %s is using MAP_PRIVATE, " + "PROT_WRITE mprotect of VM_RESERVED memory, " + "which is deprecated. Please report this to " + "linux-kernel@vger.kernel.org\n",current->comm); + return -EACCES; + } if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { charged = nrpages; if (security_vm_enough_memory(charged)) diff --git a/mm/msync.c b/mm/msync.c index 3b5f1c521d4b..860395486060 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -25,6 +25,7 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte; int progress = 0; @@ -37,7 +38,7 @@ again: if (progress >= 64) { progress = 0; if (need_resched() || - need_lockbreak(&vma->vm_mm->page_table_lock)) + need_lockbreak(&mm->page_table_lock)) break; } progress++; @@ -46,11 +47,11 @@ again: if (!pte_maybe_dirty(*pte)) continue; pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, *pte, addr); continue; + } page = pfn_to_page(pfn); - if (PageReserved(page)) - continue; if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) @@ -58,7 +59,7 @@ again: progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); - cond_resched_lock(&vma->vm_mm->page_table_lock); + cond_resched_lock(&mm->page_table_lock); if (addr != end) goto again; } @@ -102,8 +103,10 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need - * to do anything more on an msync() */ - if (is_vm_hugetlb_page(vma)) + * to do anything more on an msync(). + * Can't do anything with VM_RESERVED regions either. 
+ */ + if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) return; BUG_ON(addr >= end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 60663232fbb2..0541288ebf4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -114,7 +114,8 @@ static void bad_page(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback); + 1 << PG_writeback | + 1 << PG_reserved ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -244,7 +245,6 @@ static inline int page_is_buddy(struct page *page, int order) { if (PagePrivate(page) && (page_order(page) == order) && - !PageReserved(page) && page_count(page) == 0) return 1; return 0; @@ -327,7 +327,8 @@ static inline void free_pages_check(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ))) + 1 << PG_writeback | + 1 << PG_reserved ))) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); @@ -455,7 +456,8 @@ static void prep_new_page(struct page *page, int order) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ))) + 1 << PG_writeback | + 1 << PG_reserved ))) bad_page(__FUNCTION__, page); page->flags &= ~(1 << PG_uptodate | 1 << PG_error | @@ -1016,7 +1018,7 @@ void __pagevec_free(struct pagevec *pvec) fastcall void __free_pages(struct page *page, unsigned int order) { - if (!PageReserved(page) && put_page_testzero(page)) { + if (put_page_testzero(page)) { if (order == 0) free_hot_page(page); else @@ -1674,7 +1676,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 0); + set_page_count(page, 1); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/mm/rmap.c b/mm/rmap.c index 504757624cce..f69d5342ce7f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -443,8 +443,6 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - BUG_ON(PageReserved(page)); - if (atomic_inc_and_test(&page->_mapcount)) { struct anon_vma *anon_vma = vma->anon_vma; @@ -468,8 +466,7 @@ void page_add_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { BUG_ON(PageAnon(page)); - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; + BUG_ON(!pfn_valid(page_to_pfn(page))); if (atomic_inc_and_test(&page->_mapcount)) inc_page_state(nr_mapped); @@ -483,8 +480,6 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { - BUG_ON(PageReserved(page)); - if (atomic_add_negative(-1, &page->_mapcount)) { BUG_ON(page_mapcount(page) < 0); /* @@ -640,13 +635,13 @@ static void try_to_unmap_cluster(unsigned long cursor, continue; pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, *pte, address); continue; + } page = pfn_to_page(pfn); BUG_ON(PageAnon(page)); - if (PageReserved(page)) - continue; if (ptep_clear_flush_young(vma, address, pte)) continue; @@ -808,7 +803,6 @@ int try_to_unmap(struct page *page) { int ret; - BUG_ON(PageReserved(page)); BUG_ON(!PageLocked(page)); if (PageAnon(page)) diff --git a/mm/shmem.c b/mm/shmem.c index 6796311a23ef..37777f4c11f8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1506,8 +1506,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ */ if (!offset) mark_page_accessed(page); - } 
else + } else { page = ZERO_PAGE(0); + page_cache_get(page); + } /* * Ok, we have the page, and it's up-to-date, so diff --git a/mm/swap.c b/mm/swap.c index 7771d2803f62..21d15f99805c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -48,7 +48,7 @@ void put_page(struct page *page) } return; } - if (!PageReserved(page) && put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } EXPORT_SYMBOL(put_page); @@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold) struct page *page = pages[i]; struct zone *pagezone; - if (PageReserved(page) || !put_page_testzero(page)) + if (!put_page_testzero(page)) continue; pagezone = page_zone(page); diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 67abebabf83e..e97b2d162cc7 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns return NOPAGE_OOM; runtime = substream->runtime; page = virt_to_page(runtime->status); - if (!PageReserved(page)) - get_page(page); + get_page(page); if (type) *type = VM_FAULT_MINOR; return page; @@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un return NOPAGE_OOM; runtime = substream->runtime; page = virt_to_page(runtime->control); - if (!PageReserved(page)) - get_page(page); + get_page(page); if (type) *type = VM_FAULT_MINOR; return page; @@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign vaddr = runtime->dma_area + offset; page = virt_to_page(vaddr); } - if (!PageReserved(page)) - get_page(page); + get_page(page); if (type) *type = VM_FAULT_MINOR; return page; -- cgit v1.2.3 From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:18 -0700 Subject: [PATCH] mm: update_hiwaters just in time update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank. Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm. And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory). There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick. What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact.
But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 1 - fs/exec.c | 1 - fs/proc/task_mmu.c | 23 +++++++++++++++++++++-- include/linux/mm.h | 3 --- include/linux/sched.h | 10 ++++++++++ kernel/exit.c | 5 ++++- kernel/sched.c | 2 -- mm/fremap.c | 4 +++- mm/hugetlb.c | 3 +++ mm/memory.c | 17 +---------------- mm/mmap.c | 4 ++++ mm/mremap.c | 12 ++++++++++-- mm/nommu.c | 15 ++------------- mm/rmap.c | 6 ++++++ 14 files changed, 64 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/fs/compat.c b/fs/compat.c index a719e158e002..8e71cdbecc7c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); - update_mem_hiwater(current); kfree(bprm); return retval; } diff --git a/fs/exec.c b/fs/exec.c index cefadf5ab83b..9bb55c8cf224 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1207,7 +1207,6 @@ int do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); - update_mem_hiwater(current); kfree(bprm); return retval; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bccee7cf9ccd..7c89b4549049 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,22 +14,41 @@ char *task_mem(struct mm_struct *mm, char *buffer) { unsigned long data, text, lib; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. 
+ */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = get_mm_rss(mm); + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; buffer += sprintf(buffer, + "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" + "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n", - (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + hiwater_vm << (PAGE_SHIFT-10), + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), - get_mm_rss(mm) << (PAGE_SHIFT-10), + hiwater_rss << (PAGE_SHIFT-10), + total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); diff --git a/include/linux/mm.h b/include/linux/mm.h index da42093250c3..7d4552fe0864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm, } #endif /* CONFIG_PROC_FS */ -/* update per process rss and vm hiwater data */ -extern void update_mem_hiwater(struct task_struct *tsk); - #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/include/linux/sched.h b/include/linux/sched.h index afcaac66cbd5..a9c0b7d26303 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define dec_mm_counter(mm, member) (mm)->_##member-- #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) +#define update_hiwater_rss(mm) do { \ + unsigned long _rss = get_mm_rss(mm); \ + if ((mm)->hiwater_rss < _rss) \ + (mm)->hiwater_rss = _rss; \ +} while (0) +#define update_hiwater_vm(mm) do { \ + if ((mm)->hiwater_vm < (mm)->total_vm) \ + (mm)->hiwater_vm = (mm)->total_vm; \ +} while (0) + typedef unsigned long mm_counter_t; struct mm_struct { diff --git a/kernel/exit.c b/kernel/exit.c index 3b25b182d2be..79f52b85d6ed 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - update_mem_hiwater(tsk); + if (tsk->mm) { + update_hiwater_rss(tsk->mm); + update_hiwater_vm(tsk->mm); + } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { del_timer_sync(&tsk->signal->real_timer); diff --git a/kernel/sched.c b/kernel/sched.c index 1e5cafdf4e27..4f26c544d02c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->idle = cputime64_add(cpustat->idle, tmp); /* Account for system time used */ acct_update_integrals(p); - /* Update rss highwater mark */ - update_mem_hiwater(p); } /* diff --git a/mm/fremap.c b/mm/fremap.c index 7f08d10ceaff..49719a35769a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) goto err_unlock; - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { + update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); + } set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; diff --git a/mm/hugetlb.c 
b/mm/hugetlb.c index 094455bcbbf7..ac5f044bf514 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); if (! ptep) diff --git a/mm/memory.c b/mm/memory.c index a25ee1d3e20a..692ad810263d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); spin_unlock(&mm->page_table_lock); @@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) EXPORT_SYMBOL(vmalloc_to_pfn); -/* - * update_mem_hiwater - * - update per process rss and vm high water data - */ -void update_mem_hiwater(struct task_struct *tsk) -{ - if (tsk->mm) { - unsigned long rss = get_mm_rss(tsk->mm); - - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff --git a/mm/mmap.c b/mm/mmap.c index 8a111792b8db..c43b28457007 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) */ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { + /* Update high watermark before we lower total_vm */ + update_hiwater_vm(mm); do { long nrpages = vma_pages(vma); @@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm, lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, @@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); + /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); diff --git a/mm/mremap.c b/mm/mremap.c index 318eea5467a0..ccf456477020 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long new_pgoff; unsigned long moved_len; unsigned long excess = 0; + unsigned long hiwater_vm; int split = 0; /* @@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, } /* - * if we failed to move page tables we still do total_vm increment - * since do_munmap() will decrement it by old_len == new_len + * If we failed to move page tables we still do total_vm increment + * since do_munmap() will decrement it by old_len == new_len. + * + * Since total_vm is about to be raised artificially high for a + * moment, we need to restore high watermark afterwards: if stats + * are taken meanwhile, total_vm and hiwater_vm appear too high. + * If this were a serious issue, we'd add a flag to do_munmap(). 
*/ + hiwater_vm = mm->hiwater_vm; mm->total_vm += new_len >> PAGE_SHIFT; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); @@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; } + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ if (excess) { diff --git a/mm/nommu.c b/mm/nommu.c index 599924886eb5..dfb124ffb9be 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) realalloc -= kobjsize(vml); askedalloc -= sizeof(*vml); kfree(vml); + + update_hiwater_vm(mm); mm->total_vm -= len >> PAGE_SHIFT; #ifdef DEBUG @@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) { } -void update_mem_hiwater(struct task_struct *tsk) -{ - unsigned long rss; - - if (likely(tsk->mm)) { - rss = get_mm_rss(tsk->mm); - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/mm/rmap.c b/mm/rmap.c index f69d5342ce7f..4c52c56c9905 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) if (pte_dirty(pteval)) set_page_dirty(page); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + if (PageAnon(page)) { swp_entry_t entry = { .val = page->private }; /* @@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor, if (!pmd_present(*pmd)) goto out_unlock; + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + for (original_pte = pte = pte_offset_map(pmd, address); address < end; pte++, address += PAGE_SIZE) { -- cgit v1.2.3 From c74df32c724a1652ad8399b4891bb02c9d43743a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:23 -0700 Subject: [PATCH] mm: ptd_alloc take ptlock Second step in pushing down the page_table_lock. Remove the temporary bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not to hold page_table_lock, whether it's on init_mm or a user mm; take page_table_lock internally to check if a racing task already allocated. Convert their callers from common code. But avoid coming back to change them again later: instead of moving the spin_lock(&mm->page_table_lock) down, switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which encapsulate the mapping+locking and unlocking+unmapping together, and in the end may use alternatives to the mm page_table_lock itself. These callers all hold mmap_sem (some exclusively, some not), so at no level can a page table be whipped away from beneath them; and pte_alloc uses the "atomic" pmd_present to test whether it needs to allocate. It appears that on all arches we can safely descend without page_table_lock. 
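The calling convention the new macros establish looks like this in sketch form (condensed from the conversions below; error handling abbreviated):

	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);	/* allocate if needed,
							 * then map and lock */
	if (!pte)
		return -ENOMEM;
	/* ... examine or modify *pte with the page table lock held ... */
	pte_unmap_unlock(pte, ptl);			/* unlock and unmap
							 * together */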
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 14 +++----- include/linux/mm.h | 18 ++++++++++ kernel/fork.c | 2 -- mm/fremap.c | 48 ++++++++++--------------- mm/hugetlb.c | 12 ++++--- mm/memory.c | 104 +++++++++++++++++------------------------------------ mm/mremap.c | 27 +++++--------- 7 files changed, 90 insertions(+), 135 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 9bb55c8cf224..ba73797eb4cb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -309,25 +309,24 @@ void install_arg_page(struct vm_area_struct *vma, pud_t * pud; pmd_t * pmd; pte_t * pte; + spinlock_t *ptl; if (unlikely(anon_vma_prepare(vma))) - goto out_sig; + goto out; flush_dcache_page(page); pgd = pgd_offset(mm, address); - - spin_lock(&mm->page_table_lock); pud = pud_alloc(mm, pgd, address); if (!pud) goto out; pmd = pmd_alloc(mm, pud, address); if (!pmd) goto out; - pte = pte_alloc_map(mm, pmd, address); + pte = pte_alloc_map_lock(mm, pmd, address, &ptl); if (!pte) goto out; if (!pte_none(*pte)) { - pte_unmap(pte); + pte_unmap_unlock(pte, ptl); goto out; } inc_mm_counter(mm, anon_rss); @@ -335,14 +334,11 @@ void install_arg_page(struct vm_area_struct *vma, set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); page_add_anon_rmap(page, vma, address); - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); /* no need for flush_tlb */ return; out: - spin_unlock(&mm->page_table_lock); -out_sig: __free_page(page); force_sig(SIGKILL, current); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 22c2d6922c0e..d4c3512e7db4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -779,10 +779,28 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#define pte_offset_map_lock(mm, pmd, address, ptlp) \ +({ \ + spinlock_t *__ptl = &(mm)->page_table_lock; \ + pte_t *__pte = pte_offset_map(pmd, address); \ + *(ptlp) = __ptl; \ + spin_lock(__ptl); \ + __pte; \ +}) + +#define pte_unmap_unlock(pte, ptl) do { \ + spin_unlock(ptl); \ + pte_unmap(pte); \ +} while (0) + #define pte_alloc_map(mm, pmd, address) \ ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map_lock(mm, pmd, address, ptlp) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) + #define pte_alloc_kernel(pmd, address) \ ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) diff --git a/kernel/fork.c b/kernel/fork.c index 2a587b3224e3..8a069612eac3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -255,7 +255,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) /* * Link in the new vma and copy the page table entries. 
*/ - spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; @@ -265,7 +264,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->map_count++; retval = copy_page_range(mm, oldmm, tmp); - spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/fremap.c b/mm/fremap.c index 49719a35769a..d862be3bc3e3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -63,23 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pgd_t *pgd; pte_t pte_val; + spinlock_t *ptl; BUG_ON(vma->vm_flags & VM_RESERVED); pgd = pgd_offset(mm, addr); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, addr); if (!pud) - goto err_unlock; - + goto out; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto err_unlock; - - pte = pte_alloc_map(mm, pmd, addr); + goto out; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - goto err_unlock; + goto out; /* * This page may have been truncated. Tell the @@ -89,10 +86,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, inode = vma->vm_file->f_mapping->host; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (!page->mapping || page->index >= size) - goto err_unlock; + goto unlock; err = -ENOMEM; if (page_mapcount(page) > INT_MAX/2) - goto err_unlock; + goto unlock; if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) inc_mm_counter(mm, file_rss); @@ -101,17 +98,15 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, pte, mk_pte(page, prot)); page_add_file_rmap(page); pte_val = *pte; - pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - err = 0; -err_unlock: - spin_unlock(&mm->page_table_lock); +unlock: + pte_unmap_unlock(pte, ptl); +out: return err; } EXPORT_SYMBOL(install_page); - /* * Install a file pte to a given virtual memory address, release any * previously existing mapping. @@ -125,23 +120,20 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pgd_t *pgd; pte_t pte_val; + spinlock_t *ptl; BUG_ON(vma->vm_flags & VM_RESERVED); pgd = pgd_offset(mm, addr); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, addr); if (!pud) - goto err_unlock; - + goto out; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto err_unlock; - - pte = pte_alloc_map(mm, pmd, addr); + goto out; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - goto err_unlock; + goto out; if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { update_hiwater_rss(mm); @@ -150,17 +142,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; - pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - return 0; - -err_unlock: - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); + err = 0; +out: return err; } - /*** * sys_remap_file_pages - remap arbitrary pages of a shared backing store * file within an existing vma. 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac5f044bf514..ea0826ff2663 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -277,12 +277,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long addr; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + src_pte = huge_pte_offset(src, addr); + if (!src_pte) + continue; dst_pte = huge_pte_alloc(dst, addr); if (!dst_pte) goto nomem; + spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); - src_pte = huge_pte_offset(src, addr); - if (src_pte && !pte_none(*src_pte)) { + if (!pte_none(*src_pte)) { entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -290,6 +293,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); } return 0; @@ -354,7 +358,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) hugetlb_prefault_arch_hook(mm); - spin_lock(&mm->page_table_lock); for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { unsigned long idx; pte_t *pte = huge_pte_alloc(mm, addr); @@ -389,11 +392,12 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) goto out; } } + spin_lock(&mm->page_table_lock); add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); + spin_unlock(&mm->page_table_lock); } out: - spin_unlock(&mm->page_table_lock); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 4bdd1186b43b..a40e4b1cee4f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -282,14 +282,11 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { - struct page *new; - - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); + struct page *new = pte_alloc_one(mm, address); if (!new) return -ENOMEM; + spin_lock(&mm->page_table_lock); if (pmd_present(*pmd)) /* Another has populated it */ pte_free(new); else { @@ -297,6 +294,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } + spin_unlock(&mm->page_table_lock); return 0; } @@ -344,9 +342,6 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. - * - * dst->page_table_lock is held on entry and exit, - * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). 
*/ static inline void @@ -419,17 +414,19 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; int progress = 0; int rss[2]; again: rss[1] = rss[0] = 0; - dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = &src_mm->page_table_lock; + spin_lock(src_ptl); - spin_lock(&src_mm->page_table_lock); do { /* * We are holding two locks at this point - either of them @@ -438,8 +435,8 @@ again: if (progress >= 32) { progress = 0; if (need_resched() || - need_lockbreak(&src_mm->page_table_lock) || - need_lockbreak(&dst_mm->page_table_lock)) + need_lockbreak(src_ptl) || + need_lockbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { @@ -449,12 +446,12 @@ again: copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); - spin_unlock(&src_mm->page_table_lock); + spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); - pte_unmap(dst_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); - cond_resched_lock(&dst_mm->page_table_lock); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); if (addr != end) goto again; return 0; @@ -1049,8 +1046,9 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot) { pte_t *pte; + spinlock_t *ptl; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; do { @@ -1062,7 +1060,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1112,14 +1110,12 @@ int zeromap_page_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = zeromap_pud_range(mm, pgd, addr, next, prot); if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } @@ -1133,8 +1129,9 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long pfn, pgprot_t prot) { pte_t *pte; + spinlock_t *ptl; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; do { @@ -1142,7 +1139,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1210,7 +1207,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = remap_pud_range(mm, pgd, addr, next, @@ -1218,7 +1214,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -1985,17 +1980,9 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, * with external mmu caches can use to update those (ie the Sparc or * PowerPC 
hashed page tables that act as extended TLBs). * - * Note the "page_table_lock". It is to protect against kswapd removing - * pages from under us. Note that kswapd only ever _removes_ pages, never - * adds them. As such, once we have noticed that the page is not present, - * we can drop the lock early. - * - * The adding of pages is protected by the MM semaphore (which we hold), - * so we don't need to worry about a page being suddenly been added into - * our VM. - * - * We enter with the pagetable spinlock held, we are supposed to - * release it when done. + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -2003,6 +1990,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, { pte_t entry; + spin_lock(&mm->page_table_lock); entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { @@ -2051,30 +2039,18 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); - /* - * We need the page table lock to synchronize with kswapd - * and the SMP-safe atomic PTE updates. - */ pgd = pgd_offset(mm, address); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, address); if (!pud) - goto oom; - + return VM_FAULT_OOM; pmd = pmd_alloc(mm, pud, address); if (!pmd) - goto oom; - + return VM_FAULT_OOM; pte = pte_alloc_map(mm, pmd, address); if (!pte) - goto oom; - - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); + return VM_FAULT_OOM; - oom: - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } #ifndef __PAGETABLE_PUD_FOLDED @@ -2084,24 +2060,16 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { - pud_t *new; - - if (mm != &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); - new = pud_alloc_one(mm, address); - if (!new) { - if (mm != &init_mm) /* Temporary bridging hack */ - spin_lock(&mm->page_table_lock); + pud_t *new = pud_alloc_one(mm, address); + if (!new) return -ENOMEM; - } spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ pud_free(new); else pgd_populate(mm, pgd, new); - if (mm == &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ @@ -2113,16 +2081,9 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { - pmd_t *new; - - if (mm != &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - if (!new) { - if (mm != &init_mm) /* Temporary bridging hack */ - spin_lock(&mm->page_table_lock); + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) return -ENOMEM; - } spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK @@ -2136,8 +2097,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) else pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - if (mm == &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); + 
spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/mremap.c b/mm/mremap.c index 616facc3d28a..8de77b632a20 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -28,9 +28,6 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) pud_t *pud; pmd_t *pmd; - /* - * We don't need page_table_lock: we have mmap_sem exclusively. - */ pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) return NULL; @@ -50,25 +47,20 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd = NULL; + pmd_t *pmd; - /* - * We do need page_table_lock: because allocators expect that. - */ - spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) - goto out; + return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto out; + return NULL; if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) - pmd = NULL; -out: - spin_unlock(&mm->page_table_lock); + return NULL; + return pmd; } @@ -80,6 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct address_space *mapping = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; + spinlock_t *old_ptl; if (vma->vm_file) { /* @@ -95,9 +88,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_vma->vm_truncate_count = 0; } - spin_lock(&mm->page_table_lock); - old_pte = pte_offset_map(old_pmd, old_addr); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); + new_pte = pte_offset_map_nested(new_pmd, new_addr); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -110,8 +102,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } pte_unmap_nested(new_pte - 1); - pte_unmap(old_pte - 1); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); } -- cgit v1.2.3 From deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:33 -0700 Subject: [PATCH] mm: follow_page with inner ptlock Final step in pushing down common core's page_table_lock. follow_page no longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself; and so no page_table_lock is taken in get_user_pages itself. But get_user_pages (and get_futex_key) do then need follow_page to pin the page for them: take Daniel's suggestion of bitflags to follow_page. Need one for WRITE, another for TOUCH (it was the accessed flag before: vanished along with check_user_page_readable, but surely get_numa_maps is wrong to mark every page it finds as accessed), another for GET. And another, ANON to dispose of untouched_anonymous_page: it seems silly for that to descend a second time, let follow_page observe if there was no page table and return ZERO_PAGE if so. Fix minor bug in that: check VM_LOCKED - make_pages_present ought to make readonly anonymous present. Give get_numa_maps a cond_resched while we're there. 
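As a concrete illustration of the new flag interface (a minimal sketch only: futex_fast_pgoff() is a hypothetical helper written for this note, while follow_page(), FOLL_TOUCH, FOLL_GET and put_page() are exactly as in the patch below), this is the shape of the fast-path lookup that get_futex_key can now do without taking mm->page_table_lock:

	static int futex_fast_pgoff(struct mm_struct *mm, unsigned long uaddr,
				    unsigned long *pgoff)
	{
		struct page *page;

		/* FOLL_TOUCH marks the page accessed; FOLL_GET pins it */
		page = follow_page(mm, uaddr, FOLL_TOUCH | FOLL_GET);
		if (!page)
			return -EFAULT;	/* not mapped: caller takes the slow path */
		*pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		put_page(page);		/* balance the FOLL_GET reference */
		return 0;
	}

Because FOLL_GET takes the page reference while follow_page still holds the page table lock internally, the caller needs no lock of its own; the kernel/futex.c hunk below does exactly this inline.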
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 +- include/linux/mm.h | 20 ++++--- kernel/futex.c | 6 +-- mm/memory.c | 152 +++++++++++++++++++++++++---------------------------- mm/nommu.c | 3 +- 5 files changed, 88 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7e5e7ec2e36d..d2fa42006d8f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) for_each_node(i) md->node[i] =0; - spin_lock(&mm->page_table_lock); for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { page = follow_page(mm, vaddr, 0); if (page) { @@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) md->anon++; md->node[page_to_nid(page)]++; } + cond_resched(); } - spin_unlock(&mm->page_table_lock); return md; } diff --git a/include/linux/mm.h b/include/linux/mm.h index aa8de20e2e80..e8d1424153bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } -extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); - -extern struct page * vmalloc_to_page(void *addr); -extern unsigned long vmalloc_to_pfn(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, - int write); -int remap_pfn_range(struct vm_area_struct *, unsigned long, - unsigned long, unsigned long, pgprot_t); +struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct page *vmalloc_to_page(void *addr); +unsigned long vmalloc_to_pfn(void *addr); +int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); + +struct page *follow_page(struct mm_struct *, unsigned long address, + unsigned int foll_flags); +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ +#define FOLL_GET 0x04 /* do get_page on page */ +#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ #ifdef CONFIG_PROC_FS void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b2..3b4d5ad44cc6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * Do a quick atomic lookup first - this is the fastpath. */ - spin_lock(&current->mm->page_table_lock); - page = follow_page(mm, uaddr, 0); + page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET); if (likely(page != NULL)) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - spin_unlock(&current->mm->page_table_lock); + put_page(page); return 0; } - spin_unlock(&current->mm->page_table_lock); /* * Do it the general way. diff --git a/mm/memory.c b/mm/memory.c index 51f7c0a220d4..8461e2dd91d7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. - * mm->page_table_lock must be held.
*/ -struct page *follow_page(struct mm_struct *mm, unsigned long address, int write) +struct page *follow_page(struct mm_struct *mm, unsigned long address, + unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; + spinlock_t *ptl; unsigned long pfn; struct page *page; - page = follow_huge_addr(mm, address, write); - if (! IS_ERR(page)) - return page; + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; + goto no_page_table; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; + goto no_page_table; pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + if (pmd_huge(*pmd)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); + } - ptep = pte_offset_map(pmd, address); + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) goto out; pte = *ptep; - pte_unmap(ptep); - if (pte_present(pte)) { - if (write && !pte_write(pte)) - goto out; - pfn = pte_pfn(pte); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (write && !pte_dirty(pte) &&!PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); - return page; - } - } + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + goto unlock; + page = pfn_to_page(pfn); + if (flags & FOLL_GET) + get_page(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + } +unlock: + pte_unmap_unlock(ptep, ptl); out: - return NULL; -} - -static inline int -untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - /* Check if the vma is for an anonymous mapping. */ - if (vma->vm_ops && vma->vm_ops->nopage) - return 0; - - /* Check if page directory entry exists. */ - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return 1; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - return 1; - - /* Check if page middle directory entry exists. */ - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return 1; + return page; - /* There is a pte slot for 'address' in 'mm'. */ - return 0; +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate page tables. + */ + if (flags & FOLL_ANON) { + page = ZERO_PAGE(address); + if (flags & FOLL_GET) + get_page(page); + BUG_ON(flags & FOLL_WRITE); + } + return page; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int flags; + unsigned int vm_flags; /* * Require read or write permissions. * If 'force' is set, we only require the "MAY" flags. */ - flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? 
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { - struct vm_area_struct * vma; + struct vm_area_struct *vma; + unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) - || !(flags & vma->vm_flags)) + || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, &start, &len, i); continue; } - spin_lock(&mm->page_table_lock); + + foll_flags = FOLL_TOUCH; + if (pages) + foll_flags |= FOLL_GET; + if (!write && !(vma->vm_flags & VM_LOCKED) && + (!vma->vm_ops || !vma->vm_ops->nopage)) + foll_flags |= FOLL_ANON; + do { - int write_access = write; struct page *page; - cond_resched_lock(&mm->page_table_lock); - while (!(page = follow_page(mm, start, write_access))) { - int ret; - - /* - * Shortcut for anonymous pages. We don't want - * to force the creation of pages tables for - * insanely big anonymously mapped areas that - * nobody touched so far. This is important - * for doing a core dump for these mappings. - */ - if (!write && untouched_anonymous_page(mm,vma,start)) { - page = ZERO_PAGE(start); - break; - } - spin_unlock(&mm->page_table_lock); - ret = __handle_mm_fault(mm, vma, start, write_access); + if (write) + foll_flags |= FOLL_WRITE; + cond_resched(); + while (!(page = follow_page(mm, start, foll_flags))) { + int ret; + ret = __handle_mm_fault(mm, vma, start, + foll_flags & FOLL_WRITE); /* * The VM_FAULT_WRITE bit tells us that do_wp_page has * broken COW when necessary, even if maybe_mkwrite @@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * subsequent page lookups as if they were reads. */ if (ret & VM_FAULT_WRITE) - write_access = 0; + foll_flags &= ~FOLL_WRITE; switch (ret & ~VM_FAULT_WRITE) { case VM_FAULT_MINOR: @@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, default: BUG(); } - spin_lock(&mm->page_table_lock); } if (pages) { pages[i] = page; flush_dcache_page(page); - page_cache_get(page); } if (vmas) vmas[i] = vma; @@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, start += PAGE_SIZE; len--; } while (len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); } while (len); return i; } diff --git a/mm/nommu.c b/mm/nommu.c index dfb124ffb9be..d1e076a487cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) +struct page *follow_page(struct mm_struct *mm, unsigned long address, + unsigned int foll_flags) { return NULL; } -- cgit v1.2.3 From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:40 -0700 Subject: [PATCH] mm: split page table lock Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with a many-threaded application which concurrently initializes different parts of a large anonymous area. This patch corrects that, by using a separate spinlock per page table page, to guard the page table entries in that page, instead of using the mm's single page_table_lock. (But even then, page_table_lock is still used to guard page table allocation, and anon_vma allocation.) 
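The locking idiom seen by callers does not change with the split: pte_offset_map_lock() hands back whichever spinlock guards the page table page in question, and pte_unmap_unlock() releases it. A minimal sketch, assuming the helpers from the preceding patches (set_one_pte() itself is hypothetical, for illustration only):

	static void set_one_pte(struct mm_struct *mm, pmd_t *pmd,
				unsigned long addr, pte_t val)
	{
		spinlock_t *ptl;	/* per-page ptl, or mm->page_table_lock */
		pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

		set_pte_at(mm, addr, pte, val);
		pte_unmap_unlock(pte, ptl);
	}

Two threads faulting on ptes that live in different page table pages now take different locks and proceed in parallel; they contend only when their ptes share a page table page.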
In this implementation, the spinlock is tucked inside the struct page of the page table page: with a BUILD_BUG_ON in case it overflows - which it would in the case of 32-bit PA-RISC with spinlock debugging enabled. Splitting the lock is not quite for free: another cacheline access. Ideally, I suppose we would use split ptlock only for multi-threaded processes on multi-cpu machines; but deciding that dynamically would have its own costs. So for now enable it by config, at some number of cpus - since the Kconfig language doesn't support inequalities, let preprocessor compare that with NR_CPUS. But I don't think it's worth being user-configurable: for good testing of both split and unsplit configs, split now at 4 cpus, and perhaps change that to 8 later. There is a benefit even for singly threaded processes: kswapd can be attacking one part of the mm while another part is busy faulting. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mm-armv.c | 1 + arch/frv/mm/pgalloc.c | 4 ++-- arch/i386/mm/pgtable.c | 8 ++++---- arch/um/kernel/skas/mmu.c | 1 + fs/afs/file.c | 4 ++-- fs/buffer.c | 2 +- fs/jfs/jfs_metapage.c | 12 ++++++------ fs/xfs/linux-2.6/xfs_buf.c | 7 ++++--- include/linux/buffer_head.h | 6 +++--- include/linux/mm.h | 46 +++++++++++++++++++++++++++++++++++++-------- kernel/kexec.c | 4 ++-- mm/Kconfig | 13 +++++++++++++ mm/filemap.c | 2 +- mm/memory.c | 24 +++++++++++++---------- mm/mremap.c | 11 ++++++++++- mm/page_alloc.c | 16 ++++++++-------- mm/page_io.c | 6 ++++-- mm/rmap.c | 4 ++-- mm/shmem.c | 22 ++++++++++------------ mm/swap.c | 2 +- mm/swap_state.c | 8 ++++---- mm/swapfile.c | 12 ++++++------ mm/vmscan.c | 2 +- 23 files changed, 138 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index 60f3e039bac2..1221fdde1769 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c @@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd) pte = pmd_page(*pmd); pmd_clear(pmd); dec_page_state(nr_page_table_pages); + pte_lock_deinit(pte); pte_free(pte); pmd_free(pmd); free: diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c index 4eaec0f3525b..2c67dfe5a6b3 100644 --- a/arch/frv/mm/pgalloc.c +++ b/arch/frv/mm/pgalloc.c @@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd) if (pgd_list) pgd_list->private = (unsigned long) &page->index; pgd_list = page; - page->private = (unsigned long) &pgd_list; + set_page_private(page, (unsigned long)&pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *) page->index; - pprev = (struct page **) page->private; + pprev = (struct page **)page_private(page); *pprev = next; if (next) next->private = (unsigned long) pprev; diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index dcdce2c6c532..39c099f15b5f 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -188,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd) struct page *page = virt_to_page(pgd); page->index = (unsigned long)pgd_list; if (pgd_list) - pgd_list->private = (unsigned long)&page->index; + set_page_private(pgd_list, (unsigned long)&page->index); pgd_list = page; - page->private = (unsigned long)&pgd_list; + set_page_private(page, (unsigned long)&pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *)page->index; - pprev = (struct page **)page->private; + pprev = (struct page 
**)page_private(page); *pprev = next; if (next) - next->private = (unsigned long)pprev; + set_page_private(next, (unsigned long)pprev); } void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 02cf36e0331a..9e5e39cea821 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm) if(!proc_mm || !ptrace_faultinfo){ free_page(mmu->id.stack); + pte_lock_deinit(virt_to_page(mmu->last_page_table)); pte_free_kernel((pte_t *) mmu->last_page_table); dec_page_state(nr_page_table_pages); #ifdef CONFIG_3_LEVEL_PGTABLES diff --git a/fs/afs/file.c b/fs/afs/file.c index 0d576987ec67..4975c9c193dd 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags) cachefs_uncache_page(vnode->cache, page); #endif - pageio = (struct cachefs_page *) page->private; - page->private = 0; + pageio = (struct cachefs_page *) page_private(page); + set_page_private(page, 0); ClearPagePrivate(page); if (pageio) diff --git a/fs/buffer.c b/fs/buffer.c index b1667986442f..2066e4cb700c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -96,7 +96,7 @@ static void __clear_page_buffers(struct page *page) { ClearPagePrivate(page); - page->private = 0; + set_page_private(page, 0); page_cache_release(page); } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 26091a5f88d4..8a53981f9f27 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -86,7 +86,7 @@ struct meta_anchor { atomic_t io_count; struct metapage *mp[MPS_PER_PAGE]; }; -#define mp_anchor(page) ((struct meta_anchor *)page->private) +#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) static inline struct metapage *page_to_mp(struct page *page, uint offset) { @@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) if (!a) return -ENOMEM; memset(a, 0, sizeof(struct meta_anchor)); - page->private = (unsigned long)a; + set_page_private(page, (unsigned long)a); SetPagePrivate(page); kmap(page); } @@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) a->mp[index] = NULL; if (--a->mp_count == 0) { kfree(a); - page->private = 0; + set_page_private(page, 0); ClearPagePrivate(page); kunmap(page); } @@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *)) #else static inline struct metapage *page_to_mp(struct page *page, uint offset) { - return PagePrivate(page) ? (struct metapage *)page->private : NULL; + return PagePrivate(page) ? 
(struct metapage *)page_private(page) : NULL; } static inline int insert_metapage(struct page *page, struct metapage *mp) { if (mp) { - page->private = (unsigned long)mp; + set_page_private(page, (unsigned long)mp); SetPagePrivate(page); kmap(page); } @@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) static inline void remove_metapage(struct page *page, struct metapage *mp) { - page->private = 0; + set_page_private(page, 0); ClearPagePrivate(page); kunmap(page); } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ba4767c04adf..4cd46abe8434 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -181,8 +181,9 @@ set_page_region( size_t offset, size_t length) { - page->private |= page_region_mask(offset, length); - if (page->private == ~0UL) + set_page_private(page, + page_private(page) | page_region_mask(offset, length)); + if (page_private(page) == ~0UL) SetPageUptodate(page); } @@ -194,7 +195,7 @@ test_page_region( { unsigned long mask = page_region_mask(offset, length); - return (mask && (page->private & mask) == mask); + return (mask && (page_private(page) & mask) == mask); } /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 88af42f5e04a..c937d6e65502 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp) /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ ({ \ - BUG_ON(!PagePrivate(page)); \ - ((struct buffer_head *)(page)->private); \ + BUG_ON(!PagePrivate(page)); \ + ((struct buffer_head *)page_private(page)); \ }) #define page_has_buffers(page) PagePrivate(page) @@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page, { page_cache_get(page); SetPagePrivate(page); - page->private = (unsigned long)head; + set_page_private(page, (unsigned long)head); } static inline void get_bh(struct buffer_head *bh) diff --git a/include/linux/mm.h b/include/linux/mm.h index e8d1424153bb..8a514eca40d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -226,13 +226,18 @@ struct page { * to show when page is mapped * & limit reverse map searches. */ - unsigned long private; /* Mapping-private opaque data: + union { + unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for * swp_entry_t if PageSwapCache * When page is free, this indicates * order in the buddy system. */ +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + spinlock_t ptl; +#endif + } u; struct address_space *mapping; /* If low bit clear, points to * inode address_space, or NULL. 
* If page mapped as anonymous @@ -260,6 +265,9 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +#define page_private(page) ((page)->u.private) +#define set_page_private(page, v) ((page)->u.private = (v)) + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) @@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *)); #ifdef CONFIG_HUGETLB_PAGE -static inline int page_count(struct page *p) +static inline int page_count(struct page *page) { - if (PageCompound(p)) - p = (struct page *)p->private; - return atomic_read(&(p)->_count) + 1; + if (PageCompound(page)) + page = (struct page *)page_private(page); + return atomic_read(&page->_count) + 1; } static inline void get_page(struct page *page) { if (unlikely(PageCompound(page))) - page = (struct page *)page->private; + page = (struct page *)page_private(page); atomic_inc(&page->_count); } @@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page) static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) - return page->private; + return page_private(page); return page->index; } @@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +/* + * We tuck a spinlock to guard each pagetable page into its struct page, + * at page->private, with BUILD_BUG_ON to make sure that this will not + * overflow into the next struct page (as it might with DEBUG_SPINLOCK). + * When freeing, reset page->mapping so free_pages_check won't complain. + */ +#define __pte_lockptr(page) &((page)->u.ptl) +#define pte_lock_init(_page) do { \ + spin_lock_init(__pte_lockptr(_page)); \ +} while (0) +#define pte_lock_deinit(page) ((page)->mapping = NULL) +#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) +#else +/* + * We use mm->page_table_lock to guard all pagetable pages of the mm. 
+ */ +#define pte_lock_init(page) do {} while (0) +#define pte_lock_deinit(page) do {} while (0) +#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ + #define pte_offset_map_lock(mm, pmd, address, ptlp) \ ({ \ - spinlock_t *__ptl = &(mm)->page_table_lock; \ + spinlock_t *__ptl = pte_lockptr(mm, pmd); \ pte_t *__pte = pte_offset_map(pmd, address); \ *(ptlp) = __ptl; \ spin_lock(__ptl); \ diff --git a/kernel/kexec.c b/kernel/kexec.c index 36c5d9cd4cc1..2c95848fbce8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) if (pages) { unsigned int count, i; pages->mapping = NULL; - pages->private = order; + set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); @@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page) { unsigned int order, count, i; - order = page->private; + order = page_private(page); count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); diff --git a/mm/Kconfig b/mm/Kconfig index 391ffc54d136..f35a550ba4b9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -111,3 +111,16 @@ config SPARSEMEM_STATIC config SPARSEMEM_EXTREME def_bool y depends on SPARSEMEM && !SPARSEMEM_STATIC + +# Heavily threaded applications may benefit from splitting the mm-wide +# page_table_lock, so that faults on different parts of the user address +# space can be handled with less contention: split it at this NR_CPUS. +# Default to 4 for wider testing, though 8 might be more appropriate. +# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. +# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. +# +config SPLIT_PTLOCK_CPUS + int + default "4096" if ARM && !CPU_CACHE_VIPT + default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT + default "4" diff --git a/mm/filemap.c b/mm/filemap.c index 8aa344e88489..f560b41c8f61 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -152,7 +152,7 @@ static int sync_page(void *word) * in the ->sync_page() methods make essential use of the * page_mapping(), merely passing the page down to the backing * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page->private is + * ignore it for all cases but swap, where only page_private(page) is * of interest. When page_mapping() does go NULL, the entire * call stack gracefully ignores the page and returns. 
* -- wli diff --git a/mm/memory.c b/mm/memory.c index 8461e2dd91d7..e9ef599498b5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) { struct page *page = pmd_page(*pmd); pmd_clear(pmd); + pte_lock_deinit(page); pte_free_tlb(tlb, page); dec_page_state(nr_page_table_pages); tlb->mm->nr_ptes--; @@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) if (!new) return -ENOMEM; + pte_lock_init(new); spin_lock(&mm->page_table_lock); - if (pmd_present(*pmd)) /* Another has populated it */ + if (pmd_present(*pmd)) { /* Another has populated it */ + pte_lock_deinit(new); pte_free(new); - else { + } else { mm->nr_ptes++; inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); @@ -432,7 +435,7 @@ again: if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); - src_ptl = &src_mm->page_table_lock; + src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock(src_ptl); do { @@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range); * (but do_wp_page is only called after already making such a check; * and do_anonymous_page and do_no_page can safely check later on). */ -static inline int pte_unmap_same(struct mm_struct *mm, +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) if (sizeof(pte_t) > sizeof(unsigned long)) { - spin_lock(&mm->page_table_lock); + spinlock_t *ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); same = pte_same(*page_table, orig_pte); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } #endif pte_unmap(page_table); @@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte; int ret = VM_FAULT_MINOR; - if (!pte_unmap_same(mm, page_table, orig_pte)) + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); @@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(page); entry = mk_pte(page, vma->vm_page_prot); - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (!pte_none(*page_table)) goto release; @@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff; int err; - if (!pte_unmap_same(mm, page_table, orig_pte)) + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return VM_FAULT_MINOR; if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { @@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte, pmd, write_access, entry); } - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; diff --git a/mm/mremap.c b/mm/mremap.c index 8de77b632a20..b535438c363c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct address_space *mapping = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; - spinlock_t *old_ptl; + spinlock_t *old_ptl, *new_ptl; if (vma->vm_file) { /* @@ -88,8 +88,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_vma->vm_truncate_count = 0; } + /* + * We don't have to worry about the ordering of src and dst + * pte locks because exclusive mmap_sem prevents deadlock. 
+ */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_ptl = pte_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock(new_ptl); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, set_pte_at(mm, new_addr, new_pte, pte); } + if (new_ptl != old_ptl) + spin_unlock(new_ptl); pte_unmap_nested(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0541288ebf4b..a2995a5d012c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -154,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order) struct page *p = page + i; SetPageCompound(p); - p->private = (unsigned long)page; + set_page_private(p, (unsigned long)page); } } @@ -174,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (!PageCompound(p)) bad_page(__FUNCTION__, page); - if (p->private != (unsigned long)page) + if (page_private(p) != (unsigned long)page) bad_page(__FUNCTION__, page); ClearPageCompound(p); } @@ -187,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order) * So, we don't need atomic page->flags operations here. */ static inline unsigned long page_order(struct page *page) { - return page->private; + return page_private(page); } static inline void set_page_order(struct page *page, int order) { - page->private = order; + set_page_private(page, order); __SetPagePrivate(page); } static inline void rmv_page_order(struct page *page) { __ClearPagePrivate(page); - page->private = 0; + set_page_private(page, 0); } /* @@ -238,7 +238,7 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (a) the buddy is free && * (b) the buddy is on the buddy system && * (c) a page and its buddy have the same order. - * for recording page's order, we use page->private and PG_private. + * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) @@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order) * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PG_Private.Page's - * order is recorded in page->private field. + * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. 
@@ -463,7 +463,7 @@ static void prep_new_page(struct page *page, int order) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); - page->private = 0; + set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); } diff --git a/mm/page_io.c b/mm/page_io.c index 330e00d6db00..bb2b0d53889c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page_private(page), page, + end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); @@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page) BUG_ON(!PageLocked(page)); ClearPageUptodate(page); - bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); + bio = get_swap_bio(GFP_KERNEL, page_private(page), page, + end_swap_bio_read); if (bio == NULL) { unlock_page(page); ret = -ENOMEM; diff --git a/mm/rmap.c b/mm/rmap.c index a84bdfe582c0..a33e779d1bd8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, return NULL; } - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -550,7 +550,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) update_hiwater_rss(mm); if (PageAnon(page)) { - swp_entry_t entry = { .val = page->private }; + swp_entry_t entry = { .val = page_private(page) }; /* * Store the swap location in the pte. * See handle_pte_fault() ... 
diff --git a/mm/shmem.c b/mm/shmem.c index 37777f4c11f8..dc25565a61e9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -71,9 +71,6 @@ /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 -/* Keep swapped page count in private field of indirect struct page */ -#define nr_swapped private - /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ enum sgp_type { SGP_QUICK, /* don't try more than file page cache lookup */ @@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns entry->val = value; info->swapped += incdec; - if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) - kmap_atomic_to_page(entry)->nr_swapped += incdec; + if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { + struct page *page = kmap_atomic_to_page(entry); + set_page_private(page, page_private(page) + incdec); + } } /* @@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); - if (page) { - page->nr_swapped = 0; - } + if (page) + set_page_private(page, 0); spin_lock(&info->lock); if (!page) { @@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode) diroff = 0; } subdir = dir[diroff]; - if (subdir && subdir->nr_swapped) { + if (subdir && page_private(subdir)) { size = limit - idx; if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; @@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode) nr_swaps_freed += freed; if (offset) spin_lock(&info->lock); - subdir->nr_swapped -= freed; + set_page_private(subdir, page_private(subdir) - freed); if (offset) spin_unlock(&info->lock); - BUG_ON(subdir->nr_swapped > offset); + BUG_ON(page_private(subdir) > offset); } if (offset) offset = 0; @@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s dir = shmem_dir_map(subdir); } subdir = *dir; - if (subdir && subdir->nr_swapped) { + if (subdir && page_private(subdir)) { ptr = shmem_swp_map(subdir); size = limit - idx; if (size > ENTRIES_PER_PAGE) diff --git a/mm/swap.c b/mm/swap.c index 21d15f99805c..b89512877ec2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -39,7 +39,7 @@ int page_cluster; void put_page(struct page *page) { if (unlikely(PageCompound(page))) { - page = (struct page *)page->private; + page = (struct page *)page_private(page); if (put_page_testzero(page)) { void (*dtor)(struct page *page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 132164f7d0a7..cafc1edcbeba 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, page_cache_get(page); SetPageLocked(page); SetPageSwapCache(page); - page->private = entry.val; + set_page_private(page, entry.val); total_swapcache_pages++; pagecache_acct(1); } @@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page) BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - radix_tree_delete(&swapper_space.page_tree, page->private); - page->private = 0; + radix_tree_delete(&swapper_space.page_tree, page_private(page)); + set_page_private(page, 0); ClearPageSwapCache(page); total_swapcache_pages--; pagecache_acct(-1); @@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page) { swp_entry_t entry; - entry.val = page->private; + entry.val = page_private(page); write_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); diff --git a/mm/swapfile.c 
b/mm/swapfile.c index 510f0039b000..8970c0b74194 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) swp_entry_t entry; down_read(&swap_unplug_sem); - entry.val = page->private; + entry.val = page_private(page); if (PageSwapCache(page)) { struct block_device *bdev = swap_info[swp_type(entry)].bdev; struct backing_dev_info *bdi; @@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) /* * If the page is removed from swapcache from under us (with a * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page->private above. If - * the WARN_ON triggers during a swapoff it maybe the race + * count to avoid reading garbage from page_private(page) above. + * If the WARN_ON triggers during a swapoff it maybe the race * condition and it's harmless. However if it triggers without * swapoff it signals a problem. */ @@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page) struct swap_info_struct *p; swp_entry_t entry; - entry.val = page->private; + entry.val = page_private(page); p = swap_info_get(entry); if (p) { /* Subtract the 1 for the swap cache itself */ @@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page) if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->private; + entry.val = page_private(page); p = swap_info_get(entry); if (!p) return 0; @@ -1042,7 +1042,7 @@ int page_queue_congested(struct page *page) BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->private }; + swp_entry_t entry = { .val = page_private(page) }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 41d1064aabfb..135bf8ca96ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -521,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->private }; + swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); -- cgit v1.2.3 From 4276d32260662d5401a15a0a46e506fb5c8ab563 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 30 Oct 2005 14:59:18 -0800 Subject: [PATCH] Remove redundant configs.o Since CONFIG_IKCONFIG_PROC already depends on CONFIG_IKCONFIG, adding configs.o again is redundant. Signed-off-by: Brian Gerst Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ff4dc02ce170..980b5e454441 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -- cgit v1.2.3 From c32b6b8e524d2c337767d312814484d9289550cf Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sun, 30 Oct 2005 14:59:54 -0800 Subject: [PATCH] create and destroy cpufreq sysfs entries based on cpu notifiers cpufreq entries in sysfs should only be populated when the CPU is online. When we boot with maxcpus=x and later bring the other cpus online by echoing to the sysfs online file, these entries should be created; when CPU_DEAD is notified they should be destroyed. Same treatment as cache entries under sysfs. We place the processor in the lowest frequency, so hw-managed P-State transitions can still work on the other threads to save power. The primary goal was just to make these directories appear/disappear dynamically. There is one thing in this patch I had to do which I really don't like, but it is probably best if someone handling the cpufreq infrastructure gives this code the right treatment if it is not acceptable. I guess it's probably good enough for a first cut. - Converting lock_cpu_hotplug()/unlock_cpu_hotplug() to preempt disable/enable. The locking was smack in the middle of the notification path, when hotplug is already holding the lock. I tried another solution that avoids taking locks when we know we are called from the notification path, but it was getting very ugly, and I decided this was probably good enough for this iteration until someone who understands cpufreq can do a better job. (akpm: export cpucontrol to GPL modules: drivers/cpufreq/cpufreq_stats.c now does lock_cpu_hotplug()) Signed-off-by: Ashok Raj Signed-off-by: Venkatesh Pallipadi Cc: Dave Jones Cc: Zwane Mwaikambo Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/cpufreq/cpufreq.c | 69 ++++++++++++++++++++++++++++++++++++++--- drivers/cpufreq/cpufreq_stats.c | 42 ++++++++++++++++++++++--- kernel/cpu.c | 1 + 3 files changed, 103 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 346906a96e8b..6c6121b85a54 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -4,6 +4,9 @@ * Copyright (C) 2001 Russell King * (C) 2002 - 2003 Dominik Brodowski * + * Oct 2005 - Ashok Raj + * Added handling for CPU hotplug + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -567,6 +570,9 @@ static int cpufreq_add_dev (struct sys_device * sys_dev) unsigned long flags; unsigned int j; + if (cpu_is_offline(cpu)) + return 0; + cpufreq_debug_disable_ratelimit(); dprintk("adding CPU %u\n", cpu); @@ -673,7 +679,7 @@ err_out: nomem_out: module_put(cpufreq_driver->owner); - module_out: +module_out: cpufreq_debug_enable_ratelimit(); return ret; } @@ -762,7 +768,6 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev) down(&data->lock); if (cpufreq_driver->target) __cpufreq_governor(data, CPUFREQ_GOV_STOP); - cpufreq_driver->target = NULL; up(&data->lock); kobject_unregister(&data->kobj); @@ -1109,17 +1114,30 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int relation) { int retval = -EINVAL; - lock_cpu_hotplug(); + + /* + * Converted the lock_cpu_hotplug to preempt_disable() + * and preempt_enable(). This is a bit kludgy and relies on how cpu + * hotplug works. All we need is a guarantee that cpu hotplug won't make + * progress on any cpu. Once we do preempt_disable(), this would ensure + * that hotplug threads don't get onto this cpu, thereby delaying + * the cpu remove process. + * + * We removed the lock_cpu_hotplug since we need to call this function + * via cpu hotplug callbacks, which result in locking the cpu hotplug + * thread itself. Agree this is not very clean, cpufreq community + * could improve this if required.
- Ashok Raj + */ + preempt_disable(); dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu, target_freq, relation); if (cpu_online(policy->cpu) && cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); - unlock_cpu_hotplug(); + preempt_enable(); return retval; } EXPORT_SYMBOL_GPL(__cpufreq_driver_target); - int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -1406,6 +1424,45 @@ int cpufreq_update_policy(unsigned int cpu) } EXPORT_SYMBOL(cpufreq_update_policy); +static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct cpufreq_policy *policy; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + + if (sys_dev) { + switch (action) { + case CPU_ONLINE: + cpufreq_add_dev(sys_dev); + break; + case CPU_DOWN_PREPARE: + /* + * We attempt to put this cpu in lowest frequency + * possible before going down. This will permit + * hardware-managed P-State to switch other related + * threads to min or higher speeds if possible. + */ + policy = cpufreq_cpu_data[cpu]; + if (policy) { + cpufreq_driver_target(policy, policy->min, + CPUFREQ_RELATION_H); + } + break; + case CPU_DEAD: + cpufreq_remove_dev(sys_dev); + break; + } + } + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_cpu_notifier = +{ + .notifier_call = cpufreq_cpu_callback, +}; /********************************************************************* * REGISTER / UNREGISTER CPUFREQ DRIVER * @@ -1466,6 +1523,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) } if (!ret) { + register_cpu_notifier(&cpufreq_cpu_notifier); dprintk("driver %s up and running\n", driver_data->name); cpufreq_debug_enable_ratelimit(); } @@ -1497,6 +1555,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) dprintk("unregistering driver %s\n", driver->name); sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver); + unregister_cpu_notifier(&cpufreq_cpu_notifier); spin_lock_irqsave(&cpufreq_driver_lock, flags); cpufreq_driver = NULL; diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 741b6b191e6a..3597f25d5efa 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -19,6 +19,7 @@ #include #include #include +#include #include static spinlock_t cpufreq_stats_lock; @@ -298,6 +299,27 @@ cpufreq_stat_notifier_trans (struct notifier_block *nb, unsigned long val, return 0; } +static int __cpuinit cpufreq_stat_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + cpufreq_update_policy(cpu); + break; + case CPU_DEAD: + cpufreq_stats_free_table(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block cpufreq_stat_cpu_notifier = +{ + .notifier_call = cpufreq_stat_cpu_callback, +}; + static struct notifier_block notifier_policy_block = { .notifier_call = cpufreq_stat_notifier_policy }; @@ -311,6 +333,7 @@ __init cpufreq_stats_init(void) { int ret; unsigned int cpu; + spin_lock_init(&cpufreq_stats_lock); if ((ret = cpufreq_register_notifier(¬ifier_policy_block, CPUFREQ_POLICY_NOTIFIER))) @@ -323,20 +346,31 @@ __init cpufreq_stats_init(void) return ret; } - for_each_cpu(cpu) - cpufreq_update_policy(cpu); + register_cpu_notifier(&cpufreq_stat_cpu_notifier); + lock_cpu_hotplug(); + for_each_online_cpu(cpu) { + 
cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_ONLINE, + (void *)(long)cpu); + } + unlock_cpu_hotplug(); return 0; } static void __exit cpufreq_stats_exit(void) { unsigned int cpu; + cpufreq_unregister_notifier(¬ifier_policy_block, CPUFREQ_POLICY_NOTIFIER); cpufreq_unregister_notifier(¬ifier_trans_block, CPUFREQ_TRANSITION_NOTIFIER); - for_each_cpu(cpu) - cpufreq_stats_free_table(cpu); + unregister_cpu_notifier(&cpufreq_stat_cpu_notifier); + lock_cpu_hotplug(); + for_each_online_cpu(cpu) { + cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_DEAD, + (void *)(long)cpu); + } + unlock_cpu_hotplug(); } MODULE_AUTHOR ("Zou Nan hai "); diff --git a/kernel/cpu.c b/kernel/cpu.c index 53d8263ae12e..3619e939182e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,7 @@ /* This protects CPUs going up and down... */ DECLARE_MUTEX(cpucontrol); +EXPORT_SYMBOL_GPL(cpucontrol); static struct notifier_block *cpu_chain; -- cgit v1.2.3 From 351619baf9878731b4272fa10dda0f84f5582241 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 14:59:55 -0800 Subject: [PATCH] swsusp: rework image freeing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch makes swsusp use PG_nosave and PG_nosave_free flags to mark pages that should be freed after the state of the system has been restored from the image (or in case of an error during suspend). This allows us to avoid storing metadata in swap twice and to reduce the amount of memory needed by swsusp.  Additionally, it allows us to simplify the code by removing a couple of functions that are no longer necessary. Signed-off-by: Rafael J. Wysocki Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 159 ++++++++++++++++++++++++-------------------------- 1 file changed, 75 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 016504ccfccf..ae46506e2137 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -708,24 +708,28 @@ static void count_data_pages(void) } } - static void copy_data_pages(void) { struct zone *zone; unsigned long zone_pfn; - struct pbe * pbe = pagedir_nosave; + struct pbe *pbe = pagedir_nosave, *p; pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); + /* This is necessary for swsusp_free() */ + for_each_pb_page (p, pagedir_nosave) + SetPageNosaveFree(virt_to_page(p)); + for_each_pbe(p, pagedir_nosave) + SetPageNosaveFree(virt_to_page(p->address)); for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { struct page * page; page = pfn_to_page(zone_pfn + zone->zone_start_pfn); BUG_ON(!pbe); - pbe->orig_address = (long) page_address(page); + pbe->orig_address = (unsigned long)page_address(page); /* copy_page is not usable for copying task structs. */ memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); pbe = pbe->next; @@ -736,15 +740,6 @@ static void copy_data_pages(void) } -/** - * calc_nr - Determine the number of pages needed for a pbe list. 
- */ - -static int calc_nr(int nr_copy) -{ - return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); -} - /** * free_pagedir - free pages allocated with alloc_pagedir() */ @@ -755,6 +750,8 @@ static inline void free_pagedir(struct pbe *pblist) while (pblist) { pbe = (pblist + PB_PAGE_SKIP)->next; + ClearPageNosave(virt_to_page(pblist)); + ClearPageNosaveFree(virt_to_page(pblist)); free_page((unsigned long)pblist); pblist = pbe; } @@ -800,6 +797,16 @@ static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) pr_debug("create_pbe_list(): initialized %d PBEs\n", num); } +static void *alloc_image_page(void) +{ + void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + /** * alloc_pagedir - Allocate the page directory. * @@ -822,11 +829,11 @@ static struct pbe * alloc_pagedir(unsigned nr_pages) return NULL; pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + pblist = (struct pbe *)alloc_image_page(); for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; pbe = pbe->next, num += PBES_PER_PAGE) { pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + pbe->next = (struct pbe *)alloc_image_page(); } if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); @@ -836,51 +843,29 @@ static struct pbe * alloc_pagedir(unsigned nr_pages) } /** - * free_image_pages - Free pages allocated for snapshot + * Free pages we allocated for suspend. Suspend pages are alocated + * before atomic copy, so we need to free them after resume. */ -static void free_image_pages(void) +void swsusp_free(void) { - struct pbe * p; + struct zone *zone; + unsigned long zone_pfn; - for_each_pbe (p, pagedir_save) { - if (p->address) { - ClearPageNosave(virt_to_page(p->address)); - free_page(p->address); - p->address = 0; - } + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + if (PageNosave(page) && PageNosaveFree(page)) { + ClearPageNosave(page); + ClearPageNosaveFree(page); + free_page((long) page_address(page)); + } + } } } -/** - * alloc_image_pages - Allocate pages for the snapshot. - */ - -static int alloc_image_pages(void) -{ - struct pbe * p; - - for_each_pbe (p, pagedir_save) { - p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - if (!p->address) - return -ENOMEM; - SetPageNosave(virt_to_page(p->address)); - } - return 0; -} - -/* Free pages we allocated for suspend. Suspend pages are alocated - * before atomic copy, so we need to free them after resume. - */ -void swsusp_free(void) -{ - BUG_ON(PageNosave(virt_to_page(pagedir_save))); - BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); - free_image_pages(); - free_pagedir(pagedir_save); -} - - /** * enough_free_mem - Make sure we enough free memory to snapshot. 
* @@ -890,12 +875,9 @@ void swsusp_free(void) static int enough_free_mem(void) { - if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough free pages: Have %d\n", - nr_free_pages()); - return 0; - } - return 1; + pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); + return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO + + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); } @@ -914,33 +896,16 @@ static int enough_swap(void) struct sysinfo i; si_swapinfo(&i); - if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); - return 0; - } - return 1; + pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); + return i.freeswap > (nr_copy_pages + PAGES_FOR_IO + + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); } static int swsusp_alloc(void) { - int error; + struct pbe * p; pagedir_nosave = NULL; - nr_copy_pages = calc_nr(nr_copy_pages); - nr_copy_pages_check = nr_copy_pages; - - pr_debug("suspend: (pages needed: %d + %d free: %d)\n", - nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); - - if (!enough_free_mem()) - return -ENOMEM; - - if (!enough_swap()) - return -ENOSPC; - - if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + - !!(nr_copy_pages % PBES_PER_PAGE)) - return -ENOSPC; if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); @@ -948,10 +913,14 @@ static int swsusp_alloc(void) } create_pbe_list(pagedir_save, nr_copy_pages); pagedir_nosave = pagedir_save; - if ((error = alloc_image_pages())) { - printk(KERN_ERR "suspend: Allocating image pages failed.\n"); - swsusp_free(); - return error; + + for_each_pbe (p, pagedir_save) { + p->address = (unsigned long)alloc_image_page(); + if (!p->address) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return -ENOMEM; + } } return 0; @@ -963,7 +932,7 @@ static int suspend_prepare_image(void) pr_debug("swsusp: critical section: \n"); if (save_highmem()) { - printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); + printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); restore_highmem(); return -ENOMEM; } @@ -971,6 +940,28 @@ static int suspend_prepare_image(void) drain_local_pages(); count_data_pages(); printk("swsusp: Need to copy %u pages\n", nr_copy_pages); + nr_copy_pages_check = nr_copy_pages; + + pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", + nr_copy_pages, + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE), + PAGES_FOR_IO, nr_free_pages()); + + if (!enough_free_mem()) { + printk(KERN_ERR "swsusp: Not enough free memory\n"); + return -ENOMEM; + } + + if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + + !!(nr_copy_pages % PBES_PER_PAGE)) { + printk(KERN_ERR "swsusp: Too many image pages\n"); + return -ENOSPC; + } + + if (!enough_swap()) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } error = swsusp_alloc(); if (error) -- cgit v1.2.3 From 25761b6eb7b33823bcfff6bfe2a015badcd76fb8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 14:59:56 -0800 Subject: [PATCH] swsusp: move snapshot functionality to separate file The following patch moves the functionality of swsusp related to creating and handling the snapshot of memory to a separate file, snapshot.c This should enable us to untangle the code in the future and eventually to implement some parts of swsusp.c in the user space. The patch does not change the code. 
Signed-off-by: Rafael J. Wysocki Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/suspend.h | 6 + kernel/power/Makefile | 2 +- kernel/power/power.h | 17 ++ kernel/power/snapshot.c | 464 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/power/swsusp.c | 440 +-------------------------------------------- 5 files changed, 492 insertions(+), 437 deletions(-) create mode 100644 kernel/power/snapshot.c (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index ba448c760168..380915e9563d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -74,4 +74,10 @@ void __restore_processor_state(struct saved_context *ctxt); extern unsigned long get_usable_page(gfp_t gfp_mask); extern void free_eaten_memory(void); +/* + * XXX: We try to keep some more pages free so that I/O operations succeed + * without paging. Might this be more? + */ +#define PAGES_FOR_IO 512 + #endif /* _LINUX_SWSUSP_H */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa13..c71eb4579c07 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,7 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o process.o console.o pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o obj-$(CONFIG_SUSPEND_SMP) += smp.o diff --git a/kernel/power/power.h b/kernel/power/power.h index 6748de23e83c..e54dd8435de7 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,3 +53,20 @@ extern void thaw_processes(void); extern int pm_prepare_console(void); extern void pm_restore_console(void); + + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +extern unsigned int nr_copy_pages; +extern suspend_pagedir_t *pagedir_nosave; +extern suspend_pagedir_t *pagedir_save; + +extern asmlinkage int swsusp_arch_suspend(void); +extern asmlinkage int swsusp_arch_resume(void); + +extern int restore_highmem(void); +extern void free_pagedir(struct pbe *pblist); +extern struct pbe * alloc_pagedir(unsigned nr_pages); +extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); +extern int enough_swap(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 000000000000..0f0a7f306b0d --- /dev/null +++ b/kernel/power/snapshot.c @@ -0,0 +1,464 @@ +/* + * linux/kernel/power/swsusp.c + * + * This file is to realize architecture-independent + * machine suspend feature using pretty near only high-level routines + * + * Copyright (C) 1998-2005 Pavel Machek + * + * This file is released under the GPLv2, and is based on swsusp.c. 
+ * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "power.h" + + + + +#ifdef CONFIG_HIGHMEM +struct highmem_page { + char *data; + struct page *page; + struct highmem_page *next; +}; + +static struct highmem_page *highmem_copy; + +static int save_highmem_zone(struct zone *zone) +{ + unsigned long zone_pfn; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + struct page *page; + struct highmem_page *save; + void *kaddr; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + + if (!(pfn%1000)) + printk("."); + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* + * This condition results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. This should be + * corrected eventually when the cases giving rise to this + * are better understood. + */ + if (PageReserved(page)) { + printk("highmem reserved page?!\n"); + continue; + } + BUG_ON(PageNosave(page)); + if (PageNosaveFree(page)) + continue; + save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); + if (!save) + return -ENOMEM; + save->next = highmem_copy; + save->page = page; + save->data = (void *) get_zeroed_page(GFP_ATOMIC); + if (!save->data) { + kfree(save); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(save->data, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + highmem_copy = save; + } + return 0; +} +#endif /* CONFIG_HIGHMEM */ + + +static int save_highmem(void) +{ +#ifdef CONFIG_HIGHMEM + struct zone *zone; + int res = 0; + + pr_debug("swsusp: Saving Highmem\n"); + for_each_zone (zone) { + if (is_highmem(zone)) + res = save_highmem_zone(zone); + if (res) + return res; + } +#endif + return 0; +} + +int restore_highmem(void) +{ +#ifdef CONFIG_HIGHMEM + printk("swsusp: Restoring Highmem\n"); + while (highmem_copy) { + struct highmem_page *save = highmem_copy; + void *kaddr; + highmem_copy = save->next; + + kaddr = kmap_atomic(save->page, KM_USER0); + memcpy(kaddr, save->data, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + free_page((long) save->data); + kfree(save); + } +#endif + return 0; +} + + +static int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +/** + * saveable - Determine whether a page should be cloned or not. + * @pfn: The page + * + * We save a page if it's Reserved, and not in the range of pages + * statically defined as 'unsaveable', or if it isn't reserved, and + * isn't part of a free chunk of pages. 
+ */ + +static int saveable(struct zone * zone, unsigned long * zone_pfn) +{ + unsigned long pfn = *zone_pfn + zone->zone_start_pfn; + struct page * page; + + if (!pfn_valid(pfn)) + return 0; + + page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); + if (PageNosave(page)) + return 0; + if (PageReserved(page) && pfn_is_nosave(pfn)) { + pr_debug("[nosave pfn 0x%lx]", pfn); + return 0; + } + if (PageNosaveFree(page)) + return 0; + + return 1; +} + +static void count_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + nr_copy_pages = 0; + + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + nr_copy_pages += saveable(zone, &zone_pfn); + } +} + +static void copy_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *pbe = pagedir_nosave, *p; + + pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + /* This is necessary for swsusp_free() */ + for_each_pb_page (p, pagedir_nosave) + SetPageNosaveFree(virt_to_page(p)); + for_each_pbe(p, pagedir_nosave) + SetPageNosaveFree(virt_to_page(p->address)); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + if (saveable(zone, &zone_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + BUG_ON(!pbe); + pbe->orig_address = (unsigned long)page_address(page); + /* copy_page is not usable for copying task structs. */ + memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + pbe = pbe->next; + } + } + } + BUG_ON(pbe); +} + + +/** + * free_pagedir - free pages allocated with alloc_pagedir() + */ + +void free_pagedir(struct pbe *pblist) +{ + struct pbe *pbe; + + while (pblist) { + pbe = (pblist + PB_PAGE_SKIP)->next; + ClearPageNosave(virt_to_page(pblist)); + ClearPageNosaveFree(virt_to_page(pblist)); + free_page((unsigned long)pblist); + pblist = pbe; + } +} + +/** + * fill_pb_page - Create a list of PBEs on a given memory page + */ + +static inline void fill_pb_page(struct pbe *pbpage) +{ + struct pbe *p; + + p = pbpage; + pbpage += PB_PAGE_SKIP; + do + p->next = p + 1; + while (++p < pbpage); +} + +/** + * create_pbe_list - Create a list of PBEs on top of a given chain + * of memory pages allocated with alloc_pagedir() + */ + +void create_pbe_list(struct pbe *pblist, unsigned nr_pages) +{ + struct pbe *pbpage, *p; + unsigned num = PBES_PER_PAGE; + + for_each_pb_page (pbpage, pblist) { + if (num >= nr_pages) + break; + + fill_pb_page(pbpage); + num += PBES_PER_PAGE; + } + if (pbpage) { + for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) + p->next = p + 1; + p->next = NULL; + } + pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +static void *alloc_image_page(void) +{ + void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + +/** + * alloc_pagedir - Allocate the page directory. + * + * First, determine exactly how many pages we need and + * allocate them. + * + * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE + * struct pbe elements (pbes) and the last element in the page points + * to the next page. + * + * On each page we set up a list of struct_pbe elements. 
+ */ + +struct pbe * alloc_pagedir(unsigned nr_pages) +{ + unsigned num; + struct pbe *pblist, *pbe; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); + pblist = (struct pbe *)alloc_image_page(); + /* FIXME: rewrite this ugly loop */ + for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; + pbe = pbe->next, num += PBES_PER_PAGE) { + pbe += PB_PAGE_SKIP; + pbe->next = (struct pbe *)alloc_image_page(); + } + if (!pbe) { /* get_zeroed_page() failed */ + free_pagedir(pblist); + pblist = NULL; + } + return pblist; +} + +/** + * Free pages we allocated for suspend. Suspend pages are alocated + * before atomic copy, so we need to free them after resume. + */ + +void swsusp_free(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + if (PageNosave(page) && PageNosaveFree(page)) { + ClearPageNosave(page); + ClearPageNosaveFree(page); + free_page((long) page_address(page)); + } + } + } +} + + +/** + * enough_free_mem - Make sure we enough free memory to snapshot. + * + * Returns TRUE or FALSE after checking the number of available + * free pages. + */ + +static int enough_free_mem(void) +{ + pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); + return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO + + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); +} + + +static int swsusp_alloc(void) +{ + struct pbe * p; + + pagedir_nosave = NULL; + + if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + + !!(nr_copy_pages % PBES_PER_PAGE)) + return -ENOSPC; + + if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { + printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); + return -ENOMEM; + } + create_pbe_list(pagedir_save, nr_copy_pages); + pagedir_nosave = pagedir_save; + + for_each_pbe (p, pagedir_save) { + p->address = (unsigned long)alloc_image_page(); + if (!p->address) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return -ENOMEM; + } + } + + return 0; +} + +static int suspend_prepare_image(void) +{ + int error; + + pr_debug("swsusp: critical section: \n"); + if (save_highmem()) { + printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); + restore_highmem(); + return -ENOMEM; + } + + drain_local_pages(); + count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_copy_pages); + + pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", + nr_copy_pages, + nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE), + PAGES_FOR_IO, nr_free_pages()); + + if (!enough_free_mem()) { + printk(KERN_ERR "swsusp: Not enough free memory\n"); + return -ENOMEM; + } + + if (!enough_swap()) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } + + error = swsusp_alloc(); + if (error) + return error; + + /* During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(); + copy_data_pages(); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! Except we must write out our image of course. 
+ */ + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); + return 0; +} + + +asmlinkage int swsusp_save(void) +{ + return suspend_prepare_image(); +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index ae46506e2137..fc50b5d2dd26 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -5,7 +5,7 @@ * machine suspend feature using pretty near only high-level routines * * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001-2004 Pavel Machek + * Copyright (C) 1998,2001-2005 Pavel Machek * * This file is released under the GPLv2. * @@ -84,16 +84,10 @@ #define MAXKEY 32 #define MAXIV 32 -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - -/* Variables to be preserved over suspend */ -static int nr_copy_pages_check; - extern char resume_file[]; /* Local variables that should not be affected by save */ -static unsigned int nr_copy_pages __nosavedata = 0; +unsigned int nr_copy_pages __nosavedata = 0; /* Suspend pagedir is allocated before final copy, therefore it must be freed after resume @@ -109,7 +103,7 @@ static unsigned int nr_copy_pages __nosavedata = 0; MMU hardware. */ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; -static suspend_pagedir_t *pagedir_save; +suspend_pagedir_t *pagedir_save; #define SWSUSP_SIG "S1SUSPEND" @@ -123,12 +117,6 @@ static struct swsusp_header { static struct swsusp_info swsusp_info; -/* - * XXX: We try to keep some more pages free so that I/O operations succeed - * without paging. Might this be more? - */ -#define PAGES_FOR_IO 512 - /* * Saving part... */ @@ -552,335 +540,6 @@ static int write_suspend_image(void) goto Done; } - -#ifdef CONFIG_HIGHMEM -struct highmem_page { - char *data; - struct page *page; - struct highmem_page *next; -}; - -static struct highmem_page *highmem_copy; - -static int save_highmem_zone(struct zone *zone) -{ - unsigned long zone_pfn; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - struct page *page; - struct highmem_page *save; - void *kaddr; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - - if (!(pfn%1000)) - printk("."); - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - /* - * PageReserved results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. - * - * rvmalloc should not cause this, because all implementations - * appear to always be using vmalloc_32 on architectures with - * highmem. This is a good thing, because we would like to save - * rvmalloc pages. - * - * It appears to be triggered by pages which do not point to - * valid memory (see arch/i386/mm/init.c:one_highpage_init(), - * which sets PageReserved if the page does not point to valid - * RAM. - * - * XXX: must remove usage of PageReserved! 
- */ - if (PageReserved(page)) - continue; - BUG_ON(PageNosave(page)); - if (PageNosaveFree(page)) - continue; - save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); - if (!save) - return -ENOMEM; - save->next = highmem_copy; - save->page = page; - save->data = (void *) get_zeroed_page(GFP_ATOMIC); - if (!save->data) { - kfree(save); - return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); - memcpy(save->data, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - highmem_copy = save; - } - return 0; -} -#endif /* CONFIG_HIGHMEM */ - - -static int save_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int res = 0; - - pr_debug("swsusp: Saving Highmem\n"); - for_each_zone (zone) { - if (is_highmem(zone)) - res = save_highmem_zone(zone); - if (res) - return res; - } -#endif - return 0; -} - -static int restore_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - printk("swsusp: Restoring Highmem\n"); - while (highmem_copy) { - struct highmem_page *save = highmem_copy; - void *kaddr; - highmem_copy = save->next; - - kaddr = kmap_atomic(save->page, KM_USER0); - memcpy(kaddr, save->data, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - free_page((long) save->data); - kfree(save); - } -#endif - return 0; -} - - -static int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -/** - * saveable - Determine whether a page should be cloned or not. - * @pfn: The page - * - * We save a page if it's Reserved, and not in the range of pages - * statically defined as 'unsaveable', or if it isn't reserved, and - * isn't part of a free chunk of pages. - */ - -static int saveable(struct zone * zone, unsigned long * zone_pfn) -{ - unsigned long pfn = *zone_pfn + zone->zone_start_pfn; - struct page * page; - - if (!pfn_valid(pfn)) - return 0; - - page = pfn_to_page(pfn); - if (PageNosave(page)) - return 0; - if (pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); - return 0; - } - if (PageNosaveFree(page)) - return 0; - - return 1; -} - -static void count_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - - nr_copy_pages = 0; - - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - nr_copy_pages += saveable(zone, &zone_pfn); - } -} - -static void copy_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - struct pbe *pbe = pagedir_nosave, *p; - - pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - /* This is necessary for swsusp_free() */ - for_each_pb_page (p, pagedir_nosave) - SetPageNosaveFree(virt_to_page(p)); - for_each_pbe(p, pagedir_nosave) - SetPageNosaveFree(virt_to_page(p->address)); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - if (saveable(zone, &zone_pfn)) { - struct page * page; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); - BUG_ON(!pbe); - pbe->orig_address = (unsigned long)page_address(page); - /* copy_page is not usable for copying task structs. 
*/ - memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); - pbe = pbe->next; - } - } - } - BUG_ON(pbe); -} - - -/** - * free_pagedir - free pages allocated with alloc_pagedir() - */ - -static inline void free_pagedir(struct pbe *pblist) -{ - struct pbe *pbe; - - while (pblist) { - pbe = (pblist + PB_PAGE_SKIP)->next; - ClearPageNosave(virt_to_page(pblist)); - ClearPageNosaveFree(virt_to_page(pblist)); - free_page((unsigned long)pblist); - pblist = pbe; - } -} - -/** - * fill_pb_page - Create a list of PBEs on a given memory page - */ - -static inline void fill_pb_page(struct pbe *pbpage) -{ - struct pbe *p; - - p = pbpage; - pbpage += PB_PAGE_SKIP; - do - p->next = p + 1; - while (++p < pbpage); -} - -/** - * create_pbe_list - Create a list of PBEs on top of a given chain - * of memory pages allocated with alloc_pagedir() - */ - -static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) -{ - struct pbe *pbpage, *p; - unsigned num = PBES_PER_PAGE; - - for_each_pb_page (pbpage, pblist) { - if (num >= nr_pages) - break; - - fill_pb_page(pbpage); - num += PBES_PER_PAGE; - } - if (pbpage) { - for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) - p->next = p + 1; - p->next = NULL; - } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); -} - -static void *alloc_image_page(void) -{ - void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - if (res) { - SetPageNosave(virt_to_page(res)); - SetPageNosaveFree(virt_to_page(res)); - } - return res; -} - -/** - * alloc_pagedir - Allocate the page directory. - * - * First, determine exactly how many pages we need and - * allocate them. - * - * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE - * struct pbe elements (pbes) and the last element in the page points - * to the next page. - * - * On each page we set up a list of struct_pbe elements. - */ - -static struct pbe * alloc_pagedir(unsigned nr_pages) -{ - unsigned num; - struct pbe *pblist, *pbe; - - if (!nr_pages) - return NULL; - - pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)alloc_image_page(); - for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; - pbe = pbe->next, num += PBES_PER_PAGE) { - pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)alloc_image_page(); - } - if (!pbe) { /* get_zeroed_page() failed */ - free_pagedir(pblist); - pblist = NULL; - } - return pblist; -} - -/** - * Free pages we allocated for suspend. Suspend pages are alocated - * before atomic copy, so we need to free them after resume. - */ - -void swsusp_free(void) -{ - struct zone *zone; - unsigned long zone_pfn; - - for_each_zone(zone) { - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { - struct page * page; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); - if (PageNosave(page) && PageNosaveFree(page)) { - ClearPageNosave(page); - ClearPageNosaveFree(page); - free_page((long) page_address(page)); - } - } - } -} - -/** - * enough_free_mem - Make sure we enough free memory to snapshot. - * - * Returns TRUE or FALSE after checking the number of available - * free pages. - */ - -static int enough_free_mem(void) -{ - pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); - return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO + - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); -} - - /** * enough_swap - Make sure we have enough swap to save the image. 
* @@ -891,7 +550,7 @@ static int enough_free_mem(void) * We should only consider resume_device. */ -static int enough_swap(void) +int enough_swap(void) { struct sysinfo i; @@ -901,88 +560,6 @@ static int enough_swap(void) nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); } -static int swsusp_alloc(void) -{ - struct pbe * p; - - pagedir_nosave = NULL; - - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { - printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return -ENOMEM; - } - create_pbe_list(pagedir_save, nr_copy_pages); - pagedir_nosave = pagedir_save; - - for_each_pbe (p, pagedir_save) { - p->address = (unsigned long)alloc_image_page(); - if (!p->address) { - printk(KERN_ERR "suspend: Allocating image pages failed.\n"); - swsusp_free(); - return -ENOMEM; - } - } - - return 0; -} - -static int suspend_prepare_image(void) -{ - int error; - - pr_debug("swsusp: critical section: \n"); - if (save_highmem()) { - printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); - restore_highmem(); - return -ENOMEM; - } - - drain_local_pages(); - count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_copy_pages); - nr_copy_pages_check = nr_copy_pages; - - pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", - nr_copy_pages, - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE), - PAGES_FOR_IO, nr_free_pages()); - - if (!enough_free_mem()) { - printk(KERN_ERR "swsusp: Not enough free memory\n"); - return -ENOMEM; - } - - if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + - !!(nr_copy_pages % PBES_PER_PAGE)) { - printk(KERN_ERR "swsusp: Too many image pages\n"); - return -ENOSPC; - } - - if (!enough_swap()) { - printk(KERN_ERR "swsusp: Not enough free swap\n"); - return -ENOSPC; - } - - error = swsusp_alloc(); - if (error) - return error; - - /* During allocating of suspend pagedir, new cold pages may appear. - * Kill them. - */ - drain_local_pages(); - copy_data_pages(); - - /* - * End of critical section. From now on, we can write to memory, - * but we should not touch disk. This specially means we must _not_ - * touch swap space! Except we must write out our image of course. - */ - - printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); - return 0; -} - /* It is important _NOT_ to umount filesystems at this point. We want * them synced (in case something goes wrong) but we DO not want to mark @@ -1002,14 +579,6 @@ int swsusp_write(void) } -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - - -asmlinkage int swsusp_save(void) -{ - return suspend_prepare_image(); -} int swsusp_suspend(void) { @@ -1041,7 +610,6 @@ int swsusp_suspend(void) printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); - BUG_ON (nr_copy_pages_check != nr_copy_pages); restore_highmem(); device_power_up(); local_irq_enable(); -- cgit v1.2.3 From a0f496517f3e28d651d0cbbcf2d4fb701ed6957e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 14:59:57 -0800 Subject: [PATCH] swsusp: reduce the use of global variables Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 78 ++++++++++++++++++++++++------------------------- kernel/power/swsusp.c | 6 ++-- 3 files changed, 43 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index e54dd8435de7..28afcb090149 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -69,4 +69,4 @@ extern int restore_highmem(void); extern void free_pagedir(struct pbe *pblist); extern struct pbe * alloc_pagedir(unsigned nr_pages); extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); -extern int enough_swap(void); +extern int enough_swap(unsigned nr_pages); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0f0a7f306b0d..03916cf3ff02 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -187,37 +187,38 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn) return 1; } -static void count_data_pages(void) +static unsigned count_data_pages(void) { struct zone *zone; unsigned long zone_pfn; + unsigned n; - nr_copy_pages = 0; - + n = 0; for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - nr_copy_pages += saveable(zone, &zone_pfn); + n += saveable(zone, &zone_pfn); } + return n; } -static void copy_data_pages(void) +static void copy_data_pages(struct pbe *pblist) { struct zone *zone; unsigned long zone_pfn; - struct pbe *pbe = pagedir_nosave, *p; + struct pbe *pbe, *p; - pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); + pbe = pblist; for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); /* This is necessary for swsusp_free() */ - for_each_pb_page (p, pagedir_nosave) + for_each_pb_page (p, pblist) SetPageNosaveFree(virt_to_page(p)); - for_each_pbe(p, pagedir_nosave) + for_each_pbe (p, pblist) SetPageNosaveFree(virt_to_page(p->address)); for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { @@ -370,46 +371,39 @@ void swsusp_free(void) * free pages. 
*/ -static int enough_free_mem(void) +static int enough_free_mem(unsigned nr_pages) { pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); - return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO + - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); + return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -static int swsusp_alloc(void) +static struct pbe *swsusp_alloc(unsigned nr_pages) { - struct pbe * p; - - pagedir_nosave = NULL; - - if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + - !!(nr_copy_pages % PBES_PER_PAGE)) - return -ENOSPC; + struct pbe *pblist, *p; - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { + if (!(pblist = alloc_pagedir(nr_pages))) { printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return -ENOMEM; + return NULL; } - create_pbe_list(pagedir_save, nr_copy_pages); - pagedir_nosave = pagedir_save; + create_pbe_list(pblist, nr_pages); - for_each_pbe (p, pagedir_save) { + for_each_pbe (p, pblist) { p->address = (unsigned long)alloc_image_page(); if (!p->address) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); swsusp_free(); - return -ENOMEM; + return NULL; } } - return 0; + return pblist; } static int suspend_prepare_image(void) { - int error; + unsigned nr_pages; pr_debug("swsusp: critical section: \n"); if (save_highmem()) { @@ -419,33 +413,37 @@ static int suspend_prepare_image(void) } drain_local_pages(); - count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_copy_pages); + nr_pages = count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_pages); pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", - nr_copy_pages, - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE), + nr_pages, + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, PAGES_FOR_IO, nr_free_pages()); - if (!enough_free_mem()) { + /* This is needed because of the fixed size of swsusp_info */ + if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) + return -ENOSPC; + + if (!enough_free_mem(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; } - if (!enough_swap()) { + if (!enough_swap(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); return -ENOSPC; } - error = swsusp_alloc(); - if (error) - return error; + pagedir_nosave = swsusp_alloc(nr_pages); + if (!pagedir_nosave) + return -ENOMEM; /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(); - copy_data_pages(); + copy_data_pages(pagedir_nosave); /* * End of critical section. From now on, we can write to memory, @@ -453,7 +451,9 @@ static int suspend_prepare_image(void) * touch swap space! Except we must write out our image of course. */ - printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); + nr_copy_pages = nr_pages; + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); return 0; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index fc50b5d2dd26..f6abfdb0a02a 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -550,14 +550,14 @@ static int write_suspend_image(void) * We should only consider resume_device. 
*/ -int enough_swap(void) +int enough_swap(unsigned nr_pages) { struct sysinfo i; si_swapinfo(&i); pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); - return i.freeswap > (nr_copy_pages + PAGES_FOR_IO + - nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE)); + return i.freeswap > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -- cgit v1.2.3 From 2c1b4a5ca48831595979a850f40ced8e7da026f8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 14:59:58 -0800 Subject: [PATCH] swsusp: rework memory freeing on resume The following patch makes swsusp use the PG_nosave and PG_nosave_free flags to mark pages that should be freed in case of an error during resume. This allows us to simplify the code and to use swsusp_free() in all of the swsusp's resume error paths, which makes them actually work. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/suspend.c | 84 ++++++++------------------------- include/linux/suspend.h | 3 +- kernel/power/disk.c | 14 +++--- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 2 +- kernel/power/swsusp.c | 110 ++++++++++++++----------------------------- 6 files changed, 65 insertions(+), 150 deletions(-) (limited to 'kernel') diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 02516823f514..fd2bef780882 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -147,57 +147,7 @@ extern int restore_image(void); pgd_t *temp_level4_pgt; -static void **pages; - -static inline void *__add_page(void) -{ - void **c; - - c = (void **)get_usable_page(GFP_ATOMIC); - if (c) { - *c = pages; - pages = c; - } - return c; -} - -static inline void *__next_page(void) -{ - void **c; - - c = pages; - if (c) { - pages = *c; - *c = NULL; - } - return c; -} - -/* - * Try to allocate as many usable pages as needed and daisy chain them. 
- * If one allocation fails, free the pages allocated so far - */ -static int alloc_usable_pages(unsigned long n) -{ - void *p; - - pages = NULL; - do - if (!__add_page()) - break; - while (--n); - if (n) { - p = __next_page(); - while (p) { - free_page((unsigned long)p); - p = __next_page(); - } - return -ENOMEM; - } - return 0; -} - -static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { long i, j; @@ -211,7 +161,9 @@ static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long e if (paddr >= end) break; - pmd = (pmd_t *)__next_page(); + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { unsigned long pe; @@ -223,13 +175,17 @@ static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long e set_pmd(pmd, __pmd(pe)); } } + return 0; } -static void set_up_temporary_mappings(void) +static int set_up_temporary_mappings(void) { unsigned long start, end, next; + int error; - temp_level4_pgt = (pgd_t *)__next_page(); + temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!temp_level4_pgt) + return -ENOMEM; /* It is safe to reuse the original kernel mapping */ set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), @@ -240,29 +196,27 @@ static void set_up_temporary_mappings(void) end = (unsigned long)pfn_to_kaddr(end_pfn); for (; start < end; start = next) { - pud_t *pud = (pud_t *)__next_page(); + pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; next = start + PGDIR_SIZE; if (next > end) next = end; - res_phys_pud_init(pud, __pa(start), __pa(next)); + if ((error = res_phys_pud_init(pud, __pa(start), __pa(next)))) + return error; set_pgd(temp_level4_pgt + pgd_index(start), mk_kernel_pgd(__pa(pud))); } + return 0; } int swsusp_arch_resume(void) { - unsigned long n; + int error; - n = ((end_pfn << PAGE_SHIFT) + PUD_SIZE - 1) >> PUD_SHIFT; - n += (n + PTRS_PER_PUD - 1) / PTRS_PER_PUD + 1; - pr_debug("swsusp_arch_resume(): pages needed = %lu\n", n); - if (alloc_usable_pages(n)) { - free_eaten_memory(); - return -ENOMEM; - } /* We have got enough memory and from now on we cannot recover */ - set_up_temporary_mappings(); + if ((error = set_up_temporary_mappings())) + return error; restore_image(); return 0; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 380915e9563d..a61c04f804b2 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -71,8 +71,7 @@ void restore_processor_state(void); struct saved_context; void __save_processor_state(struct saved_context *ctxt); void __restore_processor_state(struct saved_context *ctxt); -extern unsigned long get_usable_page(gfp_t gfp_mask); -extern void free_eaten_memory(void); +unsigned long get_safe_page(gfp_t gfp_mask); /* * XXX: We try to keep some more pages free so that I/O operations succeed diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 761956e813f5..44ef5e799df0 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -30,7 +30,6 @@ extern int swsusp_check(void); extern int swsusp_read(void); extern void swsusp_close(void); extern int swsusp_resume(void); -extern int swsusp_free(void); static int noresume = 0; @@ -252,14 +251,17 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) - goto Cleanup; + if ((error = swsusp_read())) { + 
swsusp_free(); + goto Thaw; + } pr_debug("PM: Preparing devices for restore.\n"); if ((error = device_suspend(PMSG_FREEZE))) { printk("Some devices failed to suspend\n"); - goto Free; + swsusp_free(); + goto Thaw; } mb(); @@ -268,9 +270,7 @@ static int software_resume(void) swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); device_resume(); - Free: - swsusp_free(); - Cleanup: + Thaw: unprepare_processes(); Done: /* For success case, the suspend path will release the lock */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 28afcb090149..d4fd96a135ab 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -66,7 +66,7 @@ extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); extern int restore_highmem(void); -extern void free_pagedir(struct pbe *pblist); extern struct pbe * alloc_pagedir(unsigned nr_pages); extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); +extern void swsusp_free(void); extern int enough_swap(unsigned nr_pages); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 03916cf3ff02..84e686bdb40b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -240,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist) * free_pagedir - free pages allocated with alloc_pagedir() */ -void free_pagedir(struct pbe *pblist) +static void free_pagedir(struct pbe *pblist) { struct pbe *pbe; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f6abfdb0a02a..50667f4f3a2b 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -629,6 +629,11 @@ int swsusp_resume(void) * execution continues at place where swsusp_arch_suspend was called */ BUG_ON(!error); + /* The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures + */ + swsusp_free(); restore_processor_state(); restore_highmem(); touch_softlockup_watchdog(); @@ -644,54 +649,28 @@ int swsusp_resume(void) * * We don't know which pages are usable until we allocate them. * - * Allocated but unusable (ie eaten) memory pages are linked together - * to create a list, so that we can free them easily - * - * We could have used a type other than (void *) - * for this purpose, but ... 
+ * Allocated but unusable (ie eaten) memory pages are marked so that + * swsusp_free() can release them */ -static void **eaten_memory = NULL; -static inline void eat_page(void *page) -{ - void **c; - - c = eaten_memory; - eaten_memory = page; - *eaten_memory = c; -} - -unsigned long get_usable_page(gfp_t gfp_mask) +unsigned long get_safe_page(gfp_t gfp_mask) { unsigned long m; - m = get_zeroed_page(gfp_mask); - while (!PageNosaveFree(virt_to_page(m))) { - eat_page((void *)m); + do { m = get_zeroed_page(gfp_mask); - if (!m) - break; + if (m && PageNosaveFree(virt_to_page(m))) + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(m)); + } while (m && PageNosaveFree(virt_to_page(m))); + if (m) { + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(m)); + SetPageNosaveFree(virt_to_page(m)); } return m; } -void free_eaten_memory(void) -{ - unsigned long m; - void **c; - int i = 0; - - c = eaten_memory; - while (c) { - m = (unsigned long)c; - c = *c; - free_page(m); - i++; - } - eaten_memory = NULL; - pr_debug("swsusp: %d unused pages freed\n", i); -} - /** * check_pagedir - We ensure here that pages that the PBEs point to * won't collide with pages where we're going to restore from the loaded @@ -709,7 +688,7 @@ static int check_pagedir(struct pbe *pblist) p->address = 0UL; for_each_pbe (p, pblist) { - p->address = get_usable_page(GFP_ATOMIC); + p->address = get_safe_page(GFP_ATOMIC); if (!p->address) return -ENOMEM; } @@ -728,7 +707,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) unsigned long zone_pfn; struct pbe *pbpage, *tail, *p; void *m; - int rel = 0, error = 0; + int rel = 0; if (!pblist) /* a sanity check */ return NULL; @@ -736,41 +715,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", swsusp_info.pagedir_pages); - /* Set page flags */ + /* Clear page flags */ for_each_zone (zone) { for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - SetPageNosaveFree(pfn_to_page(zone_pfn + + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) + ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); } - /* Clear orig addresses */ + /* Mark orig addresses */ for_each_pbe (p, pblist) - ClearPageNosaveFree(virt_to_page(p->orig_address)); + SetPageNosaveFree(virt_to_page(p->orig_address)); tail = pblist + PB_PAGE_SKIP; /* Relocate colliding pages */ for_each_pb_page (pbpage, pblist) { - if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { - m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); - if (!m) { - error = -ENOMEM; - break; - } + if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) { + m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD); + if (!m) + return NULL; memcpy(m, (void *)pbpage, PAGE_SIZE); if (pbpage == pblist) pblist = (struct pbe *)m; else tail->next = (struct pbe *)m; - - eat_page((void *)pbpage); pbpage = (struct pbe *)m; /* We have to link the PBEs again */ - for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) if (p->next) /* needed to save the end */ p->next = p + 1; @@ -780,15 +755,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) tail = pbpage + PB_PAGE_SKIP; } - if (error) { - printk("\nswsusp: Out of memory\n\n"); - free_pagedir(pblist); - free_eaten_memory(); - pblist = NULL; - /* Is this even worth handling? It should never ever happen, and we - have just lost user's state, anyway... 
*/ - } else - printk("swsusp: Relocated %d pages\n", rel); + /* This is for swsusp_free() */ + for_each_pb_page (pbpage, pblist) { + SetPageNosave(virt_to_page(pbpage)); + SetPageNosaveFree(virt_to_page(pbpage)); + } + + printk("swsusp: Relocated %d pages\n", rel); return pblist; } @@ -1006,9 +979,7 @@ static int read_pagedir(struct pbe *pblist) break; } - if (error) - free_pagedir(pblist); - else + if (!error) BUG_ON(i != swsusp_info.pagedir_pages); return error; @@ -1051,15 +1022,6 @@ static int read_suspend_image(void) if (!error) error = data_read(pagedir_nosave); - if (error) { /* We fail cleanly */ - free_eaten_memory(); - for_each_pbe (p, pagedir_nosave) - if (p->address) { - free_page(p->address); - p->address = 0UL; - } - free_pagedir(pagedir_nosave); - } return error; } -- cgit v1.2.3 From 96bc7aec20b50761822f96130127b8e31e168af1 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 30 Oct 2005 14:59:58 -0800 Subject: [PATCH] swsusp: remove unneccessary includes Cleanup comments and remove unneccessary includes. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 25 ++----------------------- kernel/power/swsusp.c | 9 +-------- 2 files changed, 3 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 84e686bdb40b..b4a923e59bc5 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,8 +1,7 @@ /* - * linux/kernel/power/swsusp.c + * linux/kernel/power/snapshot.c * - * This file is to realize architecture-independent - * machine suspend feature using pretty near only high-level routines + * This file provide system snapshot/restore functionality. * * Copyright (C) 1998-2005 Pavel Machek * @@ -15,30 +14,16 @@ #include #include #include -#include -#include -#include #include -#include #include -#include -#include -#include #include -#include #include -#include -#include #include #include -#include -#include #include #include #include #include -#include -#include #include #include @@ -46,15 +31,9 @@ #include #include -#include -#include -#include - #include "power.h" - - #ifdef CONFIG_HIGHMEM struct highmem_page { char *data; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 50667f4f3a2b..ae8425b69b44 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1,8 +1,7 @@ /* * linux/kernel/power/swsusp.c * - * This file is to realize architecture-independent - * machine suspend feature using pretty near only high-level routines + * This file provides code to write suspend image to swap and read it back. * * Copyright (C) 1998-2001 Gabor Kuti * Copyright (C) 1998,2001-2005 Pavel Machek @@ -47,11 +46,7 @@ #include #include #include -#include #include -#include -#include -#include #include #include #include @@ -63,10 +58,8 @@ #include #include #include -#include #include #include -#include #include #include -- cgit v1.2.3 From de491861e1457c31aed6d44d96afb549365ff790 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 30 Oct 2005 14:59:59 -0800 Subject: [PATCH] swsusp: cleanups Reduce number of ifdefs somehow, and fix whitespace a bit. No real code changes. 
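The ifdef reduction in the diff below follows a standard kernel idiom: the full definition is compiled only when CONFIG_HIGHMEM is set, and a one-line stub is supplied in an #else branch, so call sites never need conditionals of their own. Condensed from the resulting snapshot.c for illustration (it relies on the highmem_copy list and struct highmem_page defined earlier in that file):

#ifdef CONFIG_HIGHMEM
int restore_highmem(void)
{
	/* Full implementation, compiled only with highmem support:
	 * copy each saved highmem page back to its page frame.
	 */
	while (highmem_copy) {
		struct highmem_page *save = highmem_copy;
		void *kaddr;

		highmem_copy = save->next;
		kaddr = kmap_atomic(save->page, KM_USER0);
		memcpy(kaddr, save->data, PAGE_SIZE);
		kunmap_atomic(kaddr, KM_USER0);
		free_page((long)save->data);
		kfree(save);
	}
	return 0;
}
#else
/* Trivial stub: callers may invoke restore_highmem() unconditionally */
int restore_highmem(void) { return 0; }
#endif /* CONFIG_HIGHMEM */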
Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b4a923e59bc5..72787f925630 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -33,7 +33,6 @@ #include "power.h" - #ifdef CONFIG_HIGHMEM struct highmem_page { char *data; @@ -88,12 +87,10 @@ static int save_highmem_zone(struct zone *zone) } return 0; } -#endif /* CONFIG_HIGHMEM */ static int save_highmem(void) { -#ifdef CONFIG_HIGHMEM struct zone *zone; int res = 0; @@ -104,13 +101,11 @@ static int save_highmem(void) if (res) return res; } -#endif return 0; } int restore_highmem(void) { -#ifdef CONFIG_HIGHMEM printk("swsusp: Restoring Highmem\n"); while (highmem_copy) { struct highmem_page *save = highmem_copy; @@ -123,9 +118,12 @@ int restore_highmem(void) free_page((long) save->data); kfree(save); } -#endif return 0; } +#else +static int save_highmem(void) { return 0; } +int restore_highmem(void) { return 0; } +#endif /* CONFIG_HIGHMEM */ static int pfn_is_nosave(unsigned long pfn) @@ -144,10 +142,10 @@ static int pfn_is_nosave(unsigned long pfn) * isn't part of a free chunk of pages. */ -static int saveable(struct zone * zone, unsigned long * zone_pfn) +static int saveable(struct zone *zone, unsigned long *zone_pfn) { unsigned long pfn = *zone_pfn + zone->zone_start_pfn; - struct page * page; + struct page *page; if (!pfn_valid(pfn)) return 0; @@ -201,7 +199,7 @@ static void copy_data_pages(struct pbe *pblist) SetPageNosaveFree(virt_to_page(p->address)); for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { - struct page * page; + struct page *page; page = pfn_to_page(zone_pfn + zone->zone_start_pfn); BUG_ON(!pbe); pbe->orig_address = (unsigned long)page_address(page); @@ -295,7 +293,7 @@ static void *alloc_image_page(void) * On each page we set up a list of struct_pbe elements. */ -struct pbe * alloc_pagedir(unsigned nr_pages) +struct pbe *alloc_pagedir(unsigned nr_pages) { unsigned num; struct pbe *pblist, *pbe; @@ -304,12 +302,12 @@ struct pbe * alloc_pagedir(unsigned nr_pages) return NULL; pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)alloc_image_page(); + pblist = alloc_image_page(); /* FIXME: rewrite this ugly loop */ for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; pbe = pbe->next, num += PBES_PER_PAGE) { pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)alloc_image_page(); + pbe->next = alloc_image_page(); } if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); -- cgit v1.2.3 From 2e32a43efdc8175579cc91e8b620ac331376a437 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 15:00:00 -0800 Subject: [PATCH] swsusp: get rid of unnecessary wrapper function The following patch merges two functions in a trivial way. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 72787f925630..42a628704398 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -378,7 +378,7 @@ static struct pbe *swsusp_alloc(unsigned nr_pages) return pblist; } -static int suspend_prepare_image(void) +asmlinkage int swsusp_save(void) { unsigned nr_pages; @@ -433,9 +433,3 @@ static int suspend_prepare_image(void) printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); return 0; } - - -asmlinkage int swsusp_save(void) -{ - return suspend_prepare_image(); -} -- cgit v1.2.3 From 0245b3e787dc3267a915e1f56419e7e9c197e148 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 30 Oct 2005 15:00:01 -0800 Subject: [PATCH] swsusp: two simplifications The following patch simplifies the progress meter in disk.c:free_some_memory() and makes disk.c:pm_suspend_disk() call device_resume() explicitly in the suspend path. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 8 ++------ kernel/power/swsusp.c | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 44ef5e799df0..027322a564f4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -92,10 +92,7 @@ static void free_some_memory(void) printk("Freeing memory... "); while ((tmp = shrink_all_memory(10000))) { pages += tmp; - printk("\b%c", p[i]); - i++; - if (i > 3) - i = 0; + printk("\b%c", p[i++ % 4]); } printk("\bdone (%li pages freed)\n", pages); } @@ -177,13 +174,12 @@ int pm_suspend_disk(void) goto Done; if (in_suspend) { + device_resume(); pr_debug("PM: writing image.\n"); error = swsusp_write(); if (!error) power_down(pm_disk_mode); else { - /* swsusp_write can not fail in device_resume, - no need to do second device_resume */ swsusp_free(); unprepare_processes(); return error; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index ae8425b69b44..12db1d2ad61f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -562,7 +562,7 @@ int enough_swap(unsigned nr_pages) int swsusp_write(void) { int error; - device_resume(); + lock_swapdevices(); error = write_suspend_image(); /* This will unlock ignored swap devices since writing is finished */ -- cgit v1.2.3 From eb9289eb20df6b54214c45ac7c6bf5179a149026 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sun, 30 Oct 2005 15:00:01 -0800 Subject: [PATCH] introduce .valid callback for pm_ops Add a pm_ops.valid callback, so that only the available pm states show up in /sys/power/state. This also makes enter_state report an error for invalid states earlier, before the actual suspend/resume is attempted.
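For illustration, here is a minimal sketch of how a platform might wire up such a callback and how enter_state() gates on it. The types and values are stand-ins for the real suspend_state_t and struct pm_ops (the state numbers are invented for the sketch, and -1 stands in for -ENODEV):

#include <stdio.h>

typedef int suspend_state_t;

/* Illustrative state numbers, not the kernel's definitions. */
enum { SKETCH_STANDBY = 1, SKETCH_MEM = 3 };

/* Reduced shape of struct pm_ops: just the two callbacks used here. */
struct pm_ops_sketch {
	int (*valid)(suspend_state_t state);
	int (*enter)(suspend_state_t state);
};

/* A platform that can only suspend to RAM. */
static int plat_valid(suspend_state_t state)
{
	return state == SKETCH_MEM;
}

static int plat_enter(suspend_state_t state)
{
	printf("entering state %d\n", state);
	return 0;
}

static struct pm_ops_sketch ops = {
	.valid = plat_valid,
	.enter = plat_enter,
};

/*
 * The gate enter_state() now applies: a missing .valid means
 * "all states allowed"; -1 stands in for -ENODEV.
 */
static int enter_state(suspend_state_t state)
{
	if (ops.valid && !ops.valid(state))
		return -1;
	return ops.enter(state);
}

int main(void)
{
	printf("standby -> %d\n", enter_state(SKETCH_STANDBY));
	printf("mem     -> %d\n", enter_state(SKETCH_MEM));
	return 0;
}

The standby attempt is rejected up front, before any device or platform code runs, which is exactly the earlier error report the patch is after.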
Signed-off-by: Shaohua Li Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/sleep/main.c | 8 ++++++++ include/linux/pm.h | 1 + kernel/power/main.c | 5 ++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index aee50b453265..930427fc0c4b 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -158,7 +158,15 @@ int acpi_suspend(u32 acpi_state) return -EINVAL; } +static int acpi_pm_state_valid(suspend_state_t pm_state) +{ + u32 acpi_state = acpi_suspend_states[pm_state]; + + return sleep_states[acpi_state]; +} + static struct pm_ops acpi_pm_ops = { + .valid = acpi_pm_state_valid, .prepare = acpi_pm_prepare, .enter = acpi_pm_enter, .finish = acpi_pm_finish, diff --git a/include/linux/pm.h b/include/linux/pm.h index c61d5de837ef..1514098d156d 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -170,6 +170,7 @@ typedef int __bitwise suspend_disk_method_t; struct pm_ops { suspend_disk_method_t pm_disk_mode; + int (*valid)(suspend_state_t state); int (*prepare)(suspend_state_t state); int (*enter)(suspend_state_t state); int (*finish)(suspend_state_t state); diff --git a/kernel/power/main.c b/kernel/power/main.c index 22bdc93cc038..18d7d693fbba 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -167,6 +167,8 @@ static int enter_state(suspend_state_t state) { int error; + if (pm_ops->valid && !pm_ops->valid(state)) + return -ENODEV; if (down_trylock(&pm_sem)) return -EBUSY; @@ -236,7 +238,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) char * s = buf; for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i]) + if (pm_states[i] && pm_ops && (!pm_ops->valid + ||(pm_ops->valid && pm_ops->valid(i)))) s += sprintf(s,"%s ",pm_states[i]); } s += sprintf(s,"\n"); -- cgit v1.2.3 From a8db2db1e6a8d323d87a67c5391d48fe2b97faf5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:01:38 -0800 Subject: [PATCH] introduce setup_timer() helper Every user of init_timer() also needs to initialize ->function and ->data fields. This patch adds a simple setup_timer() helper for that. The schedule_timeout() is patched as an example of usage. 
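As a standalone illustration of what the helper buys a caller, the sketch below pairs a reduced timer_list stand-in with the new helper; only the fields the helper touches are modeled, and the expiry call in main() imitates what the timer wheel would do on expiry (this is a userspace re-creation, not the kernel code):

#include <stdio.h>

/* Reduced stand-in for struct timer_list: only what the helper touches. */
struct timer_list {
	unsigned long expires;
	void (*function)(unsigned long);
	unsigned long data;
};

static void init_timer(struct timer_list *timer)
{
	/* base/bookkeeping setup elided in this sketch */
	timer->expires = 0;
}

/* The helper: one call replaces two assignments plus init_timer(). */
static void setup_timer(struct timer_list *timer,
			void (*function)(unsigned long),
			unsigned long data)
{
	timer->function = function;
	timer->data = data;
	init_timer(timer);
}

static void process_timeout(unsigned long data)
{
	printf("timer fired, data=%#lx\n", data);
}

int main(void)
{
	struct timer_list t;

	setup_timer(&t, process_timeout, 0x1234UL);
	t.function(t.data);	/* what the timer wheel does on expiry */
	return 0;
}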
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/i8259.c | 8 ++++---- include/linux/timer.h | 9 +++++++++ kernel/timer.c | 8 ++------ 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index b2a238b5a17e..c6c9791d77c1 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -494,7 +494,7 @@ void invalidate_interrupt7(void); void thermal_interrupt(void); void i8254_timer_resume(void); -static void setup_timer(void) +static void setup_timer_hardware(void) { outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); @@ -505,13 +505,13 @@ static void setup_timer(void) static int timer_resume(struct sys_device *dev) { - setup_timer(); + setup_timer_hardware(); return 0; } void i8254_timer_resume(void) { - setup_timer(); + setup_timer_hardware(); } static struct sysdev_class timer_sysclass = { @@ -594,7 +594,7 @@ void __init init_IRQ(void) * Set the clock to HZ Hz, we already have a valid * vector now: */ - setup_timer(); + setup_timer_hardware(); if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/include/linux/timer.h b/include/linux/timer.h index 3340f3bd135d..ddd5bbe1fc8e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -38,6 +38,15 @@ extern struct timer_base_s __init_timer_base; void fastcall init_timer(struct timer_list * timer); +static inline void setup_timer(struct timer_list * timer, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer(timer); +} + /*** * timer_pending - is a timer pending? * @timer: the timer in question diff --git a/kernel/timer.c b/kernel/timer.c index 6a2e5f8dc725..6ed1a826e5ce 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1146,12 +1146,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; - - add_timer(&timer); + setup_timer(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire); schedule(); del_singleshot_timer_sync(&timer); -- cgit v1.2.3 From 61e1a9ea4b425eb8c3b4965c35fe953bd881728f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sun, 30 Oct 2005 15:01:40 -0800 Subject: [PATCH] Add kthread_stop_sem() Enhance the kthread API by adding kthread_stop_sem, for use in stopping threads that spend their idle time waiting on a semaphore. Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 12 ++++++++++++ kernel/kthread.c | 13 +++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 3fa786448db3..ebdd41fd1082 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -69,6 +69,18 @@ void kthread_bind(struct task_struct *k, unsigned int cpu); * was never called. */ int kthread_stop(struct task_struct *k); +/** + * kthread_stop_sem: stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * @s: semaphore that @k waits on while idle. + * + * Does essentially the same thing as kthread_stop() above, but wakes + * @k by calling up(@s). + * + * Returns the result of threadfn(), or -EINTR if wake_up_process() + * was never called. 
*/ +int kthread_stop_sem(struct task_struct *k, struct semaphore *s); + /** * kthread_should_stop: should this kthread return now? * diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92da..e75950a1092c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -164,6 +164,12 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) EXPORT_SYMBOL(kthread_bind); int kthread_stop(struct task_struct *k) +{ + return kthread_stop_sem(k, NULL); +} +EXPORT_SYMBOL(kthread_stop); + +int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - wake_up_process(k); + if (s) + up(s); + else + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. */ @@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) return ret; } -EXPORT_SYMBOL(kthread_stop); +EXPORT_SYMBOL(kthread_stop_sem); static __init int helper_init(void) { -- cgit v1.2.3 From 1bb34a412750291e4e5e9f1d0fe7ae1b7e976098 Mon Sep 17 00:00:00 2001 From: john stultz Date: Sun, 30 Oct 2005 15:01:42 -0800 Subject: [PATCH] NTP shift_right cleanup Create a macro shift_right() that avoids the numerous ugly conditionals in the NTP code that look like: if(a < 0) b = -(-a >> shift); else b = a >> shift; Replacing it with: b = shift_right(a, shift); This should have zero effect on the logic, however it should probably have a bit of testing just to be sure. Also replace open-coded min/max with the macros. Signed-off-by : John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timex.h | 7 +++++++ kernel/time.c | 25 ++++++------------------ kernel/timer.c | 53 +++++++++++---------------------------------------- 3 files changed, 24 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index 7e050a2cc35b..04a4a8cb4ed3 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -282,6 +282,13 @@ static inline int ntp_synced(void) return !(time_status & STA_UNSYNC); } +/* Required to safely shift negative values */ +#define shift_right(x, s) ({ \ + __typeof__(x) __x = (x); \ + __typeof__(s) __s = (s); \ + __x < 0 ? -(-__x >> __s) : __x >> __s; \ +}) + #ifdef CONFIG_TIME_INTERPOLATION diff --git a/kernel/time.c b/kernel/time.c index a3c2100470e1..245d595a13cb 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) if (mtemp >= MINSEC) { ltemp = (time_offset / mtemp) << (SHIFT_USEC - SHIFT_UPDATE); - if (ltemp < 0) - time_freq -= -ltemp >> SHIFT_KH; - else - time_freq += ltemp >> SHIFT_KH; + time_freq += shift_right(ltemp, SHIFT_KH); } else /* calibration interval too short (p. 12) */ result = TIME_ERROR; } else { /* PLL mode */ if (mtemp < MAXSEC) { ltemp *= mtemp; - if (ltemp < 0) - time_freq -= -ltemp >> (time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC); - else - time_freq += ltemp >> (time_constant + + time_freq += shift_right(ltemp,(time_constant + time_constant + - SHIFT_KF - SHIFT_USEC); + SHIFT_KF - SHIFT_USEC)); } else /* calibration interval too long (p. 
12) */ result = TIME_ERROR; } - if (time_freq > time_tolerance) - time_freq = time_tolerance; - else if (time_freq < -time_tolerance) - time_freq = -time_tolerance; + time_freq = min(time_freq, time_tolerance); + time_freq = max(time_freq, -time_tolerance); } /* STA_PLL || STA_PPSTIME */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) { @@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else { - if (time_offset < 0) - txc->offset = -(-time_offset >> SHIFT_UPDATE); - else - txc->offset = time_offset >> SHIFT_UPDATE; + txc->offset = shift_right(time_offset, SHIFT_UPDATE); } txc->freq = time_freq + pps_freq; txc->maxerror = time_maxerror; diff --git a/kernel/timer.c b/kernel/timer.c index 6ed1a826e5ce..6b94adb45b03 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -703,23 +703,13 @@ static void second_overflow(void) * the adjustment over not more than the number of * seconds between updates. */ - if (time_offset < 0) { - ltemp = -time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset += ltemp; - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } else { ltemp = time_offset; if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + ltemp = shift_right(ltemp, SHIFT_KG + time_constant); + ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); + ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } /* * Compute the frequency estimate and additional phase @@ -736,39 +726,25 @@ static void second_overflow(void) STA_PPSWANDER | STA_PPSERROR); } ltemp = time_freq + pps_freq; - if (ltemp < 0) - time_adj -= -ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - else - time_adj += ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) */ - if (time_adj < 0) - time_adj -= (-time_adj >> 2) + (-time_adj >> 5); - else - time_adj += (time_adj >> 2) + (time_adj >> 5); + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); #endif #if HZ == 250 /* Compensate for (HZ==250) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif #if HZ == 1000 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif } @@ -787,10 +763,8 @@ static void update_wall_time_one_tick(void) * Limit the amount of the step to be in the range * -tickadj .. 
+tickadj */ - if (time_adjust > tickadj) - time_adjust_step = tickadj; - else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; @@ -801,13 +775,8 @@ static void update_wall_time_one_tick(void) * advance the tick more. */ time_phase += time_adj; - if (time_phase <= -FINENSEC) { - long ltemp = -time_phase >> (SHIFT_SCALE - 10); - time_phase += ltemp << (SHIFT_SCALE - 10); - delta_nsec -= ltemp; - } - else if (time_phase >= FINENSEC) { - long ltemp = time_phase >> (SHIFT_SCALE - 10); + if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { + long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); time_phase -= ltemp << (SHIFT_SCALE - 10); delta_nsec += ltemp; } -- cgit v1.2.3 From a5a0d52c7305cb3629ef0cc9e2e0e106869e1907 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 30 Oct 2005 15:01:42 -0800 Subject: [PATCH] ntp whitespace cleanup Fix bizarre 4-space coding style in the NTP code. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 252 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 126 insertions(+), 126 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6b94adb45b03..cc18857601e2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -632,77 +632,74 @@ long time_next_adjust; */ static void second_overflow(void) { - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if ( time_maxerror > NTP_PHASE_LIMIT ) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at - * the end of the day, the system clock is set back one - * second; if in leap-delete state, the system clock is - * set ahead one second. The microtime() routine or - * external clock driver will insure that reported time - * is always monotonic. The ugly divides should be - * replaced. - */ - switch (time_state) { - - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* The timer interpolator will make time change gradually instead - * of an immediate jump by one second. - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; } - break; - - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* Use of time interpolator for a gradual change of time */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. 
+ */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; } - break; - - case TIME_OOP: - time_state = TIME_WAIT; - break; - - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In - * PLL mode, the offset is reduced by a fixed factor - * times the time constant. In FLL mode the offset is - * used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread - * the adjustment over not more than the number of - * seconds between updates. - */ + + /* + * Compute the phase adjustment for the next second. In PLL mode, the + * offset is reduced by a fixed factor times the time constant. In FLL + * mode the offset is used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread the adjustment + * over not more than the number of seconds between updates. + */ ltemp = time_offset; if (!(time_status & STA_FLL)) ltemp = shift_right(ltemp, SHIFT_KG + time_constant); @@ -711,40 +708,42 @@ static void second_overflow(void) time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - /* - * Compute the frequency estimate and additional phase - * adjustment due to frequency error for the next - * second. When the PPS signal is engaged, gnaw on the - * watchdog counter and update the frequency computed by - * the pll and the PPS signal. - */ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; - time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); + /* + * Compute the frequency estimate and additional phase adjustment due + * to frequency error for the next second. When the PPS signal is + * engaged, gnaw on the watchdog counter and update the frequency + * computed by the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 - /* Compensate for (HZ==100) != (1 << SHIFT_HZ). - * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 
14) - */ - time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); + /* + * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to + * get 128.125; => only 0.125% error (p. 14) + */ + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); #endif #if HZ == 250 - /* Compensate for (HZ==250) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); + /* + * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif #if HZ == 1000 - /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); + /* + * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif } @@ -753,21 +752,20 @@ static void update_wall_time_one_tick(void) { long time_adjust_step, delta_nsec; - if ( (time_adjust_step = time_adjust) != 0 ) { - /* We are doing an adjtime thing. - * - * Prepare time_adjust_step to be within bounds. - * Note that a positive time_adjust means we want the clock - * to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - time_adjust_step = min(time_adjust_step, (long)tickadj); - time_adjust_step = max(time_adjust_step, (long)-tickadj); - - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; + if ((time_adjust_step = time_adjust) != 0 ) { + /* + * We are doing an adjtime thing. Prepare time_adjust_step to + * be within bounds. Note that a positive time_adjust means we + * want the clock to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; } delta_nsec = tick_nsec + time_adjust_step * 1000; /* @@ -1106,8 +1104,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); + "value %lx from %p\n", timeout, + __builtin_return_address(0)); current->state = TASK_RUNNING; goto out; } @@ -1133,15 +1131,15 @@ EXPORT_SYMBOL(schedule_timeout); */ signed long __sched schedule_timeout_interruptible(signed long timeout) { - __set_current_state(TASK_INTERRUPTIBLE); - return schedule_timeout(timeout); + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); signed long __sched schedule_timeout_uninterruptible(signed long timeout) { - __set_current_state(TASK_UNINTERRUPTIBLE); - return schedule_timeout(timeout); + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); @@ -1481,16 +1479,18 @@ static void time_interpolator_update(long delta_nsec) if (!time_interpolator) return; - /* The interpolator compensates for late ticks by accumulating - * the late time in time_interpolator->offset. 
A tick earlier than - * expected will lead to a reset of the offset and a corresponding - * jump of the clock forward. Again this only works if the - * interpolator clock is running slightly slower than the regular clock - * and the tuning logic insures that. - */ + /* + * The interpolator compensates for late ticks by accumulating the late + * time in time_interpolator->offset. A tick earlier than expected will + * lead to a reset of the offset and a corresponding jump of the clock + * forward. Again this only works if the interpolator clock is running + * slightly slower than the regular clock and the tuning logic insures + * that. + */ counter = time_interpolator_get_counter(1); - offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); + offset = time_interpolator->offset + + GET_TI_NSECS(counter, time_interpolator); if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) time_interpolator->offset = offset - delta_nsec; -- cgit v1.2.3 From 89ada67917f516212452443a56b9fd3b65b74dc7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 30 Oct 2005 15:01:59 -0800 Subject: [PATCH] Use alloc_percpu to allocate workqueues locally This patch makes the workqueues use alloc_percpu instead of an array. The workqueues are placed on nodes local to each processor. The workqueue structure can grow to a significant size on a system with lots of processors if this patch is not applied. 64 bit architectures with all debugging features enabled and configured for 512 processors will not be able to boot without this patch. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91bacb13a7e2..7cee222231bc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -12,6 +12,8 @@ * Andrew Morton * Kai Petzke * Theodore Ts'o + * + * Made to use alloc_percpu by Christoph Lameter . */ #include @@ -57,7 +59,7 @@ struct cpu_workqueue_struct { * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct cpu_wq[NR_CPUS]; + struct cpu_workqueue_struct *cpu_wq; const char *name; struct list_head list; /* Empty if single thread */ }; @@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) if (unlikely(is_single_threaded(wq))) cpu = 0; BUG_ON(!list_empty(&work->entry)); - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); ret = 1; } put_cpu(); @@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data) if (unlikely(is_single_threaded(wq))) cpu = 0; - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } int fastcall queue_delayed_work(struct workqueue_struct *wq, @@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) if (is_single_threaded(wq)) { /* Always use cpu 0's area.
*/ - flush_cpu_workqueue(wq->cpu_wq + 0); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0)); } else { int cpu; lock_cpu_hotplug(); for_each_online_cpu(cpu) - flush_cpu_workqueue(wq->cpu_wq + cpu); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); unlock_cpu_hotplug(); } } @@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; spin_lock_init(&cwq->lock); @@ -312,6 +314,7 @@ struct workqueue_struct *__create_workqueue(const char *name, if (!wq) return NULL; + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); @@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) unsigned long flags; struct task_struct *p; - cwq = wq->cpu_wq + cpu; + cwq = per_cpu_ptr(wq->cpu_wq, cpu); spin_lock_irqsave(&cwq->lock, flags); p = cwq->thread; cwq->thread = NULL; @@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); } unlock_cpu_hotplug(); + free_percpu(wq->cpu_wq); kfree(wq); } @@ -458,7 +462,7 @@ int current_is_keventd(void) BUG_ON(!keventd_wq); - cwq = keventd_wq->cpu_wq + cpu; + cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); if (current == cwq->thread) ret = 1; @@ -470,7 +474,7 @@ int current_is_keventd(void) /* Take the work from this (downed) CPU. */ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); LIST_HEAD(list); struct work_struct *work; @@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) printk("Taking work for %s\n", wq->name); work = list_entry(list.next,struct work_struct,entry); list_del(&work->entry); - __queue_work(wq->cpu_wq + smp_processor_id(), work); + __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); } spin_unlock_irq(&cwq->lock); } @@ -508,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: /* Kick off worker threads. */ list_for_each_entry(wq, &workqueues, list) { - kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); - wake_up_process(wq->cpu_wq[hotcpu].thread); + struct cpu_workqueue_struct *cwq; + + cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); + kthread_bind(cwq->thread, hotcpu); + wake_up_process(cwq->thread); } break; case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { /* Unbind so it can run. */ - kthread_bind(wq->cpu_wq[hotcpu].thread, + kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, smp_processor_id()); cleanup_workqueue_thread(wq, hotcpu); } -- cgit v1.2.3 From dfc4f94d2ff95fc92127d3e512c1df7cab274fb8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 30 Oct 2005 15:02:03 -0800 Subject: [PATCH] remove timer debug field Remove timer_list.magic and associated debugging code. I originally added this when a spinlock was added to timer_list - this meant that an all-zeroes timer became illegal and init_timer() was required. That spinlock isn't even there any more, although timer.base must now be initialised. I'll keep this debugging code in -mm. 
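For reference, the retired pattern looked roughly like the standalone sketch below: a poison word set by init_timer(), checked with a rate-limited warning and then quietly repaired on each timer operation. This is a userspace re-creation for illustration, not the removed kernel code:

#include <stdio.h>

#define TIMER_MAGIC 0x4b87ad6e

/* Sketch of the retired debug scheme, not the kernel structure. */
struct timer_sketch {
	unsigned long magic;	/* poison word proving init_timer() ran */
	void (*function)(unsigned long);
};

static void check_timer(struct timer_sketch *timer)
{
	static int whine_count;

	if (timer->magic != TIMER_MAGIC) {
		if (whine_count < 16) {
			whine_count++;
			printf("Uninitialised timer! function=%p\n",
			       (void *)timer->function);
		}
		/* Now fix it up. */
		timer->magic = TIMER_MAGIC;
	}
}

int main(void)
{
	struct timer_sketch t = { 0 };	/* the "all-zeroes timer" case */

	check_timer(&t);	/* warns once, repairs the magic */
	check_timer(&t);	/* silent from here on */
	return 0;
}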
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/vtime.c | 18 ++---------------- include/linux/timer.h | 5 ----- kernel/timer.c | 35 ----------------------------------- 3 files changed, 2 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index fa0726507b3d..22a895ecb7a4 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -24,7 +24,6 @@ #include #include -#define VTIMER_MAGIC (TIMER_MAGIC + 1) static ext_int_info_t ext_int_info_timer; DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer); @@ -277,20 +276,12 @@ static void do_cpu_timer_interrupt(struct pt_regs *regs, __u16 error_code) void init_virt_timer(struct vtimer_list *timer) { - timer->magic = VTIMER_MAGIC; timer->function = NULL; INIT_LIST_HEAD(&timer->entry); spin_lock_init(&timer->lock); } EXPORT_SYMBOL(init_virt_timer); -static inline int check_vtimer(struct vtimer_list *timer) -{ - if (timer->magic != VTIMER_MAGIC) - return -EINVAL; - return 0; -} - static inline int vtimer_pending(struct vtimer_list *timer) { return (!list_empty(&timer->entry)); @@ -346,7 +337,7 @@ static void internal_add_vtimer(struct vtimer_list *timer) static inline int prepare_vtimer(struct vtimer_list *timer) { - if (check_vtimer(timer) || !timer->function) { + if (!timer->function) { printk("add_virt_timer: uninitialized timer\n"); return -EINVAL; } @@ -414,7 +405,7 @@ int mod_virt_timer(struct vtimer_list *timer, __u64 expires) unsigned long flags; int cpu; - if (check_vtimer(timer) || !timer->function) { + if (!timer->function) { printk("mod_virt_timer: uninitialized timer\n"); return -EINVAL; } @@ -481,11 +472,6 @@ int del_virt_timer(struct vtimer_list *timer) unsigned long flags; struct vtimer_queue *vt_list; - if (check_vtimer(timer)) { - printk("del_virt_timer: timer not initialized\n"); - return -EINVAL; - } - /* check if timer is pending */ if (!vtimer_pending(timer)) return 0; diff --git a/include/linux/timer.h b/include/linux/timer.h index ddd5bbe1fc8e..b1dc583bb4d4 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -12,16 +12,12 @@ struct timer_list { struct list_head entry; unsigned long expires; - unsigned long magic; - void (*function)(unsigned long); unsigned long data; struct timer_base_s *base; }; -#define TIMER_MAGIC 0x4b87ad6e - extern struct timer_base_s __init_timer_base; #define TIMER_INITIALIZER(_function, _expires, _data) { \ @@ -29,7 +25,6 @@ extern struct timer_base_s __init_timer_base; .expires = (_expires), \ .data = (_data), \ .base = &__init_timer_base, \ - .magic = TIMER_MAGIC, \ } #define DEFINE_TIMER(_name, _function, _expires, _data) \ diff --git a/kernel/timer.c b/kernel/timer.c index cc18857601e2..562d53eabb46 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -91,30 +91,6 @@ static inline void set_running_timer(tvec_base_t *base, #endif } -static void check_timer_failed(struct timer_list *timer) -{ - static int whine_count; - if (whine_count < 16) { - whine_count++; - printk("Uninitialised timer!\n"); - printk("This is just a warning. 
Your computer is OK\n"); - printk("function=0x%p, data=0x%lx\n", - timer->function, timer->data); - dump_stack(); - } - /* - * Now fix it up - */ - timer->magic = TIMER_MAGIC; -} - -static inline void check_timer(struct timer_list *timer) -{ - if (timer->magic != TIMER_MAGIC) - check_timer_failed(timer); -} - - static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -177,7 +153,6 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; - timer->magic = TIMER_MAGIC; } EXPORT_SYMBOL(init_timer); @@ -230,7 +205,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) int ret = 0; BUG_ON(!timer->function); - check_timer(timer); base = lock_timer_base(timer, &flags); @@ -283,9 +257,6 @@ void add_timer_on(struct timer_list *timer, int cpu) unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); - - check_timer(timer); - spin_lock_irqsave(&base->t_base.lock, flags); timer->base = &base->t_base; internal_add_timer(base, timer); @@ -316,8 +287,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); - check_timer(timer); - /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -348,8 +317,6 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; - check_timer(timer); - if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -412,8 +379,6 @@ out: */ int del_timer_sync(struct timer_list *timer) { - check_timer(timer); - for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) -- cgit v1.2.3 From 19a4fcb531659f2f7d18b5d04cee039176e9540d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:02:17 -0800 Subject: [PATCH] kill sigqueue->lock This lock is used in sigqueue_free(), but it is always equal to current->sighand->siglock, so we don't need to keep it in the struct sigqueue. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 1 - kernel/signal.c | 10 ++++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index 7be18b5e2fb4..5dd5f02c5c5f 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -25,7 +25,6 @@ struct sigqueue { struct list_head list; - spinlock_t *lock; int flags; siginfo_t info; struct user_struct *user; diff --git a/kernel/signal.c b/kernel/signal.c index 6904bbbfe116..8d6e64dfa5c6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->lock = NULL; q->user = get_uid(t->user); } return(q); @@ -1371,11 +1370,12 @@ void sigqueue_free(struct sigqueue *q) * pending queue. 
*/ if (unlikely(!list_empty(&q->list))) { - read_lock(&tasklist_lock); - spin_lock_irqsave(q->lock, flags); + spinlock_t *lock = ¤t->sighand->siglock; + read_lock(&tasklist_lock); + spin_lock_irqsave(lock, flags); if (!list_empty(&q->list)) list_del_init(&q->list); - spin_unlock_irqrestore(q->lock, flags); + spin_unlock_irqrestore(lock, flags); read_unlock(&tasklist_lock); } q->flags &= ~SIGQUEUE_PREALLOC; @@ -1414,7 +1414,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) goto out; } - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->pending.list); sigaddset(&p->pending.signal, sig); if (!sigismember(&p->blocked, sig)) @@ -1462,7 +1461,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) * We always use the shared queue for process-wide signals, * to avoid several races. */ - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->signal->shared_pending.list); sigaddset(&p->signal->shared_pending.signal, sig); -- cgit v1.2.3 From 6dd69f1061bfdeca230509b173438e0731bff767 Mon Sep 17 00:00:00 2001 From: Vadim Lobanov Date: Sun, 30 Oct 2005 15:02:18 -0800 Subject: [PATCH] Unify sys_tkill() and sys_tgkill() The majority of the sys_tkill() and sys_tgkill() function code is duplicated between the two of them. This patch pulls the duplication out into a separate function -- do_tkill() -- and lets sys_tkill() and sys_tgkill() be simple wrappers around it. This should make it easier to maintain in light of future changes. Signed-off-by: Vadim Lobanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 71 ++++++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8d6e64dfa5c6..1d905ec74bde 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2283,26 +2283,13 @@ sys_kill(int pid, int sig) return kill_something_info(sig, &info, pid); } -/** - * sys_tgkill - send signal to one specific thread - * @tgid: the thread group ID of the thread - * @pid: the PID of the thread - * @sig: signal to be sent - * - * This syscall also checks the tgid and returns -ESRCH even if the PID - * exists but it's not belonging to the target process anymore. This - * method solves the problem of threads exiting and PIDs getting reused. - */ -asmlinkage long sys_tgkill(int tgid, int pid, int sig) +static int do_tkill(int tgid, int pid, int sig) { - struct siginfo info; int error; + struct siginfo info; struct task_struct *p; - /* This is only valid for single tasks */ - if (pid <= 0 || tgid <= 0) - return -EINVAL; - + error = -ESRCH; info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; @@ -2311,8 +2298,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) read_lock(&tasklist_lock); p = find_task_by_pid(pid); - error = -ESRCH; - if (p && (p->tgid == tgid)) { + if (p && (tgid <= 0 || p->tgid == tgid)) { error = check_kill_permission(sig, &info, p); /* * The null signal is a permissions and process existence @@ -2326,47 +2312,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) } } read_unlock(&tasklist_lock); + return error; } +/** + * sys_tgkill - send signal to one specific thread + * @tgid: the thread group ID of the thread + * @pid: the PID of the thread + * @sig: signal to be sent + * + * This syscall also checks the tgid and returns -ESRCH even if the PID + * exists but it's not belonging to the target process anymore. 
This + * method solves the problem of threads exiting and PIDs getting reused. + */ +asmlinkage long sys_tgkill(int tgid, int pid, int sig) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + return do_tkill(tgid, pid, sig); +} + /* * Send a signal to only one task, even if it's a CLONE_THREAD task. */ asmlinkage long sys_tkill(int pid, int sig) { - struct siginfo info; - int error; - struct task_struct *p; - /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = current->tgid; - info.si_uid = current->uid; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - error = -ESRCH; - if (p) { - error = check_kill_permission(sig, &info, p); - /* - * The null signal is a permissions and process existence - * probe. No signal is actually delivered. - */ - if (!error && sig && p->sighand) { - spin_lock_irq(&p->sighand->siglock); - handle_stop_signal(sig, p); - error = specific_send_sig_info(sig, &info, p); - spin_unlock_irq(&p->sighand->siglock); - } - } - read_unlock(&tasklist_lock); - return error; + return do_tkill(0, pid, sig); } asmlinkage long -- cgit v1.2.3 From 4eb9af2a8a431a832830f986fead7332dab27229 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:02:21 -0800 Subject: [PATCH] posix-timers: use schedule_timeout() in common_nsleep() common_nsleep() reimplements schedule_timeout_interruptible() for unknown reason. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index dda3cda73c77..ea55c7a1cd75 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1295,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) return error; } -static void nanosleep_wake_up(unsigned long __data) -{ - struct task_struct *p = (struct task_struct *) __data; - - wake_up_process(p); -} - /* * The standard says that an absolute nanosleep call MUST wake up at * the requested time in spite of clock settings. Here is what we do: @@ -1442,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock, int flags, struct timespec *tsave) { struct timespec t, dum; - struct timer_list new_timer; DECLARE_WAITQUEUE(abs_wqueue, current); u64 rq_time = (u64)0; s64 left; @@ -1451,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock, ¤t_thread_info()->restart_block; abs_wqueue.flags = 0; - init_timer(&new_timer); - new_timer.expires = 0; - new_timer.data = (unsigned long) current; - new_timer.function = nanosleep_wake_up; abs = flags & TIMER_ABSTIME; if (restart_block->fn == clock_nanosleep_restart) { @@ -1490,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock, if (left < (s64)0) break; - new_timer.expires = jiffies + left; - __set_current_state(TASK_INTERRUPTIBLE); - add_timer(&new_timer); - - schedule(); + schedule_timeout_interruptible(left); - del_timer_sync(&new_timer); left = rq_time - get_jiffies_64(); } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); -- cgit v1.2.3 From f35f31d7ed0150f9865619f21b5050c91b46c03f Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:27 -0800 Subject: [PATCH] cpuset cleanup Remove one more useless line from cpuset_common_file_read(). 
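The line was useless because simple_read_from_buffer() is handed the buffer length explicitly (s - page) and never relies on NUL termination. A userspace re-creation of that length-bounded copy, where read_from_buffer() is a simplified stand-in for the kernel helper rather than its actual implementation:

#include <stdio.h>
#include <string.h>

/*
 * Length-bounded copy in the style of simple_read_from_buffer():
 * the size of the source comes in as 'available', so the source
 * buffer never needs a terminating NUL.
 */
static long read_from_buffer(char *to, size_t nbytes, long *ppos,
			     const char *from, size_t available)
{
	size_t pos = (size_t)*ppos;
	size_t count = nbytes;

	if (pos >= available)
		return 0;
	if (count > available - pos)
		count = available - pos;
	memcpy(to, from + pos, count);
	*ppos = (long)(pos + count);
	return (long)count;
}

int main(void)
{
	char page[64], out[64];
	char *s = page;
	long pos = 0, n;

	s += sprintf(s, "0-3");		/* e.g. a cpulist */
	*s++ = '\n';
	/* no *s = '\0' needed: s - page bounds the copy */
	n = read_from_buffer(out, sizeof(out), &pos, page, (size_t)(s - page));
	fwrite(out, 1, (size_t)n, stdout);
	return 0;
}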
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 28176d083f7b..b9342f90d28f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -995,7 +995,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, goto out; } *s++ = '\n'; - *s = '\0'; retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); out: -- cgit v1.2.3 From 5aa15b5f27fc2c404530c6c8eabdb8437deb3163 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:28 -0800 Subject: [PATCH] cpusets: remove depth counted locking hack Remove a rather hackish depth counter on cpuset locking. The depth counter was avoiding a possible double trip on the global cpuset_sem semaphore. It worked, but now an improved version of cpuset locking is available, to come in the next patch, using two global semaphores. This patch reverses "cpuset semaphore depth check deadlock fix" The kernel still works, even after this patch, except for some rare and difficult to reproduce race conditions when agressively creating and destroying cpusets marked with the notify_on_release option, on very large systems. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 105 +++++++++++++++++++++----------------------------------- 1 file changed, 40 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b9342f90d28f..cd54dba2be18 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -180,42 +180,6 @@ static struct super_block *cpuset_sb = NULL; */ static DECLARE_MUTEX(cpuset_sem); -static struct task_struct *cpuset_sem_owner; -static int cpuset_sem_depth; - -/* - * The global cpuset semaphore cpuset_sem can be needed by the - * memory allocator to update a tasks mems_allowed (see the calls - * to cpuset_update_current_mems_allowed()) or to walk up the - * cpuset hierarchy to find a mem_exclusive cpuset see the calls - * to cpuset_excl_nodes_overlap()). - * - * But if the memory allocation is being done by cpuset.c code, it - * usually already holds cpuset_sem. Double tripping on a kernel - * semaphore deadlocks the current task, and any other task that - * subsequently tries to obtain the lock. - * - * Run all up's and down's on cpuset_sem through the following - * wrappers, which will detect this nested locking, and avoid - * deadlocking. - */ - -static inline void cpuset_down(struct semaphore *psem) -{ - if (cpuset_sem_owner != current) { - down(psem); - cpuset_sem_owner = current; - } - cpuset_sem_depth++; -} - -static inline void cpuset_up(struct semaphore *psem) -{ - if (--cpuset_sem_depth == 0) { - cpuset_sem_owner = NULL; - up(psem); - } -} /* * A couple of forward declarations required, due to cyclic reference loop: @@ -558,10 +522,19 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Refresh current tasks mems_allowed and mems_generation from * current tasks cpuset. Call with cpuset_sem held. * - * This routine is needed to update the per-task mems_allowed - * data, within the tasks context, when it is trying to allocate - * memory (in various mm/mempolicy.c routines) and notices - * that some other task has been modifying its cpuset. + * Be sure to call refresh_mems() on any cpuset operation which + * (1) holds cpuset_sem, and (2) might possibly alloc memory. 
+ * Call after obtaining cpuset_sem lock, before any possible + * allocation. Otherwise one risks trying to allocate memory + * while the task cpuset_mems_generation is not the same as + * the mems_generation in its cpuset, which would deadlock on + * cpuset_sem in cpuset_update_current_mems_allowed(). + * + * Since we hold cpuset_sem, once refresh_mems() is called, the + * test (current->cpuset_mems_generation != cs->mems_generation) + * in cpuset_update_current_mems_allowed() will remain false, + * until we drop cpuset_sem. Anyone else who would change our + * cpusets mems_generation needs to lock cpuset_sem first. */ static void refresh_mems(void) @@ -867,7 +840,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - cpuset_down(&cpuset_sem); + down(&cpuset_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -901,7 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - cpuset_up(&cpuset_sem); + up(&cpuset_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -941,9 +914,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); mask = cs->cpus_allowed; - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -952,9 +925,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); mask = cs->mems_allowed; - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -1351,7 +1324,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); + refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1376,14 +1350,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. 
*/ - cpuset_up(&cpuset_sem); + up(&cpuset_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); kfree(cs); return err; } @@ -1405,13 +1379,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - cpuset_down(&cpuset_sem); + down(&cpuset_sem); + refresh_mems(); if (atomic_read(&cs->count) > 0) { - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return -EBUSY; } parent = cs->parent; @@ -1427,7 +1402,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1530,10 +1505,10 @@ void cpuset_exit(struct task_struct *tsk) if (notify_on_release(cs)) { char *pathbuf = NULL; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -1554,11 +1529,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return mask; } @@ -1583,9 +1558,9 @@ void cpuset_update_current_mems_allowed(void) if (!cs) return; /* task is exiting */ if (current->cpuset_mems_generation != cs->mems_generation) { - cpuset_down(&cpuset_sem); + down(&cpuset_sem); refresh_mems(); - cpuset_up(&cpuset_sem); + up(&cpuset_sem); } } @@ -1684,14 +1659,14 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return 0; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - cpuset_down(&cpuset_sem); + down(&cpuset_sem); cs = current->cpuset; if (!cs) goto done; /* current task exiting */ cs = nearest_exclusive_ancestor(cs); allowed = node_isset(node, cs->mems_allowed); done: - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return allowed; } @@ -1712,7 +1687,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ int overlap = 0; /* do cpusets overlap? */ - cpuset_down(&cpuset_sem); + down(&cpuset_sem); cs1 = current->cpuset; if (!cs1) goto done; /* current task exiting */ @@ -1723,7 +1698,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) cs2 = nearest_exclusive_ancestor(cs2); overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); done: - cpuset_up(&cpuset_sem); + up(&cpuset_sem); return overlap; } @@ -1746,7 +1721,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - cpuset_down(&cpuset_sem); + down(&cpuset_sem); task_lock(tsk); cs = tsk->cpuset; task_unlock(tsk); @@ -1761,7 +1736,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - cpuset_up(&cpuset_sem); + up(&cpuset_sem); kfree(buf); return retval; } -- cgit v1.2.3 From 053199edf54f685e7dea765b60d4d5e9070dadec Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:30 -0800 Subject: [PATCH] cpusets: dual semaphore locking overhaul Overhaul cpuset locking. 
Replace single semaphore with two semaphores. The suggestion to use two locks was made by Roman Zippel. Both locks are global. Code that wants to modify cpusets must first acquire the exclusive manage_sem, which allows them read-only access to cpusets, and holds off other would-be modifiers. Before making actual changes, the second semaphore, callback_sem must be acquired as well. Code that needs only to query cpusets must acquire callback_sem, which is also a global exclusive lock. The earlier problems with double tripping are avoided, because it is allowed for holders of manage_sem to nest the second callback_sem lock, and only callback_sem is needed by code called from within __alloc_pages(), where the double tripping had been possible. This is not quite the same as a normal read/write semaphore, because obtaining read-only access with intent to change must hold off other such attempts, while allowing read-only access w/o such intention. Changing cpusets involves several related checks and changes, which must be done while allowing read-only queries (to avoid the double trip), but while ensuring nothing changes (holding off other would be modifiers.) This overhaul of cpuset locking also makes careful use of task_lock() to guard access to the task->cpuset pointer, closing a couple of race conditions noticed while reading this code (thanks, Roman). I've never seen these races fail in any use or test. See further the comments in the code. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/cpuset.c | 418 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 282 insertions(+), 138 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c30bc308ef1..b2d2dc14f0b9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1211,7 +1211,7 @@ extern void unhash_process(struct task_struct *p); /* * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also - * pins the final release of task.io_context. + * pins the final release of task.io_context. Also protects ->cpuset. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cd54dba2be18..7491352276b2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -60,6 +60,9 @@ struct cpuset { cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + /* + * Count is atomic so can incr (fork) or decr (exit) without a lock. + */ atomic_t count; /* count tasks using this cpuset */ /* @@ -142,44 +145,91 @@ static struct vfsmount *cpuset_mount; static struct super_block *cpuset_sb = NULL; /* - * cpuset_sem should be held by anyone who is depending on the children - * or sibling lists of any cpuset, or performing non-atomic operations - * on the flags or *_allowed values of a cpuset, such as raising the - * CS_REMOVED flag bit iff it is not already raised, or reading and - * conditionally modifying the *_allowed values. One kernel global - * cpuset semaphore should be sufficient - these things don't change - * that much. 
- * - * The code that modifies cpusets holds cpuset_sem across the entire - * operation, from cpuset_common_file_write() down, single threading - * all cpuset modifications (except for counter manipulations from - * fork and exit) across the system. This presumes that cpuset - * modifications are rare - better kept simple and safe, even if slow. - * - * The code that reads cpusets, such as in cpuset_common_file_read() - * and below, only holds cpuset_sem across small pieces of code, such - * as when reading out possibly multi-word cpumasks and nodemasks, as - * the risks are less, and the desire for performance a little greater. - * The proc_cpuset_show() routine needs to hold cpuset_sem to insure - * that no cs->dentry is NULL, as it walks up the cpuset tree to root. - * - * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't - * (usually) grab cpuset_sem. These are the two most performance - * critical pieces of code here. The exception occurs on exit(), - * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * We have two global cpuset semaphores below. They can nest. + * It is ok to first take manage_sem, then nest callback_sem. We also + * require taking task_lock() when dereferencing a tasks cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both semaphores to modify cpusets. If a task + * holds manage_sem, then it blocks others wanting that semaphore, + * ensuring that it is the only task able to also acquire callback_sem + * and be able to modify cpusets. It can perform various checks on + * the cpuset structure first, knowing nothing will change. It can + * also allocate memory while just holding manage_sem. While it is + * performing these checks, various callback routines can briefly + * acquire callback_sem to query cpusets. Once it is ready to make + * the changes, it takes callback_sem, blocking everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_sem, as that would risk double tripping on callback_sem + * from one of the callbacks into the cpuset code from within + * __alloc_pages(). + * + * If a task is only holding callback_sem, then it has read-only + * access to cpusets. + * + * The task_struct fields mems_allowed and mems_generation may only + * be accessed in the context of that task, so require no locks. + * + * Any task can increment and decrement the count field without lock. + * So in general, code holding manage_sem or callback_sem can't rely + * on the count field not changing. However, if the count goes to + * zero, then only attach_task(), which holds both semaphores, can + * increment it again. Because a count of zero means that no tasks + * are currently attached, therefore there is no way a task attached + * to that cpuset can fork (the other way to increment the count). + * So code holding manage_sem or callback_sem can safely assume that + * if the count is zero, it will stay zero. Similarly, if a task + * holds manage_sem or callback_sem on a cpuset with zero count, it + * knows that the cpuset won't be removed, as cpuset_rmdir() needs + * both of those semaphores. + * + * A possible optimization to improve parallelism would be to make + * callback_sem a R/W semaphore (rwsem), allowing the callback routines + * to proceed in parallel, with read access, until the holder of + * manage_sem needed to take this rwsem for exclusive write access + * and modify some cpusets. 
+ * + * The cpuset_common_file_write handler for operations that modify + * the cpuset hierarchy holds manage_sem across the entire operation, + * single threading all such cpuset modifications across the system. + * + * The cpuset_common_file_read() handlers only hold callback_sem across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + * + * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't + * (usually) take either semaphore. These are the two most performance + * critical pieces of code here. The exception occurs on cpuset_exit(), + * when a task in a notify_on_release cpuset exits. Then manage_sem * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * - * A cpuset can only be deleted if both its 'count' of using tasks is - * zero, and its list of 'children' cpusets is empty. Since all tasks - * in the system use _some_ cpuset, and since there is always at least - * one task in the system (init, pid == 1), therefore, top_cpuset - * always has either children cpusets and/or using tasks. So no need - * for any special hack to ensure that top_cpuset cannot be deleted. + * A cpuset can only be deleted if both its 'count' of using tasks + * is zero, and its list of 'children' cpusets is empty. Since all + * tasks in the system use _some_ cpuset, and since there is always at + * least one task in the system (init, pid == 1), therefore, top_cpuset + * always has either children cpusets and/or using tasks. So we don't + * need a special hack to ensure that top_cpuset cannot be deleted. + * + * The above "Tale of Two Semaphores" would be complete, but for: + * + * The task_lock() exception + * + * The need for this exception arises from the action of attach_task(), + * which overwrites one tasks cpuset pointer with another. It does + * so using both semaphores, however there are several performance + * critical places that need to reference task->cpuset without the + * expense of grabbing a system global semaphore. Therefore except as + * noted below, when dereferencing or, as in attach_task(), modifying + * a tasks cpuset pointer we use task_lock(), which acts on a spinlock + * (task->alloc_lock) already in the task_struct routinely used for + * such matters. */ -static DECLARE_MUTEX(cpuset_sem); +static DECLARE_MUTEX(manage_sem); +static DECLARE_MUTEX(callback_sem); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -354,7 +404,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) } /* - * Call with cpuset_sem held. Writes path of cpuset into buf. + * Call with manage_sem held. Writes path of cpuset into buf. * Returns 0 on success, -errno on error. */ @@ -406,10 +456,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * status of the /sbin/cpuset_release_agent task, so no sense holding * our caller up for that. * - * The simple act of forking that task might require more memory, - * which might need cpuset_sem. So this routine must be called while - * cpuset_sem is not held, to avoid a possible deadlock. See also - * comments for check_for_release(), below. + * When we had only one cpuset semaphore, we had to call this + * without holding it, to avoid deadlock when call_usermodehelper() + * allocated memory. 
With two locks, we could now call this while + * holding manage_sem, but we still don't, so as to minimize + * the time manage_sem is held. */ static void cpuset_release_agent(const char *pathbuf) @@ -441,15 +492,15 @@ static void cpuset_release_agent(const char *pathbuf) * cs is notify_on_release() and now both the user count is zero and * the list of children is empty, prepare cpuset path in a kmalloc'd * buffer, to be returned via ppathbuf, so that the caller can invoke - * cpuset_release_agent() with it later on, once cpuset_sem is dropped. - * Call here with cpuset_sem held. + * cpuset_release_agent() with it later on, once manage_sem is dropped. + * Call here with manage_sem held. * * This check_for_release() routine is responsible for kmalloc'ing * pathbuf. The above cpuset_release_agent() is responsible for * kfree'ing pathbuf. The caller of these routines is responsible * for providing a pathbuf pointer, initialized to NULL, then - * calling check_for_release() with cpuset_sem held and the address - * of the pathbuf pointer, then dropping cpuset_sem, then calling + * calling check_for_release() with manage_sem held and the address + * of the pathbuf pointer, then dropping manage_sem, then calling * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ @@ -480,7 +531,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) * One way or another, we guarantee to return some non-empty subset * of cpu_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) @@ -504,7 +555,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) * One way or another, we guarantee to return some non-empty subset * of node_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) @@ -519,31 +570,44 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) } /* - * Refresh current tasks mems_allowed and mems_generation from - * current tasks cpuset. Call with cpuset_sem held. - * - * Be sure to call refresh_mems() on any cpuset operation which - * (1) holds cpuset_sem, and (2) might possibly alloc memory. - * Call after obtaining cpuset_sem lock, before any possible - * allocation. Otherwise one risks trying to allocate memory - * while the task cpuset_mems_generation is not the same as - * the mems_generation in its cpuset, which would deadlock on - * cpuset_sem in cpuset_update_current_mems_allowed(). - * - * Since we hold cpuset_sem, once refresh_mems() is called, the - * test (current->cpuset_mems_generation != cs->mems_generation) - * in cpuset_update_current_mems_allowed() will remain false, - * until we drop cpuset_sem. Anyone else who would change our - * cpusets mems_generation needs to lock cpuset_sem first. + * Refresh current tasks mems_allowed and mems_generation from current + * tasks cpuset. + * + * Call without callback_sem or task_lock() held. May be called with + * or without manage_sem held. Will acquire task_lock() and might + * acquire callback_sem during call. + * + * The task_lock() is required to dereference current->cpuset safely. 
+ Without it, we could pick up the pointer value of current->cpuset + in one instruction, and then attach_task could give us a different + cpuset, and then the cpuset we had could be removed and freed, + and then on our next instruction, we could dereference a no longer + valid cpuset pointer to get its mems_generation field. + * + * This routine is needed to update the per-task mems_allowed data, + * within the tasks context, when it is trying to allocate memory + * (in various mm/mempolicy.c routines) and notices that some other + * task has been modifying its cpuset. */ static void refresh_mems(void) { - struct cpuset *cs = current->cpuset; + int my_cpusets_mem_gen; + + task_lock(current); + my_cpusets_mem_gen = current->cpuset->mems_generation; + task_unlock(current); - if (current->cpuset_mems_generation != cs->mems_generation) { + if (current->cpuset_mems_generation != my_cpusets_mem_gen) { + struct cpuset *cs; + + down(&callback_sem); + task_lock(current); + cs = current->cpuset; guarantee_online_mems(cs, &current->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; + task_unlock(current); + up(&callback_sem); } } @@ -552,7 +616,7 @@ static void refresh_mems(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. + * are only set if the other's are set. Call holding manage_sem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -570,7 +634,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_sem held. + * manage_sem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -624,7 +688,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * exclusive child cpusets * Build these two partitions by calling partition_sched_domains * - * Call with cpuset_sem held. May nest a call to the + * Call with manage_sem held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ @@ -669,6 +733,10 @@ static void update_cpu_domains(struct cpuset *cur) unlock_cpu_hotplug(); } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -685,12 +753,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + down(&callback_sem); cs->cpus_allowed = trialcs.cpus_allowed; + up(&callback_sem); if (is_cpu_exclusive(cs) && !cpus_unchanged) update_cpu_domains(cs); return 0; } +/* + * Call with manage_sem held. May take callback_sem during call.
+ */ + static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -705,9 +779,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval == 0) { + down(&callback_sem); cs->mems_allowed = trialcs.mems_allowed; atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); } return retval; } @@ -718,6 +794,8 @@ * CS_NOTIFY_ON_RELEASE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 + * + * Call with manage_sem held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -739,16 +817,27 @@ return err; cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); + down(&callback_sem); if (turning_on) set_bit(bit, &cs->flags); else clear_bit(bit, &cs->flags); + up(&callback_sem); if (cpu_exclusive_changed) update_cpu_domains(cs); return 0; } +/* + * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly + * writing the path of the old cpuset in 'ppathbuf' if it needs to be + * notified on release. + * + * Call holding manage_sem. May take callback_sem and task_lock of + * the task 'pid' during call. + */ + static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; @@ -765,7 +854,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); - if (!tsk) { + if (!tsk || tsk->flags & PF_EXITING) { read_unlock(&tasklist_lock); return -ESRCH; } @@ -783,10 +872,13 @@ get_task_struct(tsk); } + down(&callback_sem); + task_lock(tsk); oldcs = tsk->cpuset; if (!oldcs) { task_unlock(tsk); + up(&callback_sem); put_task_struct(tsk); return -ESRCH; } @@ -797,6 +889,7 @@ guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + up(&callback_sem); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); @@ -840,7 +933,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - down(&cpuset_sem); + down(&manage_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -874,7 +967,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -914,9 +1007,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - down(&cpuset_sem); + down(&callback_sem); mask = cs->cpus_allowed; - up(&cpuset_sem); + up(&callback_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -925,9 +1018,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - down(&cpuset_sem); + down(&callback_sem); mask = cs->mems_allowed; - up(&cpuset_sem); + up(&callback_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -1135,7 +1228,9 @@ struct ctr_struct { /* * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. - * Return actual number of pids loaded.
No need to task_lock(p) + * when reading out p->cpuset, as we don't really care if it changes + * on the next cycle, and we are not going to try to dereference it. */ static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) { @@ -1177,6 +1272,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) return cnt; } +/* + * Handle an open on 'tasks' file. Prepare a buffer listing the + * process id's of tasks currently attached to the cpuset being opened. + * + * Does not require any specific cpuset semaphores, and does not take any. + */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { struct cpuset *cs = __d_cs(file->f_dentry->d_parent); @@ -1324,7 +1425,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - down(&cpuset_sem); + down(&manage_sem); refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) @@ -1339,25 +1440,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) cs->parent = parent; + down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + up(&callback_sem); err = cpuset_create_dir(cs, name, mode); if (err < 0) goto err; /* - * Release cpuset_sem before cpuset_populate_dir() because it + * Release manage_sem before cpuset_populate_dir() because it * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. */ - up(&cpuset_sem); + up(&manage_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - up(&cpuset_sem); + up(&manage_sem); kfree(cs); return err; } @@ -1379,30 +1482,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - down(&cpuset_sem); + down(&manage_sem); refresh_mems(); if (atomic_read(&cs->count) > 0) { - up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } parent = cs->parent; + down(&callback_sem); set_bit(CS_REMOVED, &cs->flags); if (is_cpu_exclusive(cs)) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ - if (list_empty(&parent->children)) - check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - up(&cpuset_sem); + up(&callback_sem); + if (list_empty(&parent->children)) + check_for_release(parent, &pathbuf); + up(&manage_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1462,16 +1567,26 @@ void __init cpuset_init_smp(void) * cpuset_fork - attach newly forked task to its parents cpuset. * @tsk: pointer to task_struct of forking parent process. * - * Description: By default, on fork, a task inherits its - * parent's cpuset. The pointer to the shared cpuset is - * automatically copied in fork.c by dup_task_struct(). - * This cpuset_fork() routine need only increment the usage - * counter in that cpuset. + * Description: A task inherits its parent's cpuset at fork(). + * + * A pointer to the shared cpuset was automatically copied in fork.c + * by dup_task_struct(). However, we ignore that copy, since it was + * not made under the protection of task_lock(), so might no longer be + * a valid cpuset pointer. attach_task() might have already changed + * current->cpuset, allowing the previously referenced cpuset to + * be removed and freed. 
Instead, we task_lock(current) and copy + its present value of current->cpuset for our freshly forked child. + * + * At the point that cpuset_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. **/ -void cpuset_fork(struct task_struct *tsk) +void cpuset_fork(struct task_struct *child) { - atomic_inc(&tsk->cpuset->count); + task_lock(current); + child->cpuset = current->cpuset; + atomic_inc(&child->cpuset->count); + task_unlock(current); } /** @@ -1480,35 +1595,42 @@ * * Description: Detach cpuset from @tsk and release it. * - * Note that cpusets marked notify_on_release force every task - * in them to take the global cpuset_sem semaphore when exiting. - * This could impact scaling on very large systems. Be reluctant - * to use notify_on_release cpusets where very high task exit - * scaling is required on large systems. - * - * Don't even think about derefencing 'cs' after the cpuset use - * count goes to zero, except inside a critical section guarded - * by the cpuset_sem semaphore. If you don't hold cpuset_sem, - * then a zero cpuset use count is a license to any other task to - * nuke the cpuset immediately. + * Note that cpusets marked notify_on_release force every task in + * them to take the global manage_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant to + * use notify_on_release cpusets where very high task exit scaling + * is required on large systems. + * + * Don't even think about dereferencing 'cs' after the cpuset use count + * goes to zero, except inside a critical section guarded by manage_sem + * or callback_sem. Otherwise a zero cpuset use count is a license to + * any other task to nuke the cpuset immediately, via cpuset_rmdir(). + * + * This routine has to take manage_sem, not callback_sem, because + * it is holding that semaphore while calling check_for_release(), + * which calls kmalloc(), so can't be called holding callback_sem. + * + * We don't need to task_lock() this reference to tsk->cpuset, + * because tsk is already marked PF_EXITING, so attach_task() won't + * mess with it. **/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - task_lock(tsk); + BUG_ON(!(tsk->flags & PF_EXITING)); + cs = tsk->cpuset; tsk->cpuset = NULL; - task_unlock(tsk); if (notify_on_release(cs)) { char *pathbuf = NULL; - down(&cpuset_sem); + down(&manage_sem); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - up(&cpuset_sem); + up(&manage_sem); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -1529,11 +1651,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - down(&cpuset_sem); + down(&callback_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - up(&cpuset_sem); + up(&callback_sem); return mask; } @@ -1549,19 +1671,28 @@ void cpuset_init_current_mems_allowed(void) * If the current tasks cpusets mems_allowed changed behind our backs, * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). + * + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Unless exiting, it will acquire + * task_lock(). Also might acquire callback_sem during call to + * refresh_mems().
*/ void cpuset_update_current_mems_allowed(void) { - struct cpuset *cs = current->cpuset; + struct cpuset *cs; + int need_to_refresh = 0; + task_lock(current); + cs = current->cpuset; if (!cs) - return; /* task is exiting */ - if (current->cpuset_mems_generation != cs->mems_generation) { - down(&cpuset_sem); + goto done; + if (current->cpuset_mems_generation != cs->mems_generation) + need_to_refresh = 1; +done: + task_unlock(current); + if (need_to_refresh) refresh_mems(); - up(&cpuset_sem); - } } /** @@ -1595,7 +1726,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) /* * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive - * ancestor to the specified cpuset. Call while holding cpuset_sem. + * ancestor to the specified cpuset. Call holding callback_sem. * If no ancestor is mem_exclusive (an unusual configuration), then * returns the root cpuset. */ @@ -1622,12 +1753,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() + * Scanning up parent cpusets requires callback_sem. The __alloc_pages() * routine only calls here with __GFP_HARDWALL bit _not_ set if * it's a GFP_KERNEL allocation, and all nodes in the current tasks * mems_allowed came up empty on the first pass over the zonelist. * So only GFP_KERNEL allocations, if all nodes in the cpuset are - * short of memory, might require taking the cpuset_sem semaphore. + * short of memory, might require taking the callback_sem semaphore. * * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing @@ -1659,14 +1790,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return 0; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - down(&cpuset_sem); - cs = current->cpuset; - if (!cs) - goto done; /* current task exiting */ - cs = nearest_exclusive_ancestor(cs); + down(&callback_sem); + + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return 1; + task_lock(current); + cs = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + allowed = node_isset(node, cs->mems_allowed); -done: - up(&cpuset_sem); + up(&callback_sem); return allowed; } @@ -1679,7 +1812,7 @@ done: * determine if task @p's memory usage might impact the memory * available to the current task. * - * Acquires cpuset_sem - not suitable for calling from a fast path. + * Acquires callback_sem - not suitable for calling from a fast path. **/ int cpuset_excl_nodes_overlap(const struct task_struct *p) @@ -1687,18 +1820,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ int overlap = 0; /* do cpusets overlap? 
*/ - down(&cpuset_sem); - cs1 = current->cpuset; - if (!cs1) - goto done; /* current task exiting */ - cs2 = p->cpuset; - if (!cs2) - goto done; /* task p is exiting */ - cs1 = nearest_exclusive_ancestor(cs1); - cs2 = nearest_exclusive_ancestor(cs2); + down(&callback_sem); + + task_lock(current); + if (current->flags & PF_EXITING) { + task_unlock(current); + goto done; + } + cs1 = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + + task_lock((struct task_struct *)p); + if (p->flags & PF_EXITING) { + task_unlock((struct task_struct *)p); + goto done; + } + cs2 = nearest_exclusive_ancestor(p->cpuset); + task_unlock((struct task_struct *)p); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); done: - up(&cpuset_sem); + up(&callback_sem); return overlap; } @@ -1707,6 +1849,10 @@ done: * proc_cpuset_show() * - Print tasks cpuset path into seq_file. * - Used for /proc/<pid>/cpuset. + * - No need to task_lock(tsk) on this tsk->cpuset reference, as it + * doesn't really matter if tsk->cpuset changes after we read it, + * and we take manage_sem, keeping attach_task() from changing it + * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *v) @@ -1721,10 +1867,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - down(&cpuset_sem); - task_lock(tsk); + down(&manage_sem); cs = tsk->cpuset; - task_unlock(tsk); if (!cs) { retval = -EINVAL; goto out; @@ -1736,7 +1880,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - up(&cpuset_sem); + up(&manage_sem); kfree(buf); return retval; } -- cgit v1.2.3 From 18a19cb3047e454ee5ecbc35d7acf3f8e09e0466 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:31 -0800 Subject: [PATCH] cpusets: simple rename Add support for renaming cpusets. Only allow simple rename of cpuset directories in place. Don't allow moving cpusets elsewhere in hierarchy or renaming the special cpuset files in each cpuset directory. The usefulness of this simple rename became apparent when developing task migration facilities. It allows building a second cpuset hierarchy using new names and containing new CPUs and Memory Nodes, moving tasks from the old to the new cpusets, removing the old cpusets, and then renaming the new cpusets to be just like the old names, so that any knowledge that the tasks had of their cpuset names will still be valid. Leaf node cpusets can be migrated to other CPUs or Memory Nodes by just updating their 'cpus' and 'mems' files, but because no cpuset can contain CPUs or Nodes not in its parent cpuset, one cannot do this in a cpuset hierarchy without first expanding all the non-leaf cpusets to contain the union of both the old and new CPUs and Nodes, which would obfuscate the one-to-one migration of a task from one cpuset to another required to correctly migrate the physical page frames currently allocated to that task. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7491352276b2..6633f3fb6417 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1113,6 +1113,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) return 0; } +/* + * cpuset_rename - Only allow simple rename of directories in place.
*/ +static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) + return -ENOTDIR; + if (new_dentry->d_inode) + return -EEXIST; + if (old_dir != new_dir) + return -EIO; + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, @@ -1125,6 +1140,7 @@ static struct inode_operations cpuset_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cpuset_mkdir, .rmdir = cpuset_rmdir, + .rename = cpuset_rename, }; static int cpuset_create_file(struct dentry *dentry, int mode) -- cgit v1.2.3 From 68860ec10bcc07ab4f89f9d940e3b77ae5ca13b3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:02:36 -0800 Subject: [PATCH] cpusets: automatic numa mempolicy rebinding This patch automatically updates a tasks NUMA mempolicy when its cpuset memory placement changes. It does so within the context of the task, without any need to support low level external mempolicy manipulation. If a system is not using cpusets, or if running on a system with just the root (all-encompassing) cpuset, then this remap is a no-op. Only when a task is moved between cpusets, or a cpusets memory placement is changed does the following apply. Otherwise, the main routine below, rebind_policy() is not even called. When mixing cpusets, scheduler affinity, and NUMA mempolicies, the essential role of cpusets is to place jobs (several related tasks) on a set of CPUs and Memory Nodes, the essential role of sched_setaffinity is to manage a jobs processor placement within its allowed cpuset, and the essential role of NUMA mempolicy (mbind, set_mempolicy) is to manage a jobs memory placement within its allowed cpuset. However, CPU affinity and NUMA memory placement are managed within the kernel using absolute system wide numbering, not cpuset relative numbering. This is ok until a job is migrated to a different cpuset, or what's the same, a jobs cpuset is moved to different CPUs and Memory Nodes. Then the CPU affinity and NUMA memory placement of the tasks in the job need to be updated, to preserve their cpuset-relative position. This can be done for CPU affinity using sched_setaffinity() from user code, as one task can modify anothers CPU affinity. This cannot be done from an external task for NUMA memory placement, as that can only be modified in the context of the task using it. However, it is easy enough to remap a tasks NUMA mempolicy automatically when a task is migrated, using the existing cpuset mechanism to trigger a refresh of a tasks memory placement after its cpuset has changed. All that is needed is the old and new nodemask, and notice to the task that it needs to rebind its mempolicy. The tasks mems_allowed has the old mask, the tasks cpuset has the new mask, and the existing cpuset_update_current_mems_allowed() mechanism provides the notice. The bitmap/cpumask/nodemask remap operators provide the cpuset relative calculations. This patch leaves open a couple of issues: 1) Updating vma and shmfs/tmpfs/hugetlbfs memory policies: These mempolicies may reference nodes outside of those allowed to the current task by its cpuset. Tasks are migrated as part of jobs, which reside on what might be several cpusets in a subtree.
When such a job is migrated, all NUMA memory policy references to nodes within that cpuset subtree should be translated, and references to any nodes outside that subtree should be left untouched. A future patch will provide the cpuset mechanism needed to mark such subtrees. With that patch, we will be able to correctly migrate these other memory policies across a job migration. 2) Updating cpuset, affinity and memory policies in user space: This is harder. Any placement state stored in user space using system-wide numbering will be invalidated across a migration. More work will be required to provide user code with a migration-safe means to manage its cpuset relative placement, while preserving the current API's that pass system wide numbers, not cpuset relative numbers across the kernel-user boundary. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 6 +++++ kernel/cpuset.c | 4 +++ mm/mempolicy.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 7af8cb836e78..8b67cf837ca9 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -154,6 +154,7 @@ struct mempolicy *get_vma_policy(struct task_struct *task, extern void numa_default_policy(void); extern void numa_policy_init(void); +extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); extern struct mempolicy default_policy; #else @@ -226,6 +227,11 @@ static inline void numa_default_policy(void) { } +static inline void numa_policy_rebind(const nodemask_t *old, + const nodemask_t *new) +{ +} + #endif /* CONFIG_NUMA */ #endif /* __KERNEL__ */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6633f3fb6417..5a737ed9dac7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -600,6 +601,7 @@ static void refresh_mems(void) if (current->cpuset_mems_generation != my_cpusets_mem_gen) { struct cpuset *cs; + nodemask_t oldmem = current->mems_allowed; down(&callback_sem); task_lock(current); @@ -608,6 +610,8 @@ current->cpuset_mems_generation = cs->mems_generation; task_unlock(current); up(&callback_sem); + if (!nodes_equal(oldmem, current->mems_allowed)) + numa_policy_rebind(&oldmem, &current->mems_allowed); } } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2076b1542b8a..5abc57c2b8bd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -457,6 +457,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; + cpuset_update_current_mems_allowed(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; if (flags & MPOL_F_ADDR) { @@ -1206,3 +1207,66 @@ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, NULL); } + +/* Migrate a policy to a different set of nodes */ +static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, + const nodemask_t *new) +{ + nodemask_t tmp; + + if (!pol) + return; + + switch (pol->policy) { + case MPOL_DEFAULT: + break; + case MPOL_INTERLEAVE: + nodes_remap(tmp, pol->v.nodes, *old, *new); + pol->v.nodes = tmp; + current->il_next = node_remap(current->il_next, *old, *new); + break; + case MPOL_PREFERRED: + pol->v.preferred_node = node_remap(pol->v.preferred_node, + *old, *new); + break; + case MPOL_BIND: { + nodemask_t nodes; + struct zone **z; + struct zonelist
*zonelist; + + nodes_clear(nodes); + for (z = pol->v.zonelist->zones; *z; z++) + node_set((*z)->zone_pgdat->node_id, nodes); + nodes_remap(tmp, nodes, *old, *new); + nodes = tmp; + + zonelist = bind_zonelist(&nodes); + + /* If no mem, then zonelist is NULL and we keep old zonelist. + * If that old zonelist has no remaining mems_allowed nodes, + * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. + */ + + if (zonelist) { + /* Good - got mem - substitute new zonelist */ + kfree(pol->v.zonelist); + pol->v.zonelist = zonelist; + } + break; + } + default: + BUG(); + break; + } +} + +/* + * Someone moved this task to different nodes. Fixup mempolicies. + * + * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, + * once we have a cpuset mechanism to mark which cpuset subtree is migrating. + */ +void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) +{ + rebind_policy(current->mempolicy, old, new); +} -- cgit v1.2.3 From 30e0fca6c1d7d26f3f2daa4dd2b12c51dadc778a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sun, 30 Oct 2005 15:02:38 -0800 Subject: [PATCH] ptrace/coredump/exit_group deadlock I could seldom reproduce a deadlock with a task not killable in T state (TASK_STOPPED, not TASK_TRACED) by attaching a NPTL threaded program to gdb, by segfaulting the task and triggering a core dump while some other task is executing exit_group and while one task is in ptrace_attached TASK_STOPPED state (not TASK_TRACED yet). This originated from a gdb bugreport (the fact gdb was segfaulting the task wasn't a kernel bug), but I just incidentally noticed the gdb bug triggered a real kernel bug as well. Most threads hangs in exit_mm because the core_dumping is still going, the core dumping hangs because the stopped task doesn't exit, the stopped task can't wakeup because it has SIGNAL_GROUP_EXIT set, hence the deadlock. To me it seems that the problem is that the force_sig_specific(SIGKILL) in zap_threads is a noop if the task has PF_PTRACED set (like in this case because gdb is attached). The __ptrace_unlink does nothing because the signal->flags is set to SIGNAL_GROUP_EXIT|SIGNAL_STOP_DEQUEUED (verified). The above info also shows that the stopped task hit a race and got the stop signal (presumably by the ptrace_attach, only the attach, state is still TASK_STOPPED and gdb hangs waiting the core before it can set it to TASK_TRACED) after one of the thread invoked the core dump (it's the core dump that sets signal->flags to SIGNAL_GROUP_EXIT). So beside the fact nobody would wakeup the task in __ptrace_unlink (the state is _not_ TASK_TRACED), there's a secondary problem in the signal handling code, where a task should ignore the ptrace-sigstops as long as SIGNAL_GROUP_EXIT is set (or the wakeup in __ptrace_unlink path wouldn't be enough). So I attempted to make this patch that seems to fix the problem. There were various ways to fix it, perhaps you prefer a different one, I just opted to the one that looked safer to me. I also removed the clearing of the stopped bits from the zap_other_threads (zap_other_threads was safe unlike zap_threads). I don't like useless code, this whole NPTL signal/ptrace thing is already unreadable enough and full of corner cases without confusing useless code into it to make it even less readable. And if this code is really needed, then you may want to explain why it's not being done in the other paths that sets SIGNAL_GROUP_EXIT at least. Even after this patch I still wonder who serializes the read of p->ptrace in zap_threads. 
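Boiled down, the fix queues the SIGKILL by hand when a detached child's thread group is already exiting; the full change is in the kernel/ptrace.c and kernel/signal.c hunks below, but the core check (quoted from the ptrace_untrace() hunk, which runs with child->sighand->siglock held) is:

	if (child->signal->flags & SIGNAL_GROUP_EXIT) {
		sigaddset(&child->pending.signal, SIGKILL);
		signal_wake_up(child, 1);
	}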
Patch is called ptrace-core_dump-exit_group-deadlock-1. This was the trace I've got:

test T ffff81003e8118c0 0 14305 1 14311 14309 (NOTLB)
ffff810058ccdde8 0000000000000082 000001f4000037e1 ffff810000000013
00000000000000f8 ffff81003e811b00 ffff81003e8118c0 ffff810011362100
0000000000000012 ffff810017ca4180
Call Trace: {try_to_wake_up+893} {finish_stop+87} {get_signal_to_deliver+1359} {do_signal+157} {ptrace_check_attach+222} {sys_ptrace+2293} {default_wake_function+0} {sys_ioctl+73} {sysret_signal+28} {ptregscall_common+103}

test D ffff810011362100 0 14309 1 14305 14312 (NOTLB)
ffff810053c81cf8 0000000000000082 0000000000000286 0000000000000001
0000000000000195 ffff810011362340 ffff810011362100 ffff81002e338040
ffff810001e0ca80 0000000000000001
Call Trace: {try_to_wake_up+893} {wait_for_completion+173} {default_wake_function+0} {exit_mm+149} {do_exit+479} {do_group_exit+252} {get_signal_to_deliver+1451} {do_signal+157} {ptrace_check_attach+222} {specific_send_sig_info+208} {force_sig_info+186} {do_int3+112} {retint_signal+61}

test D ffff81002e338040 0 14311 1 14716 14305 (NOTLB)
ffff81005ca8dcf8 0000000000000082 0000000000000286 0000000000000001
0000000000000120 ffff81002e338280 ffff81002e338040 ffff8100481cb740
ffff810001e0ca80 0000000000000001
Call Trace: {try_to_wake_up+893} {wait_for_completion+173} {default_wake_function+0} {exit_mm+149} {do_exit+479} {__dequeue_signal+558} {do_group_exit+252} {get_signal_to_deliver+1451} {do_signal+157} {ptrace_check_attach+222} {specific_send_sig_info+208} {force_sig_info+186} {do_int3+112} {retint_signal+61}

test D ffff810017ca4180 0 14312 1 14309 13882 (NOTLB)
ffff81005d15fcb8 0000000000000082 ffff81005d15fc58 ffffffff80130816
0000000000000897 ffff810017ca43c0 ffff810017ca4180 ffff81003e8118c0
0000000000000082 ffffffff801317ed
Call Trace: {activate_task+150} {try_to_wake_up+893} {wait_for_completion+173} {default_wake_function+0} {do_coredump+819} {thread_return+82} {get_signal_to_deliver+1444} {do_signal+157} {ptrace_check_attach+222} {specific_send_sig_info+208} {_spin_unlock_irqrestore+5} {force_sig_info+186} {do_general_protection+159} {retint_signal+61}

Signed-off-by: Andrea Arcangeli Cc: Roland McGrath Cc: Ingo Molnar Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 7 +++++-- kernel/signal.c | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 019e04ec065a..863eee8bff47 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) signal_wake_up(child, 1); } } + if (child->signal->flags & SIGNAL_GROUP_EXIT) { + sigaddset(&child->pending.signal, SIGKILL); + signal_wake_up(child, 1); + } spin_unlock(&child->sighand->siglock); } @@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child) SET_LINKS(child); } - if (child->state == TASK_TRACED) - ptrace_untrace(child); + ptrace_untrace(child); } /* diff --git a/kernel/signal.c b/kernel/signal.c index 1d905ec74bde..9d1512dcf176 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1108,8 +1108,8 @@ void zap_other_threads(struct task_struct *p) if (t != p->group_leader) t->exit_signal = -1; + /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); signal_wake_up(t, 1); } } @@ -1879,9 +1879,9 @@ relock: /* Let the debugger run. */ ptrace_stop(signr, signr, info); - /* We're back. Did the debugger cancel the sig? */ + /* We're back. Did the debugger cancel the sig or group_exit? */ signr = current->exit_code; - if (signr == 0) + if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) continue; current->exit_code = 0; -- cgit v1.2.3 From 20e1129ab831486c811d50d3905343ad48c4275f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 30 Oct 2005 15:02:44 -0800 Subject: [PATCH] Keys: Get rid of warning in kmod.c if keys disabled The attached patch gets rid of a "statement without effect" warning when CONFIG_KEYS is disabled by making use of the return value of key_get(). The compiler will optimise all of this away when keys are disabled. Signed-Off-By: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8af..51a892063aaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -131,14 +131,14 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - struct key *old_session; + struct key *new_session, *old_session; int retval; /* Unblock all signals and set the session keyring. */ - key_get(sub_info->ring); + new_session = key_get(sub_info->ring); flush_signals(current); spin_lock_irq(&current->sighand->siglock); - old_session = __install_session_keyring(current, sub_info->ring); + old_session = __install_session_keyring(current, new_session); flush_signal_handlers(current, 1); sigemptyset(&current->blocked); recalc_sigpending(); -- cgit v1.2.3 From 40dc565122ed1e180a0637f88cdfca734d33db78 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 30 Oct 2005 15:02:46 -0800 Subject: [PATCH] cleanup for kernel/printk.c - Removes some trailing whitespace - Breaks long lines and make other small changes to conform to CodingStyle - Add explicit printk loglevels in two places. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 78 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 4b8f0f9230a4..3cb9708209bc 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -10,7 +10,7 @@ * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn.
- * Fixed SMP synchronization, 08/08/99, Manfred Spraul + * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfreds@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton @@ -148,7 +148,7 @@ static int __init console_setup(char *str) if (!strcmp(str, "ttyb")) strcpy(name, "ttyS1"); #endif - for(s = name; *s; s++) + for (s = name; *s; s++) if ((*s >= '0' && *s <= '9') || *s == ',') break; idx = simple_strtoul(s, NULL, 10); @@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) size = roundup_pow_of_two(size); if (size > log_buf_len) { unsigned long start, dest_idx, offset; - char * new_log_buf; + char *new_log_buf; new_log_buf = alloc_bootmem(size); if (!new_log_buf) { - printk("log_buf_len: allocation failed\n"); + printk(KERN_WARNING "log_buf_len: allocation failed\n"); goto out; } @@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) log_end -= offset; spin_unlock_irqrestore(&logbuf_lock, flags); - printk("log_buf_len: %d\n", log_buf_len); + printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); } out: - return 1; } @@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user * buf, int len) +int do_syslog(int type, char __user *buf, int len) { unsigned long i, j, limit, count; int do_clear = 0; @@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len) error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, (log_start - log_end)); + error = wait_event_interruptible(log_wait, + (log_start - log_end)); if (error) goto out; i = 0; @@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) error = i; break; case 4: /* Read/clear last kernel messages */ - do_clear = 1; + do_clear = 1; /* FALL THRU */ case 3: /* Read last kernel messages */ error = -EINVAL; @@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) limit = log_end; /* * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages + * printk() could overwrite the messages * we try to copy to user space. Therefore * the messages are copied in reverse. */ - for(i = 0; i < count && !error; i++) { + for (i = 0; i < count && !error; i++) { j = limit-1-i; if (j + log_buf_len < log_end) break; @@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) if (error) break; error = i; - if(i != count) { + if (i != count) { int offset = count-error; /* buffer overflow during copy, correct user buffer. */ - for(i=0;i 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') - { + if (msg_level < 0 && ((end - cur_index) > 2) && + LOG_BUF(cur_index + 0) == '<' && + LOG_BUF(cur_index + 1) >= '0' && + LOG_BUF(cur_index + 1) <= '7' && + LOG_BUF(cur_index + 2) == '>') { msg_level = LOG_BUF(cur_index + 1) - '0'; cur_index += 3; start_print = cur_index; } while (cur_index != end) { char c = LOG_BUF(cur_index); - cur_index++; + cur_index++; if (c == '\n') { if (msg_level < 0) { /* @@ -461,7 +459,7 @@ static void zap_locks(void) static unsigned long oops_timestamp; if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30*HZ)) + !time_after(jiffies, oops_timestamp + 30 * HZ)) return; oops_timestamp = jiffies; @@ -495,7 +493,7 @@ __attribute__((weak)) unsigned long long printk_clock(void) /* * This is printk. 
It can be called from any context. We want it to work. - * + * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output * into the log buffer and return. The current holder of the console_sem will @@ -639,13 +637,19 @@ EXPORT_SYMBOL(vprintk); #else -asmlinkage long sys_syslog(int type, char __user * buf, int len) +asmlinkage long sys_syslog(int type, char __user *buf, int len) { return 0; } -int do_syslog(int type, char __user * buf, int len) { return 0; } -static void call_console_drivers(unsigned long start, unsigned long end) {} +int do_syslog(int type, char __user *buf, int len) +{ + return 0; +} + +static void call_console_drivers(unsigned long start, unsigned long end) +{ +} #endif @@ -851,9 +855,9 @@ EXPORT_SYMBOL(console_start); * print any messages that were printed by the kernel before the * console driver was initialized. */ -void register_console(struct console * console) +void register_console(struct console *console) { - int i; + int i; unsigned long flags; if (preferred_console < 0) @@ -878,7 +882,8 @@ void register_console(struct console * console) * See if this console matches one we selected on * the command line. */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; + i++) { if (strcmp(console_cmdline[i].name, console->name) != 0) continue; if (console->index >= 0 && @@ -933,9 +938,9 @@ void register_console(struct console * console) } EXPORT_SYMBOL(register_console); -int unregister_console(struct console * console) +int unregister_console(struct console *console) { - struct console *a,*b; + struct console *a, *b; int res = 1; acquire_console_sem(); @@ -949,10 +954,10 @@ int unregister_console(struct console * console) b->next = a->next; res = 0; break; - } + } } } - + /* If last console is removed, we re-enable picking the first * one that gets registered. Without that, pmac early boot console * would prevent fbcon from taking over. @@ -994,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned long toks = 10*5*HZ; + static unsigned long toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; unsigned long flags; @@ -1007,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) toks = ratelimit_burst * ratelimit_jiffies; if (toks >= ratelimit_jiffies) { int lost = missed; + missed = 0; toks -= ratelimit_jiffies; spin_unlock_irqrestore(&ratelimit_lock, flags); @@ -1021,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) EXPORT_SYMBOL(__printk_ratelimit); /* minimum time in jiffies between messages */ -int printk_ratelimit_jiffies = 5*HZ; +int printk_ratelimit_jiffies = 5 * HZ; /* number of messages we send before ratelimiting */ int printk_ratelimit_burst = 10; -- cgit v1.2.3 From 7407251a0e2ed099e4b12b742b635503e981507c Mon Sep 17 00:00:00 2001 From: Coywolf Qi Hunt Date: Sun, 30 Oct 2005 15:02:47 -0800 Subject: [PATCH] PF_DEAD cleanup The PF_DEAD setting doesn't belong to exit_notify(), move it to a proper place. 
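Condensed from the two kernel/exit.c hunks below, the tail of do_exit() after the move reads:

	/* PF_DEAD causes final put_task_struct after we schedule. */
	preempt_disable();
	BUG_ON(tsk->flags & PF_DEAD);
	tsk->flags |= PF_DEAD;

	schedule();
	BUG();
	/* Avoid "noreturn function does return". */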
Signed-off-by: Coywolf Qi Hunt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 79f52b85d6ed..6ef8f7356a74 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -783,10 +783,6 @@ static void exit_notify(struct task_struct *tsk) /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. */ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -873,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif - BUG_ON(!(current->flags & PF_DEAD)); + /* PF_DEAD causes final put_task_struct after we schedule. */ + preempt_disable(); + BUG_ON(tsk->flags & PF_DEAD); + tsk->flags |= PF_DEAD; + schedule(); BUG(); /* Avoid "noreturn function does return". */ -- cgit v1.2.3 From 7f2a52555998c699a7e89f24636c909d6fc08a60 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 30 Oct 2005 15:02:50 -0800 Subject: [PATCH] wait4 PTRACE_ATTACH race fix Back about a year ago when I last fiddled heavily with the do_wait code, I was thinking too hard about the wrong thing and I now think I introduced a bug whose inverse I thought I was fixing. Apparently no one was looking too hard over my shoulder, so as to cite my bogus reasoning at the time. In the race condition when PTRACE_ATTACH is about to steal a child and then the child hits a tracing event (what my_ptrace_child checks for), the real parent does need to set its flag noting it has some eligible live children. Otherwise a spurious ECHILD error is possible, since the child in question is not yet on the ptrace_children list. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6ef8f7356a74..2d39ccc367e6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1383,6 +1383,15 @@ repeat: switch (p->state) { case TASK_TRACED: + /* + * When we hit the race with PTRACE_ATTACH, + * we will not report this child. But the + * race means it has not yet been moved to + * our ptrace_children list, so we need to + * set the flag here to avoid a spurious ECHILD + * when the race happens with the only child. + */ + flag = 1; if (!my_ptrace_child(p)) continue; /*FALLTHROUGH*/ -- cgit v1.2.3 From ecea8d19c9f0ebd62ddaa07fc919ff4e4b820d99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 30 Oct 2005 15:03:00 -0800 Subject: [PATCH] jiffies_64 cleanup Define jiffies_64 in kernel/timer.c rather than having 24 duplicated defines in each architecture.
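The kernel/timer.c side of the change is not shown in this excerpt; judging from the per-architecture copies removed below, the single shared definition presumably takes this shape (a sketch inferred from the deleted lines, not quoted from the patch):

	/* kernel/timer.c */
	u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

	EXPORT_SYMBOL(jiffies_64);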
Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/time.c | 4 ---- arch/arm/kernel/time.c | 4 ---- arch/arm26/kernel/time.c | 4 ---- arch/cris/kernel/time.c | 4 ---- arch/frv/kernel/time.c | 3 --- arch/h8300/kernel/time.c | 4 ---- arch/i386/kernel/time.c | 4 ---- arch/ia64/kernel/time.c | 4 ---- arch/m32r/kernel/time.c | 4 ---- arch/m68k/kernel/time.c | 4 ---- arch/m68knommu/kernel/time.c | 4 ---- arch/mips/kernel/time.c | 4 ---- arch/parisc/kernel/time.c | 4 ---- arch/ppc/kernel/time.c | 5 ----- arch/ppc64/kernel/time.c | 4 ---- arch/s390/kernel/time.c | 4 ---- arch/sh/kernel/time.c | 4 ---- arch/sh64/kernel/time.c | 2 -- arch/sparc/kernel/time.c | 4 ---- arch/sparc64/kernel/time.c | 4 ---- arch/um/kernel/time_kern.c | 4 ---- arch/v850/kernel/time.c | 4 ---- arch/x86_64/kernel/time.c | 4 ---- arch/xtensa/kernel/time.c | 3 --- kernel/timer.c | 4 ++++ 25 files changed, 4 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 67be50b7d80a..6b2921be1909 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -55,10 +55,6 @@ #include "proto.h" #include "irq_impl.h" -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - extern unsigned long wall_jiffies; /* kernel/timer.c */ static int set_rtc_mmss(unsigned long); diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 69449a818dcc..fc4729106a32 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -36,10 +36,6 @@ #include #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* * Our system timer. */ diff --git a/arch/arm26/kernel/time.c b/arch/arm26/kernel/time.c index e66aedd02fad..335525339ad6 100644 --- a/arch/arm26/kernel/time.c +++ b/arch/arm26/kernel/time.c @@ -34,10 +34,6 @@ #include #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - extern unsigned long wall_jiffies; /* this needs a better home */ diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index a2d99b4aedcd..b863815de78d 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -32,10 +32,6 @@ #include #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - int have_rtc; /* used to remember if we have an RTC or not */; #define TICK_SIZE tick diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index f43b734482e3..2e9741227b73 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c @@ -34,9 +34,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; -EXPORT_SYMBOL(jiffies_64); - unsigned long __nongprelbss __clkin_clock_speed_HZ; unsigned long __nongprelbss __ext_bus_clock_speed_HZ; unsigned long __nongprelbss __res_bus_clock_speed_HZ; diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c index af8c5d2057dd..688a5100604c 100644 --- a/arch/h8300/kernel/time.c +++ b/arch/h8300/kernel/time.c @@ -32,10 +32,6 @@ #define TICK_SIZE (tick_nsec / 1000) -u64 jiffies_64; - -EXPORT_SYMBOL(jiffies_64); - /* * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 46c35ec9137d..07471bba2dc6 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -74,10 +74,6 @@ int pit_latch_buggy; /* extern */ #include "do_timer.h" -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - unsigned int cpu_khz; /* Detected as we calibrate the 
TSC */ EXPORT_SYMBOL(cpu_khz); diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 8b8a5a45b621..5b7e736f3b49 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -32,10 +32,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - #define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c index 539c562cd54d..2ebce2063fea 100644 --- a/arch/m32r/kernel/time.c +++ b/arch/m32r/kernel/time.c @@ -39,10 +39,6 @@ extern void send_IPI_allbutself(int, int); extern void smp_local_timer_interrupt(struct pt_regs *); #endif -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - extern unsigned long wall_jiffies; #define TICK_SIZE (tick_nsec / 1000) diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index 4ec95e3cb874..98e4b1adfa29 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -27,10 +27,6 @@ #include #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - static inline int set_rtc_mmss(unsigned long nowtime) { if (mach_set_clock_mmss) diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index b17c1ecba966..b9d8abb45430 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -27,10 +27,6 @@ #define TICK_SIZE (tick_nsec / 1000) -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - extern unsigned long wall_jiffies; diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index a24651dfaaba..787ed541d442 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -45,10 +45,6 @@ #define TICK_SIZE (tick_nsec / 1000) -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* * forward reference */ diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index bc979e1abdec..cded25680787 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -33,10 +33,6 @@ #include -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* xtime and wall_jiffies keep wall-clock time */ extern unsigned long wall_jiffies; diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c index 22d7fd1e0aea..67797184f4eb 100644 --- a/arch/ppc/kernel/time.c +++ b/arch/ppc/kernel/time.c @@ -66,11 +66,6 @@ #include -/* XXX false sharing with below? 
*/ -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - unsigned long disarm_decr[NR_CPUS]; extern struct timezone sys_tz; diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index b56c6a324e17..ab565468578a 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -68,10 +68,6 @@ #include #include -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* keep track of when we need to update the rtc */ time_t last_rtc_update; extern int piranha_simulator; diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 2fd75da15495..9a1d95894f3d 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -49,10 +49,6 @@ #define TICK_SIZE tick -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - static ext_int_info_t ext_int_info_cc; static u64 init_timer_cc; static u64 jiffies_timer_cc; diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 02ca69918d7c..671b876416bf 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -56,10 +56,6 @@ extern unsigned long wall_jiffies; #define TICK_SIZE (tick_nsec / 1000) DEFINE_SPINLOCK(tmu0_lock); -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - /* XXX: Can we initialize this in a routine somewhere? Dreamcast doesn't want * these routines anywhere... */ #ifdef CONFIG_SH_RTC diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c index 43e395a14f49..870fe5327e09 100644 --- a/arch/sh64/kernel/time.c +++ b/arch/sh64/kernel/time.c @@ -116,8 +116,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; - static unsigned long tmu_base, rtc_base; unsigned long cprc_base; diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c index 279a62627c10..24814d58f9e1 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -45,10 +45,6 @@ extern unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - DEFINE_SPINLOCK(rtc_lock); enum sparc_clock_type sp_clock_typ; DEFINE_SPINLOCK(mostek_lock); diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c index 3f08a32f51a1..38c5525087a2 100644 --- a/arch/sparc64/kernel/time.c +++ b/arch/sparc64/kernel/time.c @@ -55,10 +55,6 @@ unsigned long ds1287_regs = 0UL; extern unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - static void __iomem *mstk48t08_regs; static void __iomem *mstk48t59_regs; diff --git a/arch/um/kernel/time_kern.c b/arch/um/kernel/time_kern.c index 4e08f7545d63..020ca79b8d33 100644 --- a/arch/um/kernel/time_kern.c +++ b/arch/um/kernel/time_kern.c @@ -22,10 +22,6 @@ #include "mode.h" #include "os.h" -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - int hz(void) { return(HZ); diff --git a/arch/v850/kernel/time.c b/arch/v850/kernel/time.c index ea3fd8844ff0..c1e85c2aef65 100644 --- a/arch/v850/kernel/time.c +++ b/arch/v850/kernel/time.c @@ -26,10 +26,6 @@ #include "mach.h" -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - #define TICK_SIZE (tick_nsec / 1000) /* diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 47d25ad08160..bd5ea09bfdf9 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -42,10 +42,6 @@ #include #endif -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - #ifdef CONFIG_CPU_FREQ static void cpufreq_delayed_get(void); #endif diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 8e423d1335ce..cb6e38ed2b1d 
100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -29,9 +29,6 @@ extern volatile unsigned long wall_jiffies; -u64 jiffies_64 = INITIAL_JIFFIES; -EXPORT_SYMBOL(jiffies_64); - spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(rtc_lock); diff --git a/kernel/timer.c b/kernel/timer.c index 562d53eabb46..fd74268d8663 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); #define time_interpolator_update(x) #endif +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + /* * per-CPU timer vector definitions: */ -- cgit v1.2.3 From a241ec65aeac3d69a08a7b153cccbdb7ea35063f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Oct 2005 15:03:12 -0800 Subject: [PATCH] RCU torture-testing kernel module This patch is a rewrite of the one submitted on October 1st, using modules (http://marc.theaimsgroup.com/?l=linux-kernel&m=112819093522998&w=2). This rewrite adds a tristate CONFIG_RCU_TORTURE_TEST, which enables an intense torture test of the RCU infrastructure. This is needed due to the continued changes to the RCU infrastructure to accommodate dynamic ticks, CPU hotplug, realtime, and so on. Most of the code is in a separate file that is compiled only if the CONFIG variable is set. Documentation on how to run the test and interpret the output is also included. This code has been tested on i386 and ppc64, and an earlier version of the code has received extensive testing on a number of architectures as part of the PREEMPT_RT patchset. Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/RCU/torture.txt | 122 +++++++++++ include/linux/rcupdate.h | 1 + kernel/Makefile | 1 + kernel/rcupdate.c | 10 + kernel/rcutorture.c | 492 ++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 21 ++ mm/mmap.c | 2 +- 7 files changed, 648 insertions(+), 1 deletion(-) create mode 100644 Documentation/RCU/torture.txt create mode 100644 kernel/rcutorture.c (limited to 'kernel') diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt new file mode 100644 index 000000000000..e4c38152f7f7 --- /dev/null +++ b/Documentation/RCU/torture.txt @@ -0,0 +1,122 @@ +RCU Torture Test Operation + + +CONFIG_RCU_TORTURE_TEST + +The CONFIG_RCU_TORTURE_TEST config option is available for all RCU +implementations. It creates an rcutorture kernel module that can +be loaded to run a torture test. The test periodically outputs +status messages via printk(), which can be examined via the dmesg +command (perhaps grepping for "rcutorture"). The test is started +when the module is loaded, and stops when the module is unloaded. + +However, actually setting this config option to "y" results in the system +running the test immediately upon boot, and ending only when the system +is taken down. Normally, one will instead want to build the system +with CONFIG_RCU_TORTURE_TEST=m and to use modprobe and rmmod to control +the test, perhaps using a script similar to the one shown at the end of +this document. Note that you will need CONFIG_MODULE_UNLOAD in order +to be able to end the test. + + +MODULE PARAMETERS + +This module has the following parameters: + +nreaders This is the number of RCU reading threads supported. + The default is twice the number of CPUs. Why twice? + To properly exercise RCU implementations with preemptible + read-side critical sections.
+ +stat_interval The number of seconds between output of torture + statistics (via printk()). Regardless of the interval, + statistics are printed when the module is unloaded. + Setting the interval to zero causes the statistics to + be printed -only- when the module is unloaded, and this + is the default. + +verbose Enable debug printk()s. Default is disabled. + + +OUTPUT + +The statistics output is as follows: + + rcutorture: --- Start of test: nreaders=16 stat_interval=0 verbose=0 + rcutorture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915 + rcutorture: Reader Pipe: 1466408 9747 0 0 0 0 0 0 0 0 0 + rcutorture: Reader Batch: 1464477 11678 0 0 0 0 0 0 0 0 + rcutorture: Free-Block Circulation: 1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0 + rcutorture: --- End of test + +The command "dmesg | grep rcutorture:" will extract this information on +most systems. On more esoteric configurations, it may be necessary to +use other commands to access the output of the printk()s used by +the RCU torture test. The printk()s use KERN_ALERT, so they should +be evident. ;-) + +The entries are as follows: + +o "ggp": The number of counter flips (or batches) since boot. + +o "rtc": The hexadecimal address of the structure currently visible + to readers. + +o "ver": The number of times since boot that the rcutw writer task + has changed the structure visible to readers. + +o "tfle": If non-zero, indicates that the "torture freelist" + containing structures to be placed into the "rtc" area is empty. + This condition is important, since it can fool you into thinking + that RCU is working when it is not. :-/ + +o "rta": Number of structures allocated from the torture freelist. + +o "rtaf": Number of allocations from the torture freelist that have + failed due to the list being empty. + +o "rtf": Number of frees into the torture freelist. + +o "Reader Pipe": Histogram of "ages" of structures seen by readers. + If any entries past the first two are non-zero, RCU is broken. + And rcutorture prints the error flag string "!!!" to make sure + you notice. The age of a newly allocated structure is zero; + it becomes one when removed from reader visibility, and is + incremented once per grace period subsequently -- and is freed + after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods. + + The output displayed above was taken from a correctly working + RCU. If you want to see what it looks like when broken, break + it yourself. ;-) + +o "Reader Batch": Another histogram of "ages" of structures seen + by readers, but in terms of counter flips (or batches) rather + than in terms of grace periods. The legal number of non-zero + entries is again two. The reason for this separate view is + that it is easier to get the third entry to show up in the + "Reader Batch" list than in the "Reader Pipe" list. + +o "Free-Block Circulation": Shows the number of torture structures + that have reached a given point in the pipeline. The first element + should closely correspond to the number of structures allocated, + the second to the number that have been removed from reader view, + and all but the last remaining to the corresponding number of + passes through a grace period. The last entry should be zero, + as it is only incremented if a torture structure's counter + somehow gets incremented farther than it should.
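As an aside, the histogram rules above lend themselves to automated checking. The following userspace sketch is an editorial illustration, not part of this patch or of torture.txt; the file name rcucheck.c and the exact "Reader Pipe: " line format are assumptions taken from the sample output above:

    /* rcucheck.c -- read "dmesg | grep rcutorture:" on stdin and exit
     * non-zero if any "Reader Pipe" entry past the first two is
     * non-zero, which the rules above define as a broken RCU. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            char line[1024];

            while (fgets(line, sizeof(line), stdin)) {
                    char *p = strstr(line, "Reader Pipe: ");
                    char *end;
                    int idx;

                    if (p == NULL)
                            continue;
                    p += strlen("Reader Pipe: ");
                    for (idx = 0; ; idx++) {
                            long v = strtol(p, &end, 10);

                            if (end == p)   /* no more numbers */
                                    break;
                            p = end;
                            if (idx >= 2 && v != 0) {
                                    printf("!!! pipe[%d] = %ld\n", idx, v);
                                    return 1;
                            }
                    }
            }
            return 0;
    }

Used as "dmesg | ./rcucheck", this automates the manual inspection for the "!!!" flag suggested under USAGE below.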
+ + +USAGE + +The following script may be used to torture RCU: + + #!/bin/sh + + modprobe rcutorture + sleep 100 + rmmod rcutorture + dmesg | grep rcutorture: + +The output can be manually inspected for the error flag of "!!!". +One could of course create a more elaborate script that automatically +checked for such errors. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 70191a5a148f..cce25591eec2 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -275,6 +275,7 @@ static inline int rcu_pending(int cpu) extern void rcu_init(void); extern void rcu_check_callbacks(int cpu, int user); extern void rcu_restart_cpu(int cpu); +extern long rcu_batches_completed(void); /* Exported interfaces */ extern void FASTCALL(call_rcu(struct rcu_head *head, diff --git a/kernel/Makefile b/kernel/Makefile index 980b5e454441..4f5a1453093a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2559d4b8f23f..c4d159a21e04 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -153,6 +153,15 @@ void fastcall call_rcu_bh(struct rcu_head *head, local_irq_restore(flags); } +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} + /* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. @@ -501,6 +510,7 @@ void synchronize_kernel(void) } module_param(maxbatch, int, 0); +EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 000000000000..9b58f1eff3ca --- /dev/null +++ b/kernel/rcutorture.c @@ -0,0 +1,492 @@ +/* + * Read-Copy Update /proc-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * + * Authors: Paul E. McKenney + * + * See also: Documentation/RCU/torture.txt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); + +static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ +static int stat_interval = 0; /* Interval between stats, in seconds. 
*/ + /* Defaults to "only at end of test". */ +static int verbose = 0; /* Print more debug info. */ + +MODULE_PARM(nreaders, "i"); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +MODULE_PARM(stat_interval, "i"); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +MODULE_PARM(verbose, "i"); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +#define TORTURE_FLAG "rcutorture: " +#define PRINTK_STRING(s) \ + do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) + +static char printk_buf[4096]; + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; +}; + +static int fullstop = 0; /* stop generating callbacks at test end. */ +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture *rcu_torture_current = NULL; +static long rcu_torture_current_version = 0; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = + { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = + { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +atomic_t n_rcu_torture_alloc; +atomic_t n_rcu_torture_alloc_fail; +atomic_t n_rcu_torture_free; + +/* + * Allocate an element from the rcu_tortures pool. + */ +struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock(&rcu_torture_lock); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) + rcu_torture_free(rp); + else + call_rcu(p, rcu_torture_cb); +} + +struct rcu_random_state { + unsigned long rrs_state; + unsigned long rrs_count; +}; + +#define RCU_RANDOM_MULT 39916801 /* prime */ +#define RCU_RANDOM_ADD 479001701 /* prime */ +#define RCU_RANDOM_REFRESH 10000 + +#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). 
+ */ +static long +rcu_random(struct rcu_random_state *rrsp) +{ + long refresh; + + if (--rrsp->rrs_count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rrsp->rrs_state += refresh; + rrsp->rrs_count = RCU_RANDOM_REFRESH; + } + rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; + return swahw32(rrsp->rrs_state); +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + int i; + long oldbatch = rcu_batches_completed(); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + do { + schedule_timeout_uninterruptible(1); + if (rcu_batches_completed() == oldbatch) + continue; + if ((rp = rcu_torture_alloc()) == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(rcu_random(&rand) & 0x3ff); + old_rp = rcu_torture_current; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); + if (old_rp != NULL) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); + } + rcu_torture_current_version++; + oldbatch = rcu_batches_completed(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + DEFINE_RCU_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + do { + rcu_read_lock(); + completed = rcu_batches_completed(); + p = rcu_dereference(rcu_torture_current); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + rcu_read_unlock(); + schedule_timeout_interruptible(HZ); + continue; + } + udelay(rcu_random(&rand) & 0x7f); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_count)[pipe_count]; + completed = rcu_batches_completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_batch)[completed]; + preempt_enable(); + rcu_read_unlock(); + schedule(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer. 
+ */ +static int +rcu_torture_printk(char *page) +{ + int cnt = 0; + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + if (pipesummary[i] != 0) + break; + } + cnt += sprintf(&page[cnt], "rcutorture: "); + cnt += sprintf(&page[cnt], + "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + if (i > 1) + cnt += sprintf(&page[cnt], "!!! "); + cnt += sprintf(&page[cnt], "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + cnt += sprintf(&page[cnt], " %d", + atomic_read(&rcu_torture_wcount[i])); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + */ +static void +rcu_torture_stats_print(void) +{ + int cnt; + + cnt = rcu_torture_printk(printk_buf); + printk(KERN_ALERT "%s", printk_buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. + */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static void +rcu_torture_cleanup(void) +{ + int i; + + fullstop = 1; + if (writer_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks != NULL) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (stats_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + /* Wait for all RCU callbacks to fire. */ + + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + synchronize_rcu(); + rcu_torture_stats_print(); /* -After- the stats thread is stopped! 
*/ + PRINTK_STRING("--- End of test"); +} + +static int +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + + /* Process args and tell the world that the torturer is on the job. */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + printk(KERN_ALERT TORTURE_FLAG + "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", + nrealreaders, stat_interval, verbose); + fullstop = 0; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. */ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_run(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + return 0; + +unwind: + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f9306b14d2a6..156822e3cc79 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -168,6 +168,14 @@ config DEBUG_FS If unsure, say N. +config DEBUG_VM + bool "Debug VM" + depends on DEBUG_KERNEL + help + Enable this to debug the virtual-memory system. + + If unsure, say N. + config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML) @@ -178,3 +186,16 @@ config FRAME_POINTER some architectures or if you use external debuggers. If you don't debug the kernel, you can say N. +config RCU_TORTURE_TEST + tristate "torture tests for RCU" + depends on DEBUG_KERNEL + default n + help + This option provides a kernel module that runs torture tests + on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. 
+ + Say Y here if you want RCU torture tests to start automatically + at boot time (you probably don't). + Say M if you want the RCU torture tests to build as a module. + Say N if you are unsure. diff --git a/mm/mmap.c b/mm/mmap.c index 5ecc2cf3e1d7..320dda1778c3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1840,7 +1840,7 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len) static inline void verify_mm_writelocked(struct mm_struct *mm) { -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_VM if (unlikely(down_read_trylock(&mm->mmap_sem))) { WARN_ON(1); up_read(&mm->mmap_sem); -- cgit v1.2.3 From 708f430dcc50787d1c0b5c31962a5ff0dd8e35eb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 30 Oct 2005 15:03:13 -0800 Subject: [PATCH] posix-cpu-timers: fix overrun reporting This change corrects an omission in posix_cpu_timer_schedule, so that it correctly propagates the overrun calculation to where it will get reported to the user. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bf374fceb39c..91a894264941 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1225,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * The task was cleaned up already, no future firings. */ - return; + goto out; /* * Fetch the current sample and update the timer's expiry time. @@ -1235,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) { clear_dead_task(timer, now); - return; + goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ } else { @@ -1248,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) put_task_struct(p); timer->it.cpu.task = p = NULL; timer->it.cpu.expires.sched = 0; - read_unlock(&tasklist_lock); - return; + goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* * We've noticed that the thread is dead, but @@ -1257,8 -1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * drop our task ref. */ clear_dead_task(timer, now); - read_unlock(&tasklist_lock); - return; + goto out_unlock; } cpu_clock_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); @@ -1270,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ arm_timer(timer, now); +out_unlock: read_unlock(&tasklist_lock); + +out: + timer->it_overrun_last = timer->it_overrun; + timer->it_overrun = -1; + ++timer->it_requeue_pending; } /* -- cgit v1.2.3 From 4098f9918e068e51fed1727f6ba80efcec372378 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 30 Oct 2005 15:03:21 -0800 Subject: [PATCH] sched: hardcode non-smp set_cpus_allowed Simplify the UP (1 CPU) implementation of set_cpus_allowed. The one CPU is hardcoded to be cpu 0 - so just test for that bit, and avoid having to pick up the cpu_online_map. Also, unexport cpu_online_map: it was only needed for set_cpus_allowed().
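To make the equivalence concrete, here is a small userspace model (an illustrative sketch only, not kernel code): with cpu 0 as the only CPU that can ever be online, testing whether a requested mask intersects the online map is the same as testing bit 0 of the mask, so the global map need not be consulted at all.

    /* up_mask.c -- toy model of the UP simplification above. */
    #include <assert.h>

    #define CPU0_BIT (1UL << 0)

    static int intersects(unsigned long a, unsigned long b)
    {
            return (a & b) != 0;
    }

    int main(void)
    {
            unsigned long online = CPU0_BIT;        /* UP: only cpu 0 */
            unsigned long mask;

            /* For every small mask, the two checks agree. */
            for (mask = 0; mask < 16; mask++)
                    assert(intersects(mask, online) ==
                           ((mask & CPU0_BIT) != 0));
            return 0;
    }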
Signed-off-by: Paul Jackson Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/sched.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b2d2dc14f0b9..41285a0e7258 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -940,7 +940,7 @@ extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); #else static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) { - if (!cpus_intersects(new_mask, cpu_online_map)) + if (!cpu_isset(0, new_mask)) return -EINVAL; return 0; } diff --git a/kernel/sched.c b/kernel/sched.c index 4f26c544d02c..340dd238c16d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3877,7 +3877,6 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map = CPU_MASK_ALL; -EXPORT_SYMBOL_GPL(cpu_online_map); cpumask_t cpu_possible_map = CPU_MASK_ALL; #endif -- cgit v1.2.3 From b67a1b9e4bf878aa5d4b6b44cb5a251a2f425f0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:03:44 -0800 Subject: [PATCH] remove hardcoded SEND_SIG_xxx constants This patch replaces hardcoded SEND_SIG_xxx constants with their symbolic names. No changes in affected .o files. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 +++++----- kernel/signal.c | 35 ++++++++++++++++++++--------------- security/selinux/hooks.c | 4 ++-- 3 files changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2d39ccc367e6..537394b25e8d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -547,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) if (p->pdeath_signal) /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, (void *) 0, p); + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); /* Move the child from its dying parent to the new one. 
*/ if (unlikely(traced)) { @@ -591,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { - __kill_pg_info(SIGHUP, (void *)1, pgrp); - __kill_pg_info(SIGCONT, (void *)1, pgrp); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } } @@ -727,8 +727,8 @@ static void exit_notify(struct task_struct *tsk) (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { - __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); - __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); } /* Let father know we died diff --git a/kernel/signal.c b/kernel/signal.c index 9d1512dcf176..1f7b2aaa4a39 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -651,8 +651,9 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; error = -EPERM; - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || + (info != SEND_SIG_PRIV && info != SEND_SIG_FORCED + && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) @@ -789,7 +790,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. */ - if ((unsigned long)info == 2) + if (info == SEND_SIG_FORCED) goto out_set; /* Real-time signals must be queued if sent by sigqueue, or @@ -801,19 +802,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - ((unsigned long) info < 2 || + (info < SEND_SIG_FORCED || info->si_code >= 0))); if (q) { list_add_tail(&q->list, &signals->list); switch ((unsigned long) info) { - case 0: + case (unsigned long) SEND_SIG_NOINFO: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = current->pid; q->info.si_uid = current->uid; break; - case 1: + case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -825,14 +826,15 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, break; } } else { - if (sig >= SIGRTMIN && info && (unsigned long)info != 1 + if (sig >= SIGRTMIN + && info != SEND_SIG_NOINFO && info != SEND_SIG_PRIV && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). */ return -EAGAIN; - if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_PRIV) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped * the signal. @@ -858,7 +860,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. 
*/ @@ -914,7 +916,7 @@ force_sig_specific(int sig, struct task_struct *t) t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; sigdelset(&t->blocked, sig); recalc_sigpending_tsk(t); - specific_send_sig_info(sig, (void *)2, t); + specific_send_sig_info(sig, SEND_SIG_FORCED, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); } @@ -1050,7 +1052,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) + if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. */ @@ -1285,10 +1287,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) return ret; } +#define __si_special(priv) \ + ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) + int send_sig(int sig, struct task_struct *p, int priv) { - return send_sig_info(sig, (void*)(long)(priv != 0), p); + return send_sig_info(sig, __si_special(priv), p); } /* @@ -1308,7 +1313,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) void force_sig(int sig, struct task_struct *p) { - force_sig_info(sig, (void*)1L, p); + force_sig_info(sig, SEND_SIG_PRIV, p); } /* @@ -1333,13 +1338,13 @@ force_sigsegv(int sig, struct task_struct *p) int kill_pg(pid_t pgrp, int sig, int priv) { - return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); + return kill_pg_info(sig, __si_special(priv), pgrp); } int kill_proc(pid_t pid, int sig, int priv) { - return kill_proc_info(sig, (void *)(long)(priv != 0), pid); + return kill_proc_info(sig, __si_special(priv), pid); } /* diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index bb62838be496..295ac472faf1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2713,8 +2713,8 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, int si if (rc) return rc; - if (info && ((unsigned long)info == 1 || - (unsigned long)info == 2 || SI_FROMKERNEL(info))) + if (info != SEND_SIG_NOINFO && (info == SEND_SIG_PRIV || + info == SEND_SIG_FORCED || SI_FROMKERNEL(info))) return 0; if (!sig) -- cgit v1.2.3 From 621d31219d9a788bda924a0613048053f3f5f211 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:03:45 -0800 Subject: [PATCH] cleanup the usage of SEND_SIG_xxx constants This patch simplifies some checks for magic siginfo values. It should not change the behaviour in any way. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +++++ kernel/signal.c | 18 +++++++----------- security/selinux/hooks.c | 3 +-- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 41285a0e7258..03b68a7b4b82 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1084,6 +1084,11 @@ extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned lon #define SEND_SIG_PRIV ((struct siginfo *) 1) #define SEND_SIG_FORCED ((struct siginfo *) 2) +static inline int is_si_special(const struct siginfo *info) +{ + return info <= SEND_SIG_FORCED; +} + /* True if we are on the alternate signal stack. 
*/ static inline int on_sig_stack(unsigned long sp) diff --git a/kernel/signal.c b/kernel/signal.c index 1f7b2aaa4a39..27533b90c347 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -651,9 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; error = -EPERM; - if ((info == SEND_SIG_NOINFO || - (info != SEND_SIG_PRIV && info != SEND_SIG_FORCED - && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) @@ -802,7 +800,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - (info < SEND_SIG_FORCED || + (is_si_special(info) || info->si_code >= 0))); if (q) { list_add_tail(&q->list, &signals->list); @@ -825,16 +823,14 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, copy_siginfo(&q->info, info); break; } - } else { - if (sig >= SIGRTMIN - && info != SEND_SIG_NOINFO && info != SEND_SIG_PRIV - && info->si_code != SI_USER) + } else if (!is_si_special(info)) { + if (sig >= SIGRTMIN && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). */ return -EAGAIN; - if ((info > SEND_SIG_PRIV) && (info->si_code == SI_TIMER)) + if (info->si_code == SI_TIMER) /* * Set up a return to indicate that we dropped * the signal. @@ -860,7 +856,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) + if (!is_si_special(info) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. */ @@ -1052,7 +1048,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER)) + if (!is_si_special(info) && (info->si_code == SI_TIMER)) /* * Set up a return to indicate that we dropped the signal. */ diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 295ac472faf1..45c41490d521 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2713,8 +2713,7 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, int si if (rc) return rc; - if (info != SEND_SIG_NOINFO && (info == SEND_SIG_PRIV || - info == SEND_SIG_FORCED || SI_FROMKERNEL(info))) + if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) return 0; if (!sig) -- cgit v1.2.3 From ae6866c377943de73e2c95398ff0120516f167ce Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Oct 2005 15:03:46 -0800 Subject: [PATCH] remove unneeded SI_TIMER checks This patch removes checks for ->si_code == SI_TIMER from send_signal, specific_send_sig_info, __group_send_sig_info. I think posix-timers.c used these functions some time ago, now it sends signals via send_{,group_}sigqueue, so these hooks are unneeded. 
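The sentinel convention that underlies all three of these signal patches can be modelled in a few lines of userspace C (an illustrative sketch; the kernel's real definitions appear in the sched.h hunk above). Casting the small integers 0, 1, and 2 to struct siginfo * yields values that can never alias a real siginfo allocation, so a single pointer comparison classifies them, exactly as is_si_special() does:

    /* si_special.c -- toy model of the SEND_SIG_xxx sentinels. */
    #include <assert.h>

    struct siginfo;                         /* opaque here */

    #define SEND_SIG_NOINFO ((struct siginfo *) 0)
    #define SEND_SIG_PRIV   ((struct siginfo *) 1)
    #define SEND_SIG_FORCED ((struct siginfo *) 2)

    /* Mirrors the kernel's is_si_special(); comparing unrelated
     * pointers this way is a deliberate kernel idiom. */
    static int is_si_special(const struct siginfo *info)
    {
            return info <= SEND_SIG_FORCED;
    }

    int main(void)
    {
            struct siginfo *real = (struct siginfo *) 0x1000; /* stand-in
                                        for a genuine siginfo address */

            assert(is_si_special(SEND_SIG_NOINFO));
            assert(is_si_special(SEND_SIG_PRIV));
            assert(is_si_special(SEND_SIG_FORCED));
            assert(!is_si_special(real));
            return 0;
    }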
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 27533b90c347..1d8f84c5c6ee 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -830,12 +830,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, * and sent by user using something other than kill(). */ return -EAGAIN; - if (info->si_code == SI_TIMER) - /* - * Set up a return to indicate that we dropped - * the signal. - */ - ret = info->si_sys_private; } out_set: @@ -856,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if (!is_si_special(info) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. */ if (sig_ignored(t, sig)) goto out; @@ -1048,12 +1036,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if (!is_si_special(info) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. */ if (sig_ignored(p, sig)) return ret; -- cgit v1.2.3 From b0423a0d9cc836b2c3d796623cd19236bfedfe63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Oct 2005 15:03:46 -0800 Subject: [PATCH] Remove duplicate code in signal.c Combine a bit of redundant code between force_sig_info() and force_sig_specific(). Signed-off-by: paulmck@us.ibm.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1d8f84c5c6ee..1bf3c39d6109 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -879,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) int ret; spin_lock_irqsave(&t->sighand->siglock, flags); - if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { + if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + } + if (sigismember(&t->blocked, sig)) { sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); } + recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -893,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) void force_sig_specific(int sig, struct task_struct *t) { - unsigned long int flags; - - spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); - specific_send_sig_info(sig, SEND_SIG_FORCED, t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); + force_sig_info(sig, SEND_SIG_FORCED, t); } /* -- cgit v1.2.3 From 4e57b6817880946a3a78d5d8cad1ace363f7e449 Mon Sep 17 00:00:00 2001 From: Tim Schmielau Date: Sun, 30 Oct 2005 15:03:48 -0800 Subject: [PATCH] fix missing includes I recently picked up my older work to remove unnecessary #includes of sched.h, starting from a patch by Dave Jones to not include sched.h from module.h. 
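A typical instance of the breakage being headed off looks like the following fragment (a hypothetical driver, not one of the files touched by this series): it compiles today only because module.h happens to drag sched.h in indirectly, and it breaks once that chain is cut unless the direct include is added.

    #include <linux/kernel.h>       /* printk() */
    #include <linux/module.h>
    #include <linux/sched.h>        /* current, HZ -- now needed directly */

    static void example_status(void)
    {
            /* Both "current" and HZ are declared in sched.h. */
            printk(KERN_DEBUG "pid %d at %d ticks/sec\n",
                   current->pid, HZ);
    }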
This reduces the number of indirect includes of sched.h by ~300. Another ~400 pointless direct includes can be removed after this disentangling (patch to follow later). However, quite a few indirect includes need to be fixed up for this. In order to feed the patches through -mm with as little disturbance as possible, I've split out the fixes I accumulated up to now (complete for i386 and x86_64, more archs to follow later) and post them before the real patch. This way this large part of the patch is kept simple with only adding #includes, and all hunks are independent of each other. So if any hunk rejects or gets in the way of other patches, just drop it. My scripts will pick it up again in the next round. Signed-off-by: Tim Schmielau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/firmware_class/firmware_sample_driver.c | 1 + Documentation/firmware_class/firmware_sample_firmware_class.c | 2 ++ arch/arm/common/amba.c | 2 ++ arch/arm/common/scoop.c | 3 +++ arch/arm/kernel/arthur.c | 1 + arch/arm/mach-imx/generic.c | 2 ++ arch/arm/mach-integrator/clock.c | 1 + arch/arm/mach-integrator/integrator_ap.c | 1 + arch/arm/mach-integrator/lm.c | 1 + arch/arm/mach-iop3xx/iq31244-pci.c | 2 ++ arch/arm/mach-iop3xx/iq80321-pci.c | 2 ++ arch/arm/mach-iop3xx/iq80331-pci.c | 2 ++ arch/arm/mach-iop3xx/iq80332-pci.c | 2 ++ arch/arm/mach-pxa/generic.c | 1 + arch/arm/mach-sa1100/generic.c | 1 + arch/arm/mach-versatile/clock.c | 1 + arch/arm/plat-omap/clock.c | 1 + arch/cris/arch-v10/drivers/axisflashmap.c | 1 + arch/cris/arch-v32/drivers/axisflashmap.c | 1 + arch/cris/kernel/time.c | 1 + arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c | 1 + arch/i386/kernel/cpu/cpufreq/p4-clockmod.c | 1 + arch/i386/kernel/cpu/cpufreq/powernow-k8.c | 1 + arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c | 1 + arch/i386/kernel/cpu/intel_cacheinfo.c | 1 + arch/ia64/kernel/cyclone.c | 1 + arch/m32r/lib/csum_partial_copy.c | 2 +- arch/mips/sgi-ip27/ip27-berr.c | 1 + arch/ppc/syslib/of_device.c | 2 ++ arch/ppc64/kernel/hvcserver.c | 2 ++ arch/ppc64/kernel/of_device.c | 2 ++ arch/ppc64/lib/locks.c | 2 ++ arch/sh/drivers/dma/dma-sysfs.c | 1 + arch/sh/kernel/cpufreq.c | 1 + arch/xtensa/kernel/platform.c | 1 + drivers/acpi/processor_idle.c | 1 + drivers/base/class.c | 1 + drivers/base/platform.c | 1 + drivers/base/sys.c | 1 + drivers/block/cciss_scsi.c | 10 +++++++--- drivers/block/paride/paride.c | 1 + drivers/block/paride/pg.c | 2 ++ drivers/block/paride/pt.c | 1 + drivers/char/agp/ali-agp.c | 1 + drivers/char/agp/amd64-agp.c | 1 + drivers/char/agp/ati-agp.c | 2 ++ drivers/char/agp/i460-agp.c | 2 ++ drivers/char/agp/isoch.c | 1 + drivers/char/agp/sworks-agp.c | 2 ++ drivers/char/drm/drm_sysfs.c | 2 ++ drivers/char/mwave/3780i.c | 2 ++ drivers/char/watchdog/cpu5wdt.c | 1 + drivers/char/watchdog/mixcomwd.c | 2 ++ drivers/char/watchdog/pcwd.c | 2 +- drivers/char/watchdog/sc520_wdt.c | 1 + drivers/char/watchdog/softdog.c | 2 ++ drivers/infiniband/core/cache.c | 1 + drivers/infiniband/core/sa_query.c | 1 + drivers/infiniband/hw/mthca/mthca_av.c | 2 ++ drivers/infiniband/hw/mthca/mthca_mad.c | 3 +++ drivers/infiniband/hw/mthca/mthca_mcg.c | 2 ++ drivers/infiniband/hw/mthca/mthca_profile.c | 2 ++ drivers/infiniband/hw/mthca/mthca_qp.c | 2 ++ drivers/infiniband/hw/mthca/mthca_reset.c | 1 + drivers/infiniband/hw/mthca/mthca_uar.c | 2 ++ drivers/input/gameport/gameport.c | 1 + drivers/input/joystick/a3d.c | 1 + drivers/input/joystick/adi.c | 1 + drivers/input/joystick/analog.c | 1 + drivers/input/joystick/cobra.c | 1 + 
drivers/input/joystick/gf2k.c | 1 + drivers/input/joystick/grip.c | 1 + drivers/input/joystick/grip_mp.c | 1 + drivers/input/joystick/guillemot.c | 1 + drivers/input/joystick/interact.c | 1 + drivers/input/joystick/joydump.c | 1 + drivers/input/joystick/sidewinder.c | 1 + drivers/input/joystick/tmdc.c | 1 + drivers/input/serio/hp_sdc_mlc.c | 1 + drivers/isdn/capi/capifs.c | 1 + drivers/macintosh/macio_asic.c | 2 ++ drivers/mca/mca-device.c | 1 + drivers/media/common/ir-common.c | 1 + drivers/media/dvb/dvb-core/dvb_ca_en50221.c | 1 + drivers/media/dvb/frontends/bcm3510.c | 3 +++ drivers/media/dvb/frontends/dib3000mb.c | 2 ++ drivers/media/dvb/frontends/dib3000mc.c | 2 ++ drivers/media/dvb/frontends/dvb_dummy_fe.c | 2 ++ drivers/media/dvb/frontends/lgdt330x.c | 2 ++ drivers/media/dvb/frontends/mt312.c | 2 ++ drivers/media/dvb/frontends/mt352.c | 2 ++ drivers/media/dvb/frontends/nxt2002.c | 2 ++ drivers/media/dvb/frontends/or51132.c | 2 ++ drivers/media/dvb/frontends/or51211.c | 2 ++ drivers/media/dvb/frontends/s5h1420.c | 2 ++ drivers/media/dvb/frontends/sp8870.c | 2 ++ drivers/media/dvb/frontends/sp887x.c | 2 ++ drivers/media/dvb/frontends/stv0297.c | 2 ++ drivers/media/dvb/frontends/stv0299.c | 1 + drivers/media/dvb/frontends/tda1004x.c | 4 ++++ drivers/media/dvb/frontends/tda8083.c | 1 + drivers/media/radio/miropcm20-rds.c | 1 + drivers/message/i2o/device.c | 2 ++ drivers/message/i2o/driver.c | 3 +++ drivers/message/i2o/exec-osm.c | 4 ++++ drivers/message/i2o/iop.c | 1 + drivers/mtd/chips/jedec.c | 1 + drivers/mtd/devices/lart.c | 1 + drivers/mtd/devices/phram.c | 1 + drivers/mtd/maps/bast-flash.c | 1 + drivers/mtd/maps/ceiva.c | 1 + drivers/mtd/maps/dc21285.c | 1 + drivers/mtd/maps/dilnetpc.c | 5 ++++- drivers/mtd/maps/epxa10db-flash.c | 5 ++++- drivers/mtd/maps/fortunet.c | 5 ++++- drivers/mtd/maps/ixp2000.c | 6 ++++-- drivers/mtd/maps/ixp4xx.c | 7 +++++-- drivers/mtd/maps/lubbock-flash.c | 3 +++ drivers/mtd/maps/mainstone-flash.c | 3 +++ drivers/mtd/maps/omap-toto-flash.c | 2 +- drivers/mtd/maps/omap_nor.c | 2 ++ drivers/mtd/maps/pci.c | 1 + drivers/mtd/maps/plat-ram.c | 1 + drivers/mtd/maps/tqm8xxl.c | 4 +++- drivers/mtd/mtdblock.c | 1 + drivers/mtd/mtdchar.c | 1 + drivers/mtd/mtdconcat.c | 2 +- drivers/mtd/nand/s3c2410.c | 1 + drivers/pci/hotplug/cpcihp_generic.c | 1 + drivers/pci/hotplug/cpcihp_zt5550.c | 1 + drivers/pci/hotplug/fakephp.c | 2 ++ drivers/pci/hotplug/pciehprm_nonacpi.c | 3 +++ drivers/pci/hotplug/rpadlpar_core.c | 3 +++ drivers/pci/hotplug/rpaphp_pci.c | 4 +++- drivers/pci/hotplug/rpaphp_slot.c | 3 +++ drivers/pci/hotplug/shpchp.h | 2 ++ drivers/pci/hotplug/shpchprm_nonacpi.c | 2 ++ drivers/pci/pci-driver.c | 2 ++ drivers/pci/pci.c | 1 + drivers/pci/pcie/portdrv_core.c | 2 ++ drivers/pci/pcie/portdrv_pci.c | 1 + drivers/pci/rom.c | 1 + drivers/pnp/manager.c | 2 ++ drivers/pnp/pnpbios/rsparser.c | 2 ++ drivers/s390/cio/cmf.c | 3 +++ drivers/s390/cio/device.c | 1 + drivers/s390/cio/device_fsm.c | 2 ++ drivers/scsi/scsi_transport_fc.c | 1 + drivers/scsi/scsi_transport_iscsi.c | 3 +++ drivers/scsi/sym53c8xx_2/sym_hipd.c | 3 +++ drivers/scsi/sym53c8xx_2/sym_hipd.h | 2 ++ drivers/sh/superhyway/superhyway.c | 2 ++ drivers/usb/host/ohci-omap.c | 3 +++ drivers/usb/host/ohci-pci.c | 2 ++ drivers/usb/host/ohci-pxa27x.c | 1 + drivers/w1/w1_family.c | 1 + drivers/zorro/zorro-sysfs.c | 1 + drivers/zorro/zorro.c | 2 ++ fs/filesystems.c | 1 + fs/jffs2/background.c | 1 + fs/jffs2/wbuf.c | 2 ++ include/linux/cpufreq.h | 1 + include/linux/gameport.h | 1 + include/linux/i2c.h | 1 + 
include/linux/i2o.h | 8 ++++++-- include/linux/kobj_map.h | 2 ++ include/linux/mtd/map.h | 3 +++ include/linux/serial.h | 1 + include/linux/textsearch.h | 1 + include/pcmcia/ss.h | 1 + include/scsi/scsi_cmnd.h | 1 + include/scsi/scsi_transport_fc.h | 1 + kernel/kallsyms.c | 1 + kernel/kprobes.c | 1 + kernel/params.c | 1 + lib/kobject.c | 1 + lib/smp_processor_id.c | 1 + lib/sort.c | 1 + lib/vsprintf.c | 1 + sound/oss/ac97_codec.c | 1 + 180 files changed, 299 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/Documentation/firmware_class/firmware_sample_driver.c b/Documentation/firmware_class/firmware_sample_driver.c index 4bef8c25172c..d3ad2c24490a 100644 --- a/Documentation/firmware_class/firmware_sample_driver.c +++ b/Documentation/firmware_class/firmware_sample_driver.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "linux/firmware.h" diff --git a/Documentation/firmware_class/firmware_sample_firmware_class.c b/Documentation/firmware_class/firmware_sample_firmware_class.c index 09eab2f1b373..57b956aecbc5 100644 --- a/Documentation/firmware_class/firmware_sample_firmware_class.c +++ b/Documentation/firmware_class/firmware_sample_firmware_class.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include diff --git a/arch/arm/common/amba.c b/arch/arm/common/amba.c index c6beb751f2a9..e1013112c354 100644 --- a/arch/arm/common/amba.c +++ b/arch/arm/common/amba.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c index e8356b76d7c6..4af0cf5f3bfb 100644 --- a/arch/arm/common/scoop.c +++ b/arch/arm/common/scoop.c @@ -12,6 +12,9 @@ */ #include +#include +#include + #include #include diff --git a/arch/arm/kernel/arthur.c b/arch/arm/kernel/arthur.c index a418dad6692c..0ee2e9819631 100644 --- a/arch/arm/kernel/arthur.c +++ b/arch/arm/kernel/arthur.c @@ -18,6 +18,7 @@ #include #include #include +#include #include diff --git a/arch/arm/mach-imx/generic.c b/arch/arm/mach-imx/generic.c index cb14b0682cef..837d7f0bda4c 100644 --- a/arch/arm/mach-imx/generic.c +++ b/arch/arm/mach-imx/generic.c @@ -26,6 +26,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/arm/mach-integrator/clock.c b/arch/arm/mach-integrator/clock.c index 56200594db3c..73c360685cad 100644 --- a/arch/arm/mach-integrator/clock.c +++ b/arch/arm/mach-integrator/clock.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c index f368b85f0447..764ceb49470a 100644 --- a/arch/arm/mach-integrator/integrator_ap.c +++ b/arch/arm/mach-integrator/integrator_ap.c @@ -30,6 +30,7 @@ #include #include #include +#include /* HZ */ #include #include #include diff --git a/arch/arm/mach-integrator/lm.c b/arch/arm/mach-integrator/lm.c index c5f19d160598..5b41e3a724e1 100644 --- a/arch/arm/mach-integrator/lm.c +++ b/arch/arm/mach-integrator/lm.c @@ -10,6 +10,7 @@ #include #include #include +#include #include diff --git a/arch/arm/mach-iop3xx/iq31244-pci.c b/arch/arm/mach-iop3xx/iq31244-pci.c index f997daa800bf..c6a973ba8fc6 100644 --- a/arch/arm/mach-iop3xx/iq31244-pci.c +++ b/arch/arm/mach-iop3xx/iq31244-pci.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include diff --git a/arch/arm/mach-iop3xx/iq80321-pci.c b/arch/arm/mach-iop3xx/iq80321-pci.c index 79fea3d20b66..802f6d091b75 100644 --- a/arch/arm/mach-iop3xx/iq80321-pci.c +++ 
b/arch/arm/mach-iop3xx/iq80321-pci.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include diff --git a/arch/arm/mach-iop3xx/iq80331-pci.c b/arch/arm/mach-iop3xx/iq80331-pci.c index f37a0e26b466..654e450a1311 100644 --- a/arch/arm/mach-iop3xx/iq80331-pci.c +++ b/arch/arm/mach-iop3xx/iq80331-pci.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include diff --git a/arch/arm/mach-iop3xx/iq80332-pci.c b/arch/arm/mach-iop3xx/iq80332-pci.c index b9807aa2aade..65951ffe4631 100644 --- a/arch/arm/mach-iop3xx/iq80332-pci.c +++ b/arch/arm/mach-iop3xx/iq80332-pci.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include diff --git a/arch/arm/mach-pxa/generic.c b/arch/arm/mach-pxa/generic.c index 3248bc9b9495..9c0289333301 100644 --- a/arch/arm/mach-pxa/generic.c +++ b/arch/arm/mach-pxa/generic.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mach-sa1100/generic.c b/arch/arm/mach-sa1100/generic.c index f94b0fbcdcc8..83eba8b54816 100644 --- a/arch/arm/mach-sa1100/generic.c +++ b/arch/arm/mach-sa1100/generic.c @@ -17,6 +17,7 @@ #include #include #include +#include <linux/sched.h> /* just for sched_clock() - funny that */ #include #include diff --git a/arch/arm/mach-versatile/clock.c b/arch/arm/mach-versatile/clock.c index 48025c2b9987..b96a2ea15d41 100644 --- a/arch/arm/mach-versatile/clock.c +++ b/arch/arm/mach-versatile/clock.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/plat-omap/clock.c b/arch/arm/plat-omap/clock.c index 52a58b2da288..a020fe16428f 100644 --- a/arch/arm/plat-omap/clock.c +++ b/arch/arm/plat-omap/clock.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/cris/arch-v10/drivers/axisflashmap.c b/arch/cris/arch-v10/drivers/axisflashmap.c index 11ab3836aac6..56b038c8d482 100644 --- a/arch/cris/arch-v10/drivers/axisflashmap.c +++ b/arch/cris/arch-v10/drivers/axisflashmap.c @@ -140,6 +140,7 @@ #include #include #include +#include #include #include diff --git a/arch/cris/arch-v32/drivers/axisflashmap.c b/arch/cris/arch-v32/drivers/axisflashmap.c index 78ed52b1cdac..b679f983b90a 100644 --- a/arch/cris/arch-v32/drivers/axisflashmap.c +++ b/arch/cris/arch-v32/drivers/axisflashmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index b863815de78d..66ba8898db07 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -31,6 +31,7 @@ #include #include #include +#include <linux/sched.h> /* just for sched_clock() - funny that */ int have_rtc; /* used to remember if we have an RTC or not */; diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index 822c8ce9d1f1..caa9f7711343 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -32,6 +32,7 @@ #include #include #include +#include <linux/sched.h> /* current */ #include #include #include diff --git a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c index aa622d52c6e5..270f2188d68b 100644 --- a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c @@ -28,6 +28,7 @@ #include #include #include +#include <linux/sched.h> /* current / set_cpus_allowed() */ #include #include diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index 58ca98fdc2ca..2d5c9adba0cd 100644 ---
a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -32,6 +32,7 @@ #include #include #include +#include <linux/sched.h> /* for current / set_cpus_allowed() */ #include #include diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c index c397b6220430..1465974256c9 100644 --- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/sched.h> /* current */ #include #include diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index f0839334881c..4dc42a189ae5 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index 768c7e46957c..6ade3790ce07 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c @@ -2,6 +2,7 @@ #include #include #include +#include #include /* IBM Summit (EXA) Cyclone counter code*/ diff --git a/arch/m32r/lib/csum_partial_copy.c b/arch/m32r/lib/csum_partial_copy.c index ddb16a83a8ce..3d5f06145854 100644 --- a/arch/m32r/lib/csum_partial_copy.c +++ b/arch/m32r/lib/csum_partial_copy.c @@ -18,10 +18,10 @@ #include #include +#include #include #include -#include #include /* diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c index e1829a5d3b19..07631a97670b 100644 --- a/arch/mips/sgi-ip27/ip27-berr.c +++ b/arch/mips/sgi-ip27/ip27-berr.c @@ -10,6 +10,7 @@ #include #include #include +#include <linux/signal.h> /* for SIGBUS */ #include #include diff --git a/arch/ppc/syslib/of_device.c b/arch/ppc/syslib/of_device.c index 93c7231ea709..85b821251635 100644 --- a/arch/ppc/syslib/of_device.c +++ b/arch/ppc/syslib/of_device.c @@ -4,6 +4,8 @@ #include #include #include +#include + #include #include diff --git a/arch/ppc64/kernel/hvcserver.c b/arch/ppc64/kernel/hvcserver.c index bde8f42da854..4d584172055a 100644 --- a/arch/ppc64/kernel/hvcserver.c +++ b/arch/ppc64/kernel/hvcserver.c @@ -22,6 +22,8 @@ #include #include #include +#include + #include #include #include diff --git a/arch/ppc64/kernel/of_device.c b/arch/ppc64/kernel/of_device.c index 9f200f0f2ad5..3aabfd0d3dda 100644 --- a/arch/ppc64/kernel/of_device.c +++ b/arch/ppc64/kernel/of_device.c @@ -4,6 +4,8 @@ #include #include #include +#include + #include #include diff --git a/arch/ppc64/lib/locks.c b/arch/ppc64/lib/locks.c index 033643ab69e0..d622c1d58e4e 100644 --- a/arch/ppc64/lib/locks.c +++ b/arch/ppc64/lib/locks.c @@ -17,6 +17,8 @@ #include #include #include +#include + #include #include diff --git a/arch/sh/drivers/dma/dma-sysfs.c b/arch/sh/drivers/dma/dma-sysfs.c index 71a6d4e7809f..6e3b58bd8795 100644 --- a/arch/sh/drivers/dma/dma-sysfs.c +++ b/arch/sh/drivers/dma/dma-sysfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include static struct sysdev_class dma_sysclass = { diff --git a/arch/sh/kernel/cpufreq.c b/arch/sh/kernel/cpufreq.c index e0b384bef55f..47abf6e49dfb 100644 --- a/arch/sh/kernel/cpufreq.c +++ b/arch/sh/kernel/cpufreq.c @@ -20,6 +20,7 @@ #include #include #include +#include <linux/sched.h> /* set_cpus_allowed() */ #include #include diff --git a/arch/xtensa/kernel/platform.c b/arch/xtensa/kernel/platform.c index 03674daabc66..a17930747f20 100644 --- a/arch/xtensa/kernel/platform.c +++ b/arch/xtensa/kernel/platform.c @@ -18,6 +18,7 @@ #include #include #include +#include <linux/sched.h> /* HZ */ #define
_F(r,f,a,b) \ r __platform_##f a b; \ diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 26a3a4016115..161db4acfb91 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -37,6 +37,7 @@ #include #include #include +#include <linux/sched.h> /* need_resched() */ #include #include diff --git a/drivers/base/class.c b/drivers/base/class.c index c3e569730afe..db65fd0babe9 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "base.h" #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 75ce8711bca5..08d9cc99c7de 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "base.h" diff --git a/drivers/base/sys.c b/drivers/base/sys.c index 3431eb6004c3..66ed8f2fece5 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -21,6 +21,7 @@ #include #include #include +#include extern struct subsystem devices_subsys; diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index e183a3ef7839..ec27976a57da 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -28,13 +28,17 @@ through the array controller. Note in particular, neither physical nor logical disks are presented through the scsi layer. */ +#include +#include +#include +#include + +#include + #include #include #include #include -#include -#include -#include #include "cciss_scsi.h" diff --git a/drivers/block/paride/paride.c b/drivers/block/paride/paride.c index 1fef136c0e41..ce94aa11f6a7 100644 --- a/drivers/block/paride/paride.c +++ b/drivers/block/paride/paride.c @@ -29,6 +29,7 @@ #include #include #include +#include <linux/sched.h> /* TASK_* */ #ifdef CONFIG_PARPORT_MODULE #define CONFIG_PARPORT diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index 82f2d6d2eeef..6f5df0fad703 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -162,6 +162,8 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; #include #include #include +#include <linux/sched.h> /* current, TASK_* */ +#include #include diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 686c95573452..715ae5dc88fb 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -146,6 +146,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3}; #include #include #include +#include <linux/sched.h> /* current, TASK_*, schedule_timeout() */ #include diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 9c9c9c2247ce..b02fc2267159 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -7,6 +7,7 @@ #include #include #include +#include <asm/page.h> /* PAGE_SIZE */ #include "agp.h" #define ALI_AGPCTRL 0xb8 diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 0a7624a9b1c1..0e6c3a31d344 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -13,6 +13,7 @@ #include #include #include +#include <asm/page.h> /* PAGE_SIZE */ #include "agp.h" /* Will need to be increased if AMD64 ever goes >8-way.
*/ diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index e572ced9100a..0b6e72642d6e 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include "agp.h" diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 94943298c03e..a2d9e5e48bbe 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include "agp.h" diff --git a/drivers/char/agp/isoch.c b/drivers/char/agp/isoch.c index c9ac731504f2..40083241804e 100644 --- a/drivers/char/agp/isoch.c +++ b/drivers/char/agp/isoch.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "agp.h" diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index a9fb12c20eb7..71ea59a1dbeb 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "agp.h" diff --git a/drivers/char/drm/drm_sysfs.c b/drivers/char/drm/drm_sysfs.c index 475cc5e555e1..6d3449761914 100644 --- a/drivers/char/drm/drm_sysfs.c +++ b/drivers/char/drm/drm_sysfs.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include "drm_core.h" #include "drmP.h" diff --git a/drivers/char/mwave/3780i.c b/drivers/char/mwave/3780i.c index 613aed9e1840..d1fe05e83882 100644 --- a/drivers/char/mwave/3780i.c +++ b/drivers/char/mwave/3780i.c @@ -53,6 +53,8 @@ #include #include #include +#include <linux/sched.h> /* cond_resched() */ + #include #include #include diff --git a/drivers/char/watchdog/cpu5wdt.c b/drivers/char/watchdog/cpu5wdt.c index 2865dac0a813..e75045fe2641 100644 --- a/drivers/char/watchdog/cpu5wdt.c +++ b/drivers/char/watchdog/cpu5wdt.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/watchdog/mixcomwd.c b/drivers/char/watchdog/mixcomwd.c index 7fc2188386d9..d8dede575402 100644 --- a/drivers/char/watchdog/mixcomwd.c +++ b/drivers/char/watchdog/mixcomwd.c @@ -45,6 +45,8 @@ #include #include #include +#include +#include #include #include diff --git a/drivers/char/watchdog/pcwd.c b/drivers/char/watchdog/pcwd.c index 427ad51b7a35..37c9e13ad3ac 100644 --- a/drivers/char/watchdog/pcwd.c +++ b/drivers/char/watchdog/pcwd.c @@ -66,7 +66,7 @@ #include #include #include - +#include <linux/sched.h> /* TASK_INTERRUPTIBLE, set_current_state() and friends */ #include #include diff --git a/drivers/char/watchdog/sc520_wdt.c b/drivers/char/watchdog/sc520_wdt.c index 72501be79b0c..4ee9974ad8cb 100644 --- a/drivers/char/watchdog/sc520_wdt.c +++ b/drivers/char/watchdog/sc520_wdt.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/watchdog/softdog.c b/drivers/char/watchdog/softdog.c index 20e5eb8667f2..a91edaf3a350 100644 --- a/drivers/char/watchdog/softdog.c +++ b/drivers/char/watchdog/softdog.c @@ -47,6 +47,8 @@ #include #include #include +#include + #include #define PFX "SoftDog: " diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index f014e639088c..c57a3871184c 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -38,6 +38,7 @@ #include #include #include +#include <linux/workqueue.h> /* INIT_WORK, schedule_work(), flush_scheduled_work() */ #include diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 89ce9dc210d4..acda7d63d6fe 100644 --- a/drivers/infiniband/core/sa_query.c +++ 
b/drivers/infiniband/core/sa_query.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c index 889e85096736..22fdc446f25c 100644 --- a/drivers/infiniband/hw/mthca/mthca_av.c +++ b/drivers/infiniband/hw/mthca/mthca_av.c @@ -34,6 +34,8 @@ */ #include +#include +#include #include #include diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 8561b297a19b..1229c604c6e0 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -34,6 +34,9 @@ * $Id: mthca_mad.c 1349 2004-12-16 21:09:43Z roland $ */ +#include +#include + #include #include #include diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c index b47ea7daf088..2fc449da418d 100644 --- a/drivers/infiniband/hw/mthca/mthca_mcg.c +++ b/drivers/infiniband/hw/mthca/mthca_mcg.c @@ -33,6 +33,8 @@ */ #include +#include +#include #include "mthca_dev.h" #include "mthca_cmd.h" diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c index 0576056b34f4..bd1338682074 100644 --- a/drivers/infiniband/hw/mthca/mthca_profile.c +++ b/drivers/infiniband/hw/mthca/mthca_profile.c @@ -35,6 +35,8 @@ #include #include +#include +#include #include "mthca_profile.h" diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 62ff091505da..7c9afde5ace5 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -36,6 +36,8 @@ */ #include +#include +#include #include #include diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c index 4f995391dd1d..df5e494a9d38 100644 --- a/drivers/infiniband/hw/mthca/mthca_reset.c +++ b/drivers/infiniband/hw/mthca/mthca_reset.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "mthca_dev.h" #include "mthca_cmd.h" diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c index 1c8791ded6ff..8e9219842be4 100644 --- a/drivers/infiniband/hw/mthca/mthca_uar.c +++ b/drivers/infiniband/hw/mthca/mthca_uar.c @@ -32,6 +32,8 @@ * $Id$ */ +#include <asm/page.h> /* PAGE_SHIFT */ + #include "mthca_dev.h" #include "mthca_memfree.h" diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index ab09cf4093e3..0506934244f0 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -21,6 +21,7 @@ #include #include #include +#include <linux/sched.h> /* HZ */ /*#include */ diff --git a/drivers/input/joystick/a3d.c b/drivers/input/joystick/a3d.c index bf65430181fa..4571ea3a4b92 100644 --- a/drivers/input/joystick/a3d.c +++ b/drivers/input/joystick/a3d.c @@ -34,6 +34,7 @@ #include #include #include +#include #define DRIVER_DESC "FP-Gaming Assasin 3D joystick driver" diff --git a/drivers/input/joystick/adi.c b/drivers/input/joystick/adi.c index 9d95459f4bcb..704bf70f1db7 100644 --- a/drivers/input/joystick/adi.c +++ b/drivers/input/joystick/adi.c @@ -34,6 +34,7 @@ #include #include #include +#include #define DRIVER_DESC "Logitech ADI joystick family driver" diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index c75ac6eb1ffb..3121961e3e7c 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #define DRIVER_DESC "Analog joystick and gamepad driver"
diff --git a/drivers/input/joystick/cobra.c b/drivers/input/joystick/cobra.c index 9a3dfc724a41..1909f7ef340c 100644 --- a/drivers/input/joystick/cobra.c +++ b/drivers/input/joystick/cobra.c @@ -34,6 +34,7 @@ #include #include #include +#include #define DRIVER_DESC "Creative Labs Blaster GamePad Cobra driver" diff --git a/drivers/input/joystick/gf2k.c b/drivers/input/joystick/gf2k.c index e151f8c5bcb9..8a3ad455eb38 100644 --- a/drivers/input/joystick/gf2k.c +++ b/drivers/input/joystick/gf2k.c @@ -35,6 +35,7 @@ #include #include #include +#include #define DRIVER_DESC "Genius Flight 2000 joystick driver" diff --git a/drivers/input/joystick/grip.c b/drivers/input/joystick/grip.c index e206bb56e53c..a936e7aedb10 100644 --- a/drivers/input/joystick/grip.c +++ b/drivers/input/joystick/grip.c @@ -34,6 +34,7 @@ #include #include #include +#include #define DRIVER_DESC "Gravis GrIP protocol joystick driver" diff --git a/drivers/input/joystick/grip_mp.c b/drivers/input/joystick/grip_mp.c index a0ba93ccac72..51a912222e85 100644 --- a/drivers/input/joystick/grip_mp.c +++ b/drivers/input/joystick/grip_mp.c @@ -19,6 +19,7 @@ #include #include #include +#include #define DRIVER_DESC "Gravis Grip Multiport driver" diff --git a/drivers/input/joystick/guillemot.c b/drivers/input/joystick/guillemot.c index c528473c09d8..6e2c721c26ba 100644 --- a/drivers/input/joystick/guillemot.c +++ b/drivers/input/joystick/guillemot.c @@ -35,6 +35,7 @@ #include #include #include +#include #define DRIVER_DESC "Guillemot Digital joystick driver" diff --git a/drivers/input/joystick/interact.c b/drivers/input/joystick/interact.c index 8511ee7bb263..c4ed01758226 100644 --- a/drivers/input/joystick/interact.c +++ b/drivers/input/joystick/interact.c @@ -38,6 +38,7 @@ #include #include #include +#include #define DRIVER_DESC "InterAct digital joystick driver" diff --git a/drivers/input/joystick/joydump.c b/drivers/input/joystick/joydump.c index 4234ccaf9146..88ec5a918f2e 100644 --- a/drivers/input/joystick/joydump.c +++ b/drivers/input/joystick/joydump.c @@ -34,6 +34,7 @@ #include #include #include +#include #define DRIVER_DESC "Gameport data dumper module" diff --git a/drivers/input/joystick/sidewinder.c b/drivers/input/joystick/sidewinder.c index eaaad45cc750..78dd163cd702 100644 --- a/drivers/input/joystick/sidewinder.c +++ b/drivers/input/joystick/sidewinder.c @@ -33,6 +33,7 @@ #include #include #include +#include #define DRIVER_DESC "Microsoft SideWinder joystick family driver" diff --git a/drivers/input/joystick/tmdc.c b/drivers/input/joystick/tmdc.c index 3a7d1bb46472..60e2aac7d06e 100644 --- a/drivers/input/joystick/tmdc.c +++ b/drivers/input/joystick/tmdc.c @@ -38,6 +38,7 @@ #include #include #include +#include #define DRIVER_DESC "ThrustMaster DirectConnect joystick driver" diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c index e3c44ffae674..1c9426fd5205 100644 --- a/drivers/input/serio/hp_sdc_mlc.c +++ b/drivers/input/serio/hp_sdc_mlc.c @@ -40,6 +40,7 @@ #include #include #include +#include #define PREFIX "HP SDC MLC: " diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c index 3abd7fc6e5ef..7b564c0dd996 100644 --- a/drivers/isdn/capi/capifs.c +++ b/drivers/isdn/capi/capifs.c @@ -15,6 +15,7 @@ #include #include #include +#include <linux/sched.h> /* current */ MODULE_DESCRIPTION("CAPI4Linux: /dev/capi/ filesystem"); MODULE_AUTHOR("Carsten Paeth"); diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index 1ee003346923..c34c96d18907 100644 ---
a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -17,6 +17,8 @@ #include #include #include +#include + #include #include #include diff --git a/drivers/mca/mca-device.c b/drivers/mca/mca-device.c index 76d430aa243f..e7adf89fae41 100644 --- a/drivers/mca/mca-device.c +++ b/drivers/mca/mca-device.c @@ -29,6 +29,7 @@ #include #include #include +#include /** * mca_device_read_stored_pos - read POS register from stored data diff --git a/drivers/media/common/ir-common.c b/drivers/media/common/ir-common.c index 06f4d4686a6c..31fccb4f05d6 100644 --- a/drivers/media/common/ir-common.c +++ b/drivers/media/common/ir-common.c @@ -22,6 +22,7 @@ #include #include +#include #include /* -------------------------------------------------------------------------- */ diff --git a/drivers/media/dvb/dvb-core/dvb_ca_en50221.c b/drivers/media/dvb/dvb-core/dvb_ca_en50221.c index 88757e2634e5..2aa767f9bd7d 100644 --- a/drivers/media/dvb/dvb-core/dvb_ca_en50221.c +++ b/drivers/media/dvb/dvb-core/dvb_ca_en50221.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "dvb_ca_en50221.h" #include "dvb_ringbuffer.h" diff --git a/drivers/media/dvb/frontends/bcm3510.c b/drivers/media/dvb/frontends/bcm3510.c index f5fdc5c3e605..f6d4ee78bdd4 100644 --- a/drivers/media/dvb/frontends/bcm3510.c +++ b/drivers/media/dvb/frontends/bcm3510.c @@ -36,6 +36,9 @@ #include #include #include +#include +#include +#include #include "dvb_frontend.h" #include "bcm3510.h" diff --git a/drivers/media/dvb/frontends/dib3000mb.c b/drivers/media/dvb/frontends/dib3000mb.c index 21433e1831e7..6b0553608610 100644 --- a/drivers/media/dvb/frontends/dib3000mb.c +++ b/drivers/media/dvb/frontends/dib3000mb.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include "dib3000-common.h" #include "dib3000mb_priv.h" diff --git a/drivers/media/dvb/frontends/dib3000mc.c b/drivers/media/dvb/frontends/dib3000mc.c index 441de665fec3..c024fad17337 100644 --- a/drivers/media/dvb/frontends/dib3000mc.c +++ b/drivers/media/dvb/frontends/dib3000mc.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include "dib3000-common.h" #include "dib3000mc_priv.h" diff --git a/drivers/media/dvb/frontends/dvb_dummy_fe.c b/drivers/media/dvb/frontends/dvb_dummy_fe.c index cff93b9d8ab2..794be520d590 100644 --- a/drivers/media/dvb/frontends/dvb_dummy_fe.c +++ b/drivers/media/dvb/frontends/dvb_dummy_fe.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include "dvb_frontend.h" #include "dvb_dummy_fe.h" diff --git a/drivers/media/dvb/frontends/lgdt330x.c b/drivers/media/dvb/frontends/lgdt330x.c index 7142b9c51dd2..8dde72bd1046 100644 --- a/drivers/media/dvb/frontends/lgdt330x.c +++ b/drivers/media/dvb/frontends/lgdt330x.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb/frontends/mt312.c b/drivers/media/dvb/frontends/mt312.c index e455aecd76b2..e38454901dd1 100644 --- a/drivers/media/dvb/frontends/mt312.c +++ b/drivers/media/dvb/frontends/mt312.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include "dvb_frontend.h" #include "mt312_priv.h" diff --git a/drivers/media/dvb/frontends/mt352.c b/drivers/media/dvb/frontends/mt352.c index cc1bc0edd65e..f0c610f2c2df 100644 --- a/drivers/media/dvb/frontends/mt352.c +++ b/drivers/media/dvb/frontends/mt352.c @@ -35,6 +35,8 @@ #include #include #include +#include +#include #include "dvb_frontend.h" #include "mt352_priv.h" diff --git a/drivers/media/dvb/frontends/nxt2002.c 
b/drivers/media/dvb/frontends/nxt2002.c index 35a1d60f1927..30786b1911bd 100644 --- a/drivers/media/dvb/frontends/nxt2002.c +++ b/drivers/media/dvb/frontends/nxt2002.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include "dvb_frontend.h" #include "nxt2002.h" diff --git a/drivers/media/dvb/frontends/or51132.c b/drivers/media/dvb/frontends/or51132.c index b6d0eecc59eb..817b044c7fd1 100644 --- a/drivers/media/dvb/frontends/or51132.c +++ b/drivers/media/dvb/frontends/or51132.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb/frontends/or51211.c b/drivers/media/dvb/frontends/or51211.c index ad56a9958404..8a9db23dd1b7 100644 --- a/drivers/media/dvb/frontends/or51211.c +++ b/drivers/media/dvb/frontends/or51211.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb/frontends/s5h1420.c b/drivers/media/dvb/frontends/s5h1420.c index c7fe27fd530c..f265418e3261 100644 --- a/drivers/media/dvb/frontends/s5h1420.c +++ b/drivers/media/dvb/frontends/s5h1420.c @@ -26,6 +26,8 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. #include #include #include +#include +#include #include "dvb_frontend.h" #include "s5h1420.h" diff --git a/drivers/media/dvb/frontends/sp8870.c b/drivers/media/dvb/frontends/sp8870.c index 764a95a2e212..1c6b2e9264bc 100644 --- a/drivers/media/dvb/frontends/sp8870.c +++ b/drivers/media/dvb/frontends/sp8870.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include "dvb_frontend.h" #include "sp8870.h" diff --git a/drivers/media/dvb/frontends/sp887x.c b/drivers/media/dvb/frontends/sp887x.c index d868a6927a16..73384e75625e 100644 --- a/drivers/media/dvb/frontends/sp887x.c +++ b/drivers/media/dvb/frontends/sp887x.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "dvb_frontend.h" #include "sp887x.h" diff --git a/drivers/media/dvb/frontends/stv0297.c b/drivers/media/dvb/frontends/stv0297.c index 8d09afd7545d..6122ba754bc5 100644 --- a/drivers/media/dvb/frontends/stv0297.c +++ b/drivers/media/dvb/frontends/stv0297.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include "dvb_frontend.h" #include "stv0297.h" diff --git a/drivers/media/dvb/frontends/stv0299.c b/drivers/media/dvb/frontends/stv0299.c index 2d62931f20b5..889d9257215d 100644 --- a/drivers/media/dvb/frontends/stv0299.c +++ b/drivers/media/dvb/frontends/stv0299.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb/frontends/tda1004x.c b/drivers/media/dvb/frontends/tda1004x.c index 74cea9f8d721..3529c618f828 100644 --- a/drivers/media/dvb/frontends/tda1004x.c +++ b/drivers/media/dvb/frontends/tda1004x.c @@ -32,6 +32,10 @@ #include #include #include +#include +#include +#include + #include "dvb_frontend.h" #include "tda1004x.h" diff --git a/drivers/media/dvb/frontends/tda8083.c b/drivers/media/dvb/frontends/tda8083.c index 168e013d23bd..c05cf1861051 100644 --- a/drivers/media/dvb/frontends/tda8083.c +++ b/drivers/media/dvb/frontends/tda8083.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "dvb_frontend.h" #include "tda8083.h" diff --git a/drivers/media/radio/miropcm20-rds.c b/drivers/media/radio/miropcm20-rds.c index df79d5e0aaed..e09214082e01 100644 --- a/drivers/media/radio/miropcm20-rds.c +++ b/drivers/media/radio/miropcm20-rds.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/sched.h> /* current, TASK_*,
schedule_timeout() */ #include #include #include "miropcm20-rds-core.h" diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c index d9879965eb50..8eb50cdb8ae1 100644 --- a/drivers/message/i2o/device.c +++ b/drivers/message/i2o/device.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include "core.h" /** diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c index 0079a4be0af2..0fb9c4e2ad4c 100644 --- a/drivers/message/i2o/driver.c +++ b/drivers/message/i2o/driver.c @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #include "core.h" #define OSM_NAME "i2o" diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c index bda2c62648ba..b675b4ebbebd 100644 --- a/drivers/message/i2o/exec-osm.c +++ b/drivers/message/i2o/exec-osm.c @@ -30,6 +30,10 @@ #include #include #include +#include +#include +#include +#include /* HZ */ #include "core.h" #define OSM_NAME "exec-osm" diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c index 361da8d1d5e7..61b837de4b6a 100644 --- a/drivers/message/i2o/iop.c +++ b/drivers/message/i2o/iop.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "core.h" #define OSM_NAME "i2o" diff --git a/drivers/mtd/chips/jedec.c b/drivers/mtd/chips/jedec.c index 62d235a9a4e2..4f6778f3ee3e 100644 --- a/drivers/mtd/chips/jedec.c +++ b/drivers/mtd/chips/jedec.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/mtd/devices/lart.c b/drivers/mtd/devices/lart.c index dfd335e4a2a8..df987a53ed9c 100644 --- a/drivers/mtd/devices/lart.c +++ b/drivers/mtd/devices/lart.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #ifdef HAVE_PARTITIONS #include diff --git a/drivers/mtd/devices/phram.c b/drivers/mtd/devices/phram.c index a423a382095a..765c0179c8df 100644 --- a/drivers/mtd/devices/phram.c +++ b/drivers/mtd/devices/phram.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #define ERROR(fmt, args...) printk(KERN_ERR "phram: " fmt , ## args) diff --git a/drivers/mtd/maps/bast-flash.c b/drivers/mtd/maps/bast-flash.c index 0ba0ff7d43b9..63104c73ca3c 100644 --- a/drivers/mtd/maps/bast-flash.c +++ b/drivers/mtd/maps/bast-flash.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/maps/ceiva.c b/drivers/mtd/maps/ceiva.c index da8584a662f4..c68b31dc7e6d 100644 --- a/drivers/mtd/maps/ceiva.c +++ b/drivers/mtd/maps/ceiva.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/maps/dc21285.c b/drivers/mtd/maps/dc21285.c index 938c41f2f056..e5b74169fde6 100644 --- a/drivers/mtd/maps/dc21285.c +++ b/drivers/mtd/maps/dc21285.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/maps/dilnetpc.c b/drivers/mtd/maps/dilnetpc.c index 0bc79c93a584..f99519692cb7 100644 --- a/drivers/mtd/maps/dilnetpc.c +++ b/drivers/mtd/maps/dilnetpc.c @@ -30,12 +30,15 @@ #include #include #include -#include +#include + #include #include #include #include +#include + /* ** The DIL/NetPC keeps its BIOS in two distinct flash blocks. 
** Destroying any of these blocks transforms the DNPC into diff --git a/drivers/mtd/maps/epxa10db-flash.c b/drivers/mtd/maps/epxa10db-flash.c index ab6dbe2b8cce..1df6188926b3 100644 --- a/drivers/mtd/maps/epxa10db-flash.c +++ b/drivers/mtd/maps/epxa10db-flash.c @@ -27,12 +27,15 @@ #include #include #include -#include +#include + #include #include #include +#include #include + #ifdef CONFIG_EPXA10DB #define BOARD_NAME "EPXA10DB" #else diff --git a/drivers/mtd/maps/fortunet.c b/drivers/mtd/maps/fortunet.c index 068bb6a54520..00f7bbe5479e 100644 --- a/drivers/mtd/maps/fortunet.c +++ b/drivers/mtd/maps/fortunet.c @@ -7,11 +7,14 @@ #include #include #include -#include +#include + #include #include #include +#include + #define MAX_NUM_REGIONS 4 #define MAX_NUM_PARTITIONS 8 diff --git a/drivers/mtd/maps/ixp2000.c b/drivers/mtd/maps/ixp2000.c index a9f86c7fbd52..1e5d6e1d05f3 100644 --- a/drivers/mtd/maps/ixp2000.c +++ b/drivers/mtd/maps/ixp2000.c @@ -22,11 +22,13 @@ #include #include #include +#include +#include +#include + #include #include #include -#include -#include #include #include diff --git a/drivers/mtd/maps/ixp4xx.c b/drivers/mtd/maps/ixp4xx.c index 3fcc32884074..da316e543237 100644 --- a/drivers/mtd/maps/ixp4xx.c +++ b/drivers/mtd/maps/ixp4xx.c @@ -20,11 +20,14 @@ #include #include #include +#include +#include +#include + #include #include #include -#include -#include + #include #include diff --git a/drivers/mtd/maps/lubbock-flash.c b/drivers/mtd/maps/lubbock-flash.c index 1298de475c9a..2337e0c46750 100644 --- a/drivers/mtd/maps/lubbock-flash.c +++ b/drivers/mtd/maps/lubbock-flash.c @@ -15,10 +15,13 @@ #include #include #include +#include + #include #include #include #include + #include #include #include diff --git a/drivers/mtd/maps/mainstone-flash.c b/drivers/mtd/maps/mainstone-flash.c index 87e93fa60588..da0f8a692628 100644 --- a/drivers/mtd/maps/mainstone-flash.c +++ b/drivers/mtd/maps/mainstone-flash.c @@ -16,9 +16,12 @@ #include #include #include +#include + #include #include #include + #include #include #include diff --git a/drivers/mtd/maps/omap-toto-flash.c b/drivers/mtd/maps/omap-toto-flash.c index 496109071cb1..da36e8dddd17 100644 --- a/drivers/mtd/maps/omap-toto-flash.c +++ b/drivers/mtd/maps/omap-toto-flash.c @@ -12,9 +12,9 @@ #include #include #include - #include #include +#include #include #include diff --git a/drivers/mtd/maps/omap_nor.c b/drivers/mtd/maps/omap_nor.c index b17bca657daf..fa84566245a7 100644 --- a/drivers/mtd/maps/omap_nor.c +++ b/drivers/mtd/maps/omap_nor.c @@ -36,6 +36,8 @@ #include #include #include +#include + #include #include #include diff --git a/drivers/mtd/maps/pci.c b/drivers/mtd/maps/pci.c index 18dbd3af1eaa..d9c64e99ee32 100644 --- a/drivers/mtd/maps/pci.c +++ b/drivers/mtd/maps/pci.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/maps/plat-ram.c b/drivers/mtd/maps/plat-ram.c index 118b04544cad..a0577ea00c3c 100644 --- a/drivers/mtd/maps/plat-ram.c +++ b/drivers/mtd/maps/plat-ram.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/maps/tqm8xxl.c b/drivers/mtd/maps/tqm8xxl.c index 995e9991cb8d..4e28b977f224 100644 --- a/drivers/mtd/maps/tqm8xxl.c +++ b/drivers/mtd/maps/tqm8xxl.c @@ -27,12 +27,14 @@ #include #include #include -#include +#include #include #include #include +#include + #define FLASH_ADDR 0x40000000 #define FLASH_SIZE 0x00800000 #define FLASH_BANK_MAX 4 diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c index 
b7c32c242bc7..400dd9c89883 100644 --- a/drivers/mtd/mtdblock.c +++ b/drivers/mtd/mtdblock.c @@ -15,6 +15,7 @@ #include #include #include +#include <linux/sched.h> /* TASK_* */ #include #include diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index c534fd5d95cb..16df1e4fb0e9 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -13,6 +13,7 @@ #include #include #include +#include <linux/sched.h> /* TASK_* */ #include #include diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 8f66d093c80d..f3e65af33a9c 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -14,7 +14,7 @@ #include #include #include - +#include <linux/sched.h> /* TASK_* */ #include #include diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c index b47ebcb31e0f..b58ba236a9eb 100644 --- a/drivers/mtd/nand/s3c2410.c +++ b/drivers/mtd/nand/s3c2410.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include diff --git a/drivers/pci/hotplug/cpcihp_generic.c b/drivers/pci/hotplug/cpcihp_generic.c index a62a4345b466..2d4639d6841f 100644 --- a/drivers/pci/hotplug/cpcihp_generic.c +++ b/drivers/pci/hotplug/cpcihp_generic.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "cpci_hotplug.h" #define DRIVER_VERSION "0.1" diff --git a/drivers/pci/hotplug/cpcihp_zt5550.c b/drivers/pci/hotplug/cpcihp_zt5550.c index 790abadd816c..f7cb00da38df 100644 --- a/drivers/pci/hotplug/cpcihp_zt5550.c +++ b/drivers/pci/hotplug/cpcihp_zt5550.c @@ -36,6 +36,7 @@ #include #include #include +#include <linux/signal.h> /* SA_SHIRQ */ #include "cpci_hotplug.h" #include "cpcihp_zt5550.h" diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c index 8e47fa66e25e..060d74775d7b 100644 --- a/drivers/pci/hotplug/fakephp.c +++ b/drivers/pci/hotplug/fakephp.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include "pci_hotplug.h" #include "../pci.h" diff --git a/drivers/pci/hotplug/pciehprm_nonacpi.c b/drivers/pci/hotplug/pciehprm_nonacpi.c index 3622965f8961..33b2c69a0829 100644 --- a/drivers/pci/hotplug/pciehprm_nonacpi.c +++ b/drivers/pci/hotplug/pciehprm_nonacpi.c @@ -33,10 +33,13 @@ #include #include #include +#include + #include #ifdef CONFIG_IA64 #include #endif + #include "pciehp.h" #include "pciehprm.h" #include "pciehprm_nonacpi.h" diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index ad1017da8656..fcb66b9a0e28 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -16,10 +16,13 @@ */ #include #include +#include + #include #include #include #include + #include "../pci.h" #include "rpaphp.h" #include "rpadlpar.h" diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c index 46c157d26a2f..f7c12d7dfcfc 100644 --- a/drivers/pci/hotplug/rpaphp_pci.c +++ b/drivers/pci/hotplug/rpaphp_pci.c @@ -23,11 +23,13 @@ * */ #include +#include + #include #include #include -#include "../pci.h" /* for pci_add_new_bus */ +#include "../pci.h" /* for pci_add_new_bus */ #include "rpaphp.h" static struct pci_bus *find_bus_among_children(struct pci_bus *bus, diff --git a/drivers/pci/hotplug/rpaphp_slot.c b/drivers/pci/hotplug/rpaphp_slot.c index 0e8815495083..daa89ae57123 100644 --- a/drivers/pci/hotplug/rpaphp_slot.c +++ b/drivers/pci/hotplug/rpaphp_slot.c @@ -27,6 +27,9 @@ #include #include #include +#include +#include + #include #include "rpaphp.h" diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h index abe2cf411e68..08ad26a0cae7 100644 --- a/drivers/pci/hotplug/shpchp.h +++ 
b/drivers/pci/hotplug/shpchp.h @@ -32,6 +32,8 @@ #include #include #include +#include <linux/sched.h> /* signal_pending(), struct timer_list */ + #include "pci_hotplug.h" #if !defined(MODULE) diff --git a/drivers/pci/hotplug/shpchprm_nonacpi.c b/drivers/pci/hotplug/shpchprm_nonacpi.c index d70fe5408417..c6b40998eeb3 100644 --- a/drivers/pci/hotplug/shpchprm_nonacpi.c +++ b/drivers/pci/hotplug/shpchprm_nonacpi.c @@ -32,6 +32,8 @@ #include #include #include +#include + #include "shpchp.h" int shpchprm_get_physical_slot_number(struct controller *ctrl, u32 *sun, u8 busnum, u8 devnum) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 8972e6a3aac0..ae986e590b48 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include "pci.h" /* diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 61b855c99e39..e74d75843047 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -15,6 +15,7 @@ #include #include #include +#include #include <asm/dma.h> /* isa_dma_bridge_buggy */ #include "pci.h" diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 14f05d22bb70..467a4ceccf10 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include "portdrv.h" diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 3c565ce7f77b..02260141dc81 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "portdrv.h" diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c index 49bd21702314..598a115cd00e 100644 --- a/drivers/pci/rom.c +++ b/drivers/pci/rom.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "pci.h" diff --git a/drivers/pnp/manager.c b/drivers/pnp/manager.c index 94442ffd4aed..cbb2749db178 100644 --- a/drivers/pnp/manager.c +++ b/drivers/pnp/manager.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "base.h" DECLARE_MUTEX(pnp_res_mutex); diff --git a/drivers/pnp/pnpbios/rsparser.c b/drivers/pnp/pnpbios/rsparser.c index b0ca65b68645..5e38cd7335f7 100644 --- a/drivers/pnp/pnpbios/rsparser.c +++ b/drivers/pnp/pnpbios/rsparser.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #ifdef CONFIG_PCI #include diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c index 8cc4f1a940dc..c05b069c2996 100644 --- a/drivers/s390/cio/cmf.c +++ b/drivers/s390/cio/cmf.c @@ -30,10 +30,13 @@ #include #include #include +#include +#include /* get_clock() */ #include #include #include +#include #include "cio.h" #include "css.h" diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 1c2659766c09..811c9d150637 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -22,6 +22,7 @@ #include #include +#include <linux/sched.h> /* HZ */ #include "cio.h" #include "css.h" diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index fbe4202a3f6f..c1c89f4fd4e3 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 771e97ef136e..b856e140e65f 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -26,6 +26,7 @@ */ #include #include +#include <linux/sched.h> /* workqueue stuff, HZ */ #include #include #include diff --git 
a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 8bb8222ea589..d2caa35059d9 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -19,6 +19,9 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include +#include +#include + #include #include #include diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c index e753ba27dc59..a1a58e1d5ad3 100644 --- a/drivers/scsi/sym53c8xx_2/sym_hipd.c +++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c @@ -37,6 +37,9 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include + #include "sym_glue.h" #include "sym_nvram.h" diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.h b/drivers/scsi/sym53c8xx_2/sym_hipd.h index 3131a6bf7ab7..3a264a408216 100644 --- a/drivers/scsi/sym53c8xx_2/sym_hipd.h +++ b/drivers/scsi/sym53c8xx_2/sym_hipd.h @@ -37,6 +37,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include + #ifndef SYM_HIPD_H #define SYM_HIPD_H diff --git a/drivers/sh/superhyway/superhyway.c b/drivers/sh/superhyway/superhyway.c index f056276b08a1..28757cb9d246 100644 --- a/drivers/sh/superhyway/superhyway.c +++ b/drivers/sh/superhyway/superhyway.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include static int superhyway_devices; diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c index 45efeed1fcc3..49815ec4b842 100644 --- a/drivers/usb/host/ohci-omap.c +++ b/drivers/usb/host/ohci-omap.c @@ -14,6 +14,9 @@ * This file is licenced under the GPL. */ +#include <linux/signal.h> /* SA_INTERRUPT */ +#include + #include #include #include diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c index bf1d5ab4aa3a..7ce1d9ef0289 100644 --- a/drivers/usb/host/ohci-pci.c +++ b/drivers/usb/host/ohci-pci.c @@ -14,6 +14,8 @@ * This file is licenced under the GPL.
*/ +#include + #ifdef CONFIG_PPC_PMAC #include #include diff --git a/drivers/usb/host/ohci-pxa27x.c b/drivers/usb/host/ohci-pxa27x.c index d287dcccd415..f4a4aeda40b7 100644 --- a/drivers/usb/host/ohci-pxa27x.c +++ b/drivers/usb/host/ohci-pxa27x.c @@ -20,6 +20,7 @@ */ #include +#include #include #include #include diff --git a/drivers/w1/w1_family.c b/drivers/w1/w1_family.c index 88c517a4c178..9e293e139a0e 100644 --- a/drivers/w1/w1_family.c +++ b/drivers/w1/w1_family.c @@ -21,6 +21,7 @@ #include #include +#include <linux/sched.h> /* schedule_timeout() */ #include #include "w1_family.h" diff --git a/drivers/zorro/zorro-sysfs.c b/drivers/zorro/zorro-sysfs.c index 04ca8840acf1..87c29d7b6c17 100644 --- a/drivers/zorro/zorro-sysfs.c +++ b/drivers/zorro/zorro-sysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "zorro.h" diff --git a/drivers/zorro/zorro.c b/drivers/zorro/zorro.c index d3c05dfe20d2..0f2b40605b06 100644 --- a/drivers/zorro/zorro.c +++ b/drivers/zorro/zorro.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include diff --git a/fs/filesystems.c b/fs/filesystems.c index 44082bfdfec9..9f1072836c8e 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -12,6 +12,7 @@ #include #include #include +#include <linux/sched.h> /* for 'current' */ #include /* diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 0f224384f176..8210ac16a368 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "nodelist.h" diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 996d922e503e..316133c626b7 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -18,6 +18,8 @@ #include #include #include +#include + #include "nodelist.h" /* For testing write failures */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index ff7f80f48df1..d068176b7ad7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -23,6 +23,7 @@ #include #include #include +#include #define CPUFREQ_NAME_LEN 16 diff --git a/include/linux/gameport.h b/include/linux/gameport.h index cd623eccdbea..2401dea2b867 100644 --- a/include/linux/gameport.h +++ b/include/linux/gameport.h @@ -12,6 +12,7 @@ #include #include #include +#include struct gameport { diff --git a/include/linux/i2c.h b/include/linux/i2c.h index f88577ca3b3a..5e19a7ba69b2 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -31,6 +31,7 @@ #include #include #include <linux/device.h> /* for struct device */ +#include <linux/sched.h> /* for completion */ #include /* --- For i2c-isa ---------------------------------------------------- */ diff --git a/include/linux/i2o.h b/include/linux/i2o.h index 92300325dbcd..d79c8a4bc4f8 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -25,10 +25,14 @@ /* How many different OSM's are we allowing */ #define I2O_MAX_DRIVERS 8 -#include -#include <asm/semaphore.h> /* Needed for MUTEX init macros */ #include #include +#include +#include +#include <linux/workqueue.h> /* work_struct */ + +#include +#include <asm/semaphore.h> /* Needed for MUTEX init macros */ /* message queue empty */ #define I2O_QUEUE_EMPTY 0xffffffff diff --git a/include/linux/kobj_map.h b/include/linux/kobj_map.h index b6cc10bf8dfc..cbe7d8008042 100644 --- a/include/linux/kobj_map.h +++ b/include/linux/kobj_map.h @@ -1,5 +1,7 @@ #ifdef __KERNEL__ +#include + typedef struct kobject *kobj_probe_t(dev_t, int *, void *); struct kobj_map; diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index 142963f01d29..fc28841f3409 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -8,7 +8,10 @@ #include #include #include
+#include + #include + #include #include #include diff --git a/include/linux/serial.h b/include/linux/serial.h index 12cd9cf65e8f..33fc8cb8ddfb 100644 --- a/include/linux/serial.h +++ b/include/linux/serial.h @@ -11,6 +11,7 @@ #define _LINUX_SERIAL_H #ifdef __KERNEL__ +#include #include /* diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h index fc5bb4e91a58..7dac8f04d28e 100644 --- a/include/linux/textsearch.h +++ b/include/linux/textsearch.h @@ -8,6 +8,7 @@ #include #include #include +#include struct ts_config; diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h index c8592c7e8eaa..e788bbc5657d 100644 --- a/include/pcmcia/ss.h +++ b/include/pcmcia/ss.h @@ -17,6 +17,7 @@ #include #include +#include <linux/sched.h> /* task_struct, completion */ #include #include diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index e6b61fab66dd..7529f4388bb4 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -4,6 +4,7 @@ #include #include #include +#include struct request; struct scatterlist; diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h index b0d445437372..c04405bead2d 100644 --- a/include/scsi/scsi_transport_fc.h +++ b/include/scsi/scsi_transport_fc.h @@ -28,6 +28,7 @@ #define SCSI_TRANSPORT_FC_H #include +#include struct scsi_transport_template; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b57..39277dd6bf90 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -18,6 +18,7 @@ #include #include #include +#include <linux/sched.h> /* for cond_resched */ #include #include diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f3ea492ab44d..ce4915dd683a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/params.c b/kernel/params.c index 1a8614bac5d5..47ba69547945 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk diff --git a/lib/kobject.c b/lib/kobject.c index 253d3004ace9..a181abed89f6 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -14,6 +14,7 @@ #include #include #include +#include /** * populate_dir - populate directory with attributes. diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 42c08ef828c5..eddc9b3d3876 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -5,6 +5,7 @@ */ #include #include +#include unsigned int debug_smp_processor_id(void) { diff --git a/lib/sort.c b/lib/sort.c index ddc4d35df289..5f3b51ffa1dc 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -7,6 +7,7 @@ #include #include #include +#include static void u32_swap(void *a, void *b, int size) { diff --git a/lib/vsprintf.c b/lib/vsprintf.c index e4e9031dd9c3..b07db5ca3f66 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -23,6 +23,7 @@ #include #include +#include <asm/page.h> /* for PAGE_SIZE */ #include /** diff --git a/sound/oss/ac97_codec.c b/sound/oss/ac97_codec.c index 3ecef4689f1b..fd25aca25120 100644 --- a/sound/oss/ac97_codec.c +++ b/sound/oss/ac97_codec.c @@ -55,6 +55,7 @@ #include #include #include +#include #define CODEC_ID_BUFSZ 14 -- cgit v1.2.3