From 644f6741bec0455fa7d8c780034cc8f01988c3e0 Mon Sep 17 00:00:00 2001 From: "Prasanna S. Panchamukhi" Date: Sat, 2 Oct 2004 19:14:25 -0700 Subject: [PATCH] kprobes exception notifier fix This patch modifies the return value of kprobes exceptions notify handler. The kprobes exception notifier returns NOTIFY_STOP on handling notification. This patch helps other debuggers to co-exists with the Kprobes. Other debuggers registered for exceptions notification must return NOTIFY_STOP on handling the notification. Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 01436a31c690..ca4e28b4c6a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -25,6 +25,8 @@ * hlists and exceptions notifier as suggested by Andi Kleen. * 2004-July Suparna Bhattacharya added jumper probes * interface to access function arguments. + * 2004-Sep Prasanna S Panchamukhi Changed Kprobes + * exceptions notifier to be first on the priority list. */ #include #include @@ -108,6 +110,7 @@ void unregister_kprobe(struct kprobe *p) static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, + .priority = 0x7fffffff /* we need to notified first */ }; int register_jprobe(struct jprobe *jp) -- cgit v1.2.3 From 6ff6ffce03e813108d24b2c8afa4a95f8f24e227 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 2 Oct 2004 19:15:00 -0700 Subject: [PATCH] sparc64: time interpolator build fix We need io.h for readq(). Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 36e40d8b182d..0ed4e5e13939 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef CONFIG_TIME_INTERPOLATION static void time_interpolator_update(long delta_nsec); -- cgit v1.2.3 From e537c55cc06b25760de8ab3cf9db65a17a7714a2 Mon Sep 17 00:00:00 2001 From: Stefan Seyfried Date: Sat, 2 Oct 2004 19:15:12 -0700 Subject: [PATCH] swsusp: fix highmem From: Pavel Machek This actually calls highmem_resume(), so swsusp has chance to work on highmem machines. It also adds comments about code flow, which is quite interesting at that point. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f65d295478c4..1c49df18d532 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -856,7 +856,9 @@ int swsusp_suspend(void) local_irq_disable(); save_processor_state(); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ restore_processor_state(); + restore_highmem(); local_irq_enable(); return error; } @@ -876,8 +878,13 @@ int swsusp_resume(void) { int error; local_irq_disable(); + /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = swsusp_arch_resume(); + /* Code below is only ever reached in case of failure. 
Otherwise + * execution continues at place where swsusp_arch_suspend was called + */ + BUG_ON(!error); restore_processor_state(); restore_highmem(); local_irq_enable(); -- cgit v1.2.3 From a1147f6b3f5e4e1f19fb3949e35b06accb81f773 Mon Sep 17 00:00:00 2001 From: Maximilian Attems Date: Sat, 2 Oct 2004 19:18:59 -0700 Subject: [PATCH] msleep_interruptible(): fix whitespace thanks Xu for noticing, some whitespace found it's way there. clean that up. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0ed4e5e13939..15182647509d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1625,13 +1625,13 @@ EXPORT_SYMBOL(msleep); */ unsigned long msleep_interruptible(unsigned int msecs) { - unsigned long timeout = msecs_to_jiffies(msecs); + unsigned long timeout = msecs_to_jiffies(msecs); - while (timeout && !signal_pending(current)) { - set_current_state(TASK_INTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } - return jiffies_to_msecs(timeout); + while (timeout && !signal_pending(current)) { + set_current_state(TASK_INTERRUPTIBLE); + timeout = schedule_timeout(timeout); + } + return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); -- cgit v1.2.3 From 2866792071f8c7a8ab5cf6818835ed7243dafeae Mon Sep 17 00:00:00 2001 From: "Josef \\'Jeff\\' Sipek" Date: Sat, 2 Oct 2004 19:51:31 -0700 Subject: [PATCH] Add DEVPATH env variable to hotplug helper call Add $DEVPATH to the environmental variables during /sbin/hotplug call. Signed-off-by: Josef 'Jeff' Sipek Signed-off-by: Linus Torvalds --- kernel/cpu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 083521327e64..64b8ed3cc8ea 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -61,13 +61,13 @@ static inline void check_for_tasks(int cpu) * cpu' with certain environment variables set. */ static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action) { - char *argv[3], *envp[5], cpu_str[12], action_str[32]; + char *argv[3], *envp[6], cpu_str[12], action_str[32], devpath_str[40]; int i; sprintf(cpu_str, "CPU=%d", cpu); sprintf(action_str, "ACTION=%s", action); - /* FIXME: Add DEVPATH. --RR */ - + sprintf(devpath_str, "DEVPATH=devices/system/cpu/cpu%d", cpu); + i = 0; argv[i++] = hotplug_path; argv[i++] = "cpu"; @@ -79,6 +79,7 @@ static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i++] = cpu_str; envp[i++] = action_str; + envp[i++] = devpath_str; envp[i] = NULL; return call_usermodehelper(argv[0], argv, envp, 0); -- cgit v1.2.3 From 00a87a388cd7be2261d2bcc6ecc045d8681a5138 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 Oct 2004 02:44:44 -0700 Subject: [PATCH] Fix task_hot() balancing This fixes the integer underflow in task_hot() noticed by Kenneth W Chen and makes use of p->last_ran to separate load-balancing timestamps (used by task_hot()) from interactivity timestamps. (which two interfered) compiled, booted on x86 SMP. Confirmed by Kenneth Chen to fix the db transaction processing workload that showed the balancing problem. 
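For illustration only (a minimal sketch, not from the kernel tree): the underflow in plain C, with 2500000 ns standing in for sd->cache_hot_time (the 2.5 msec default). When the task was stamped on a CPU whose clock runs ahead of the one doing the balancing, the old unsigned test wraps and wrongly reports the task as cache-cold.

/*
 * Sketch only, not kernel code.  Shows why the unsigned comparison in
 * the old task_hot() misclassifies a task whose timestamp is "in the
 * future" relative to the local CPU's clock.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long now   = 1000000ULL;	/* local CPU clock, ns        */
	unsigned long long stamp = 1200000ULL;	/* stamped on another CPU     */

	/* old test: now - stamp wraps to a huge unsigned value, so the
	 * task looks cache-cold and gets migrated */
	printf("unsigned compare: hot = %d\n",
	       (now - stamp) < 2500000ULL);

	/* new test (as in this patch): the cast makes the wrapped
	 * difference negative, so the task correctly stays cache-hot */
	printf("signed compare:   hot = %d\n",
	       (long long)(now - stamp) < (long long)2500000);

	return 0;
}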
Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/sched.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f5d4b9cae523..90f5cb645116 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -449,7 +449,7 @@ struct task_struct { unsigned long sleep_avg; long interactive_credit; - unsigned long long timestamp; + unsigned long long timestamp, last_ran; int activated; unsigned long policy; diff --git a/kernel/sched.c b/kernel/sched.c index 9b395e8d616d..b1643e495f1f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -180,7 +180,8 @@ static unsigned int task_timeslice(task_t *p) else return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); } -#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) +#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) enum idle_type { @@ -2764,7 +2765,7 @@ switch_tasks: if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) prev->interactive_credit--; } - prev->timestamp = now; + prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); if (likely(prev != next)) { -- cgit v1.2.3 From 70d88abf9994ff042b8c715de069ed92fce16cf1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 7 Oct 2004 17:32:37 -0700 Subject: [PATCH] Use cache_decay_ticks instead of a constant This patch, based on Ken Chen's patch, changes the default 2.5 msec migration-cutoff value to be based on cache_decay_ticks, which is what we used for a long time prior the sched-domains code. (If an architecture does not set cache_decay_ticks then 2.5 msec is used.) This causes the following new migration-cutoff values on various SMP systems: x86, variable: 2-way celeron 466MHz, 128K: 2.5 msec -> 1.0 msec 2-way/4-way 2.2 GHz P4 Xeon 1MB: 2.5 msec -> 2.0 msec 8-way P3 700 MHz Xeon 2MB: 2.5 msec -> 6.0 msec x64, variable: amd64: 2.0 GHz, 1MB: 2.5 msec -> 1.5 msec em64t: 3.4 GHz, 1MB: 2.5 msec -> 3.0 msec ppc64 [*]: 2.5 msec -> 2.5 msec (constant) ia64: 2.5 msec -> 10.0 msec (constant) [*] ppc64 does not set cache_decay_ticks so we fall back to the default. I believe in light of previous testing we could attempt this for 2.6.9 as well. (Note: that the 2.6.9-rc3 patch looks similar but needs to patch kernel/sched.c. Note2: this patch is different from Ken's original one.) Signed-off-by: Ingo Molnar Signed-off-by: Ken Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b1643e495f1f..01e232255050 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -388,7 +388,7 @@ struct sched_domain { .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .cache_hot_time = (5*1000000/2), \ + .cache_hot_time = cache_decay_ticks*1000000 ? : (5*1000000/2),\ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .flags = SD_BALANCE_NEWIDLE \ -- cgit v1.2.3 From b959d48e8272523cb004120e703e5f21c26a3edb Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sat, 9 Oct 2004 01:03:26 -0700 Subject: [PATCH] pm: console driver fixes Fix warnings in kernel/power/console.c by only declaring orig_fgconsole and orig_kmsg when required by SUSPEND_CONSOLE. Restore kmsg_redirect on resume. 
Signed-off-by: Ian Campbell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/console.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index 00b390d7a5ad..7ff375e7c95f 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -11,7 +11,9 @@ static int new_loglevel = 10; static int orig_loglevel; +#ifdef SUSPEND_CONSOLE static int orig_fgconsole, orig_kmsg; +#endif int pm_prepare_console(void) { @@ -50,6 +52,7 @@ void pm_restore_console(void) acquire_console_sem(); set_console(orig_fgconsole); release_console_sem(); + kmsg_redirect = orig_kmsg; #endif return; } -- cgit v1.2.3 From 011a660daec1f656a28179f675eca7b89e2987cb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 13 Oct 2004 07:25:49 -0700 Subject: [PATCH] time interpolator fixes - Remove the setting of CLOCK_REALTIME and CLOCK_MONOTONIC resolution according to the interpolator resolution since this causes periodic timer signals to fail. The clocks will still be high-resolution but the "resolution" reported reflects the timer intervals possible via timer_settime (also more conformant to what the Single Unix Specification says). - Make the IA64 clock_gettime fastcall fall back on negative clock numbers instead of returning CLOCK_REALTIME. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/fsys.S | 4 ++-- kernel/posix-timers.c | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 0af6f4b4a289..4895559ee807 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -234,7 +234,7 @@ ENTRY(fsys_gettimeofday) cmp.eq p9,p0 = 1,r8 // MMIO64 ? extr r2 = r21,24,8 // time_interpolator->jitter cmp.eq p10,p0 = 2,r8 // MMIO32 ? - cmp.lt p11,p0 = 2,r8 // function? + cmp.ltu p11,p0 = 2,r8 // function or other clock (p11) br.cond.spnt.many fsys_fallback_syscall ;; setf.sig f7 = r3 // Setup for scaling of counter @@ -338,7 +338,7 @@ ENTRY(fsys_clock_gettime) .prologue .altrp b6 .body - cmp4.lt p6, p0 = CLOCK_MONOTONIC, r32 + cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32 // Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC (p6) br.spnt.few fsys_fallback_syscall mov r31 = r33 diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ef5c42101748..7ba8941b834b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -219,11 +219,6 @@ static __init int init_posix_timers(void) .clock_set = do_posix_clock_monotonic_settime }; -#ifdef CONFIG_TIME_INTERPOLATION - /* Clocks are more accurate with time interpolators */ - clock_realtime.res = clock_monotonic.res = time_interpolator_resolution(); -#endif - register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); -- cgit v1.2.3 From 511ca0be9d97abfefbce4cde77b2332c9d8762db Mon Sep 17 00:00:00 2001 From: "John L. Byrne" Date: Wed, 13 Oct 2004 07:26:01 -0700 Subject: [PATCH] fix oops in fork() cleanup path It will oops on an error path if the thread being forked is a process with a NULL mm. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8c7ba481c9a5..7e73e420441e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1146,7 +1146,8 @@ fork_out: bad_fork_cleanup_namespace: exit_namespace(p); bad_fork_cleanup_mm: - mmput(p->mm); + if (p->mm) + mmput(p->mm); bad_fork_cleanup_signal: exit_signal(p); bad_fork_cleanup_sighand: -- cgit v1.2.3 From b9877c907d56b803b5b0241c2465ce768809fce9 Mon Sep 17 00:00:00 2001 From: Tim Schmielau Date: Wed, 13 Oct 2004 07:27:49 -0700 Subject: [PATCH] Fix reporting of process start times Derive process start times from the posix_clock_monotonic notion of uptime instead of "jiffies", consistent with the earlier change to /proc/uptime itself. (http://linus.bkbits.net:8080/linux-2.5/cset@3ef4851dGg0fxX58R9Zv8SIq9fzNmQ?na%0Av=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c) Process start times are reported to userspace in units of 1/USER_HZ since boot, thus applications as procps need the value of "uptime" to convert them into absolute time. Currently "uptime" is derived from an ntp-corrected time base, but process start time is derived from the free-running "jiffies" counter. This results in inaccurate, drifting process start times as seen by the user, even if the exported number stays constant, because the users notion of "jiffies" changes in time. It's John Stultz's patch anyways, which I only messed up a bit, but since people started trading signed-off lines on lkml: Signed-off-by: Tim Schmielau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 6 +++++- include/linux/acct.h | 23 +++++++++++++++-------- include/linux/sched.h | 2 +- include/linux/times.h | 20 ++++++++++++++++++++ kernel/acct.c | 10 +++++++++- kernel/fork.c | 2 +- mm/oom_kill.c | 19 +++++++++++++------ 7 files changed, 64 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index fc5c7846df32..272908775622 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -360,7 +360,11 @@ int proc_pid_stat(struct task_struct *task, char * buffer) read_unlock(&tasklist_lock); /* Temporary variable needed for gcc-2.96 */ - start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES); + /* convert timespec -> nsec*/ + start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + + task->start_time.tv_nsec; + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \ diff --git a/include/linux/acct.h b/include/linux/acct.h index b46ce1ac1c6a..a6ab17c49aa1 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -172,17 +172,24 @@ static inline u32 jiffies_to_AHZ(unsigned long x) #endif } -static inline u64 jiffies_64_to_AHZ(u64 x) +static inline u64 nsec_to_AHZ(u64 x) { -#if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0 -#if HZ != AHZ - do_div(x, HZ / AHZ); -#endif -#else - x *= TICK_NSEC; +#if (NSEC_PER_SEC % AHZ) == 0 do_div(x, (NSEC_PER_SEC / AHZ)); +#elif (AHZ % 512) == 0 + x *= AHZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for AHZ <= 1024, + * overflow after 64.99 years. + * exact for AHZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
+ */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (AHZ/2)) + / AHZ)); #endif - return x; + return x; } #endif /* __KERNEL */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 90f5cb645116..8810b551082a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -508,7 +508,7 @@ struct task_struct { struct timer_list real_timer; unsigned long utime, stime; unsigned long nvcsw, nivcsw; /* context switch counts */ - u64 start_time; + struct timespec start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; /* process credentials */ diff --git a/include/linux/times.h b/include/linux/times.h index ff00f334ffaa..0c5aa078dad4 100644 --- a/include/linux/times.h +++ b/include/linux/times.h @@ -55,6 +55,26 @@ static inline u64 jiffies_64_to_clock_t(u64 x) } #endif +static inline u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#elif (USER_HZ % 512) == 0 + x *= USER_HZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) + / USER_HZ)); +#endif + return x; +} + struct tms { clock_t tms_utime; clock_t tms_stime; diff --git a/kernel/acct.c b/kernel/acct.c index daf23c4efab4..fb6989a34f6e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -384,6 +384,8 @@ static void do_acct_process(long exitcode, struct file *file) unsigned long vsize; unsigned long flim; u64 elapsed; + u64 run_time; + struct timespec uptime; /* * First check to see if there is enough free_space to continue @@ -401,7 +403,13 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); - elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time); + /* calculate run_time in nsec*/ + do_posix_clock_monotonic_gettime(&uptime); + run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; + run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC + + current->start_time.tv_nsec; + /* convert nsec -> AHZ */ + elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 ac.ac_etime = encode_float(elapsed); #else diff --git a/kernel/fork.c b/kernel/fork.c index 7e73e420441e..70f604c3937b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -992,7 +992,7 @@ static task_t *copy_process(unsigned long clone_flags, p->utime = p->stime = 0; p->lock_depth = -1; /* -1 = no lock */ - p->start_time = get_jiffies_64(); + do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; p->io_context = NULL; p->io_wait = NULL; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 48f6dde410b3..3868e29e85be 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ /** * oom_badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate + * @p: current uptime in seconds * * The formula used is relatively simple and documented inline in the * function. The main rationale is that we want to select a good task @@ -41,7 +42,7 @@ * of least surprise ... 
(be careful when you change it) */ -static unsigned long badness(struct task_struct *p) +static unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; @@ -56,12 +57,16 @@ static unsigned long badness(struct task_struct *p) points = p->mm->total_vm; /* - * CPU time is in seconds and run time is in minutes. There is no - * particular reason for this other than that it turned out to work - * very well in practice. + * CPU time is in tens of seconds and run time is in thousands + * of seconds. There is no particular reason for this other than + * that it turned out to work very well in practice. */ cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3); - run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10); + + if (uptime >= p->start_time.tv_sec) + run_time = (uptime - p->start_time.tv_sec) >> 10; + else + run_time = 0; s = int_sqrt(cpu_time); if (s) @@ -111,10 +116,12 @@ static struct task_struct * select_bad_process(void) unsigned long maxpoints = 0; struct task_struct *g, *p; struct task_struct *chosen = NULL; + struct timespec uptime; + do_posix_clock_monotonic_gettime(&uptime); do_each_thread(g, p) if (p->pid) { - unsigned long points = badness(p); + unsigned long points = badness(p, uptime.tv_sec); if (points > maxpoints) { chosen = p; maxpoints = points; -- cgit v1.2.3 From c5cada670bf08d89870dd43ea25419f6ffb27c33 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 16 Oct 2004 19:20:30 -0700 Subject: [PATCH] tailcall prevention in sys_wait4() and sys_waitid() A hack to prevent the compiler from generatin tailcalls in these two functions. With CONFIG_REGPARM=y, the tailcalled code ends up stomping on the syscall's argument frame which corrupts userspace's registers. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/linkage.h | 4 ++++ include/linux/linkage.h | 4 ++++ kernel/exit.c | 16 ++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/asm-i386/linkage.h b/include/asm-i386/linkage.h index e48009fd93c7..af3d8571c5c7 100644 --- a/include/asm-i386/linkage.h +++ b/include/asm-i386/linkage.h @@ -5,6 +5,10 @@ #define FASTCALL(x) x __attribute__((regparm(3))) #define fastcall __attribute__((regparm(3))) +#ifdef CONFIG_REGPARM +# define prevent_tail_call(ret) __asm__ ("" : "=r" (ret) : "0" (ret)) +#endif + #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 #define __ALIGN_STR ".align 16,0x90" diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 09955c0ce848..338f7795d8a0 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -14,6 +14,10 @@ #define asmlinkage CPP_ASMLINKAGE #endif +#ifndef prevent_tail_call +# define prevent_tail_call(ret) do { } while (0) +#endif + #ifndef __ALIGN #define __ALIGN .align 4,0x90 #define __ALIGN_STR ".align 4,0x90" diff --git a/kernel/exit.c b/kernel/exit.c index 6860b509dd11..6ec1f96fa92b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1376,6 +1376,8 @@ asmlinkage long sys_waitid(int which, pid_t pid, struct siginfo __user *infop, int options, struct rusage __user *ru) { + long ret; + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) @@ -1398,15 +1400,25 @@ asmlinkage long sys_waitid(int which, pid_t pid, return -EINVAL; } - return do_wait(pid, options, infop, NULL, ru); + ret = do_wait(pid, options, infop, NULL, ru); + + /* avoid REGPARM breakage on x86: */ + 
prevent_tail_call(ret); + return ret; } asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, int options, struct rusage __user *ru) { + long ret; + if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; - return do_wait(pid, options | WEXITED, NULL, stat_addr, ru); + ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); + + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } #ifdef __ARCH_WANT_SYS_WAITPID -- cgit v1.2.3 From fa7f7d6442eb677ec4a70632ed61f7fce0f4e56a Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 18 Oct 2004 08:52:31 -0700 Subject: [PATCH] swsusp: progress in percent swsusp currently has very poor progress indication. Thanks to Erik Rigtorp , we have percentages there, so people know how long wait to expect. Please apply, From: Erik Rigtorp Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 18 ++++++++++++++---- kernel/power/swsusp.c | 22 +++++++++++++++------- 2 files changed, 29 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 312aa169c566..4cc81e03085c 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -85,10 +85,20 @@ static int in_suspend __nosavedata = 0; static void free_some_memory(void) { - printk("Freeing memory: "); - while (shrink_all_memory(10000)) - printk("."); - printk("|\n"); + unsigned int i = 0; + unsigned int tmp; + unsigned long pages = 0; + char *p = "-\\|/"; + + printk("Freeing memory... "); + while ((tmp = shrink_all_memory(10000))) { + pages += tmp; + printk("\b%c", p[i]); + i++; + if (i > 3) + i = 0; + } + printk("\bdone (%li pages freed)\n", pages); } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 1c49df18d532..375299c86c45 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -294,15 +294,19 @@ static int data_write(void) { int error = 0; int i; + unsigned int mod = nr_copy_pages / 100; - printk( "Writing data to swap (%d pages): ", nr_copy_pages ); + if (!mod) + mod = 1; + + printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); for (i = 0; i < nr_copy_pages && !error; i++) { - if (!(i%100)) - printk( "." ); + if (!(i%mod)) + printk( "\b\b\b\b%3d%%", i / mod ); error = write_page((pagedir_nosave+i)->address, &((pagedir_nosave+i)->swap_address)); } - printk(" %d Pages done.\n",i); + printk("\b\b\b\bdone\n"); return error; } @@ -1141,14 +1145,18 @@ static int __init data_read(void) struct pbe * p; int error; int i; + int mod = nr_copy_pages / 100; + + if (!mod) + mod = 1; if ((error = swsusp_pagedir_relocate())) return error; - printk( "Reading image data (%d pages): ", nr_copy_pages ); + printk( "Reading image data (%d pages): ", nr_copy_pages ); for(i = 0, p = pagedir_nosave; i < nr_copy_pages && !error; i++, p++) { - if (!(i%100)) - printk( "." ); + if (!(i%mod)) + printk( "\b\b\b\b%3d%%", i / mod ); error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); } -- cgit v1.2.3 From 31180071ee5e6cc6ff4d036d655c556f582f74e4 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 18 Oct 2004 08:53:09 -0700 Subject: [PATCH] make rlimit settings per-process instead of per-thread POSIX specifies that the limit settings provided by getrlimit/setrlimit are shared by the whole process, not specific to individual threads. This patch changes the behavior of those calls to comply with POSIX. 
I've moved the struct rlimit array from task_struct to signal_struct, as it has the correct sharing properties. (This reduces kernel memory usage per thread in multithreaded processes by around 100/200 bytes for 32/64 machines respectively.) I took a fairly minimal approach to the locking issues with the newly shared struct rlimit array. It turns out that all the code that is checking limits really just needs to look at one word at a time (one rlim_cur field, usually). It's only the few places like getrlimit itself (and fork), that require atomicity in accessing a whole struct rlimit, so I just used a spin lock for them and no locking for most of the checks. If it turns out that readers of struct rlimit need more atomicity where they are now cheap, or less overhead where they are now atomic (e.g. fork), then seqcount is certainly the right thing to use for them instead of readers using the spin lock. Though it's in signal_struct, I didn't use siglock since the access to rlimits never needs to disable irqs and doesn't overlap with other siglock uses. Instead of adding something new, I overloaded task_lock(task->group_leader) for this; it is used for other things that are not likely to happen simultaneously with limit tweaking. To me that seems preferable to adding a word, but it would be trivial (and arguably cleaner) to add a separate lock for these users (or e.g. just use seqlock, which adds two words but is optimal for readers). Most of the changes here are just the trivial s/->rlim/->signal->rlim/. I stumbled across what must be a long-standing bug, in reparent_to_init. It does: memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); when surely it was intended to be: memcpy(current->rlim, init_task.rlim, sizeof(current->rlim)); As rlim is an array, the * in the sizeof expression gets the size of the first element, so this just changes the first limit (RLIMIT_CPU). This is for kernel threads, where it's clear that resetting all the rlimits is what you want. With that fixed, the setting of RLIMIT_FSIZE in nfsd is superfluous since it will now already have been reset to RLIM_INFINITY. The other subtlety is removing: tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY; in exit_notify, which was to avoid a race signalling during self-reaping exit. As the limit is now shared, a dying thread should not change it for others. Instead, I avoid that race by checking current->state before the RLIMIT_CPU check. (Adding one new conditional in that path is now required one way or another, since if not for this check there would also be a new race with self-reaping exit later on clearing current->signal that would have to be checked for.) The one loose end left by this patch is with process accounting. do_acct_process temporarily resets the RLIMIT_FSIZE limit while writing the accounting record. I left this as it was, but it is now changing a limit that might be shared by other threads still running. I left this in a dubious state because it seems to me that processing accounting may already be more generally a dubious state when it comes to NPTL threads. I would think you would want one record per process, with aggregate data about all threads that ever lived in it, not a separate record for each thread. I don't use process accounting myself, but if anyone is interested in testing it out I could provide a patch to change it this way. One final note, this is not 100% to POSIX compliance in regards to rlimits. 
POSIX specifies that RLIMIT_CPU refers to a whole process in aggregate, not to each individual thread. I will provide patches later on to achieve that change, assuming this patch goes in first. Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/mmap.c | 4 ++-- arch/ia64/kernel/perfmon.c | 3 ++- arch/ia64/kernel/sys_ia64.c | 2 +- arch/ia64/mm/fault.c | 4 ++-- arch/ia64/mm/init.c | 2 +- arch/mips/kernel/irixelf.c | 2 +- arch/mips/kernel/sysirix.c | 19 ++++++++++++------- arch/ppc64/mm/mmap.c | 4 ++-- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sunos.c | 2 +- arch/sparc64/kernel/binfmt_aout32.c | 6 +++--- arch/sparc64/kernel/sys_sunos32.c | 2 +- arch/sparc64/solaris/fs.c | 16 ++++++++-------- arch/x86_64/ia32/ia32_aout.c | 6 +++--- fs/binfmt_aout.c | 10 +++++----- fs/binfmt_elf.c | 2 +- fs/binfmt_flat.c | 2 +- fs/buffer.c | 2 +- fs/exec.c | 4 ++-- fs/fcntl.c | 6 +++--- fs/nfs/direct.c | 2 +- fs/nfsd/nfssvc.c | 1 - fs/open.c | 2 +- fs/proc/array.c | 4 +++- include/linux/init_task.h | 2 +- include/linux/mm.h | 2 +- include/linux/sched.h | 13 +++++++++++-- include/linux/security.h | 2 +- ipc/mqueue.c | 2 +- kernel/acct.c | 6 +++--- kernel/exit.c | 4 ++-- kernel/fork.c | 10 +++++++--- kernel/signal.c | 4 ++-- kernel/sys.c | 27 ++++++++++++++++----------- kernel/timer.c | 5 +++-- mm/filemap.c | 2 +- mm/memory.c | 2 +- mm/mlock.c | 6 +++--- mm/mmap.c | 18 +++++++++--------- mm/mremap.c | 4 ++-- mm/nommu.c | 2 +- security/selinux/hooks.c | 6 +++--- 42 files changed, 127 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/arch/i386/mm/mmap.c b/arch/i386/mm/mmap.c index a6270ee14323..c5e0d0119a1e 100644 --- a/arch/i386/mm/mmap.c +++ b/arch/i386/mm/mmap.c @@ -37,7 +37,7 @@ static inline unsigned long mmap_base(struct mm_struct *mm) { - unsigned long gap = current->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -59,7 +59,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) */ if (sysctl_legacy_va_layout || (current->personality & ADDR_COMPAT_LAYOUT) || - current->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { + current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; mm->unmap_area = arch_unmap_area; diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 204e42037e70..e86722b7fce7 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2287,7 +2287,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur) * return -ENOMEM; */ - if (size > task->rlim[RLIMIT_MEMLOCK].rlim_cur) return -ENOMEM; + if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; /* * We do the easy to undo allocations first. diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 88d8b217f105..73d6773d5609 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -138,7 +138,7 @@ ia64_brk (unsigned long brk) goto out; /* Check against rlimit.. 
*/ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) goto out; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index de73e3c91213..9e4b5c2df50e 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -32,8 +32,8 @@ expand_backing_store (struct vm_area_struct *vma, unsigned long address) unsigned long grow; grow = PAGE_SIZE >> PAGE_SHIFT; - if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur - || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur)) + if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur + || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur)) return -ENOMEM; vma->vm_end += PAGE_SIZE; vma->vm_mm->total_vm += grow; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 87a8016f58c5..3bfbb7dc1f31 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -98,7 +98,7 @@ update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte) inline void ia64_set_rbs_bot (void) { - unsigned long stack_size = current->rlim[RLIMIT_STACK].rlim_max & -16; + unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16; if (stack_size > MAX_USER_STACK_SIZE) stack_size = MAX_USER_STACK_SIZE; diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index dc9f7924e024..4858adc76fb1 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c @@ -1055,7 +1055,7 @@ static int irix_core_dump(long signr, struct pt_regs * regs, struct file *file) struct vm_area_struct *vma; struct elfhdr elf; off_t offset = 0, dataoff; - int limit = current->rlim[RLIMIT_CORE].rlim_cur; + int limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; int numnote = 4; struct memelfnote notes[4]; struct elf_prstatus prstatus; /* NT_PRSTATUS */ diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index 392c73853fa6..4ddbeed9d53e 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -128,16 +128,21 @@ asmlinkage int irix_prctl(struct pt_regs *regs) if (value > RLIM_INFINITY) value = RLIM_INFINITY; if (capable(CAP_SYS_ADMIN)) { - current->rlim[RLIMIT_STACK].rlim_max = - current->rlim[RLIMIT_STACK].rlim_cur = value; + task_lock(current->group_leader); + current->signal->rlim[RLIMIT_STACK].rlim_max = + current->signal->rlim[RLIMIT_STACK].rlim_cur = value; + task_unlock(current->group_leader); error = value; break; } - if (value > current->rlim[RLIMIT_STACK].rlim_max) { + task_lock(current->group_leader); + if (value > current->signal->rlim[RLIMIT_STACK].rlim_max) { error = -EINVAL; + task_unlock(current->group_leader); break; } - current->rlim[RLIMIT_STACK].rlim_cur = value; + current->signal->rlim[RLIMIT_STACK].rlim_cur = value; + task_unlock(current->group_leader); error = value; break; } @@ -145,7 +150,7 @@ asmlinkage int irix_prctl(struct pt_regs *regs) case PR_GETSTACKSIZE: printk("irix_prctl[%s:%d]: Wants PR_GETSTACKSIZE\n", current->comm, current->pid); - error = current->rlim[RLIMIT_STACK].rlim_cur; + error = current->signal->rlim[RLIMIT_STACK].rlim_cur; break; case PR_MAXPPROCS: @@ -558,7 +563,7 @@ asmlinkage int irix_brk(unsigned long brk) /* * Check against rlimit and stack.. 
*/ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (brk - mm->end_code > rlim) { @@ -2132,7 +2137,7 @@ asmlinkage int irix_ulimit(int cmd, int arg) retval = -EINVAL; goto out; #endif - retval = current->rlim[RLIMIT_NOFILE].rlim_cur; + retval = current->signal->rlim[RLIMIT_NOFILE].rlim_cur; goto out; case 5: diff --git a/arch/ppc64/mm/mmap.c b/arch/ppc64/mm/mmap.c index f90dd1f7ab56..fe65f522aff3 100644 --- a/arch/ppc64/mm/mmap.c +++ b/arch/ppc64/mm/mmap.c @@ -37,7 +37,7 @@ static inline unsigned long mmap_base(void) { - unsigned long gap = current->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -58,7 +58,7 @@ static inline int mmap_is_legacy(void) if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (current->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 1039196ef053..ebe73afa774b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,7 +37,7 @@ static inline unsigned long mmap_base(void) { - unsigned long gap = current->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -58,7 +58,7 @@ static inline int mmap_is_legacy(void) #endif return sysctl_legacy_va_layout || (current->personality & ADDR_COMPAT_LAYOUT) || - current->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY; + current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY; } /* diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c index b248f057618c..ad049c1f374b 100644 --- a/arch/sparc/kernel/sys_sunos.c +++ b/arch/sparc/kernel/sys_sunos.c @@ -178,7 +178,7 @@ asmlinkage int sunos_brk(unsigned long brk) * Check against rlimit and stack.. */ retval = -ENOMEM; - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (brk - current->mm->end_code > rlim) diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c index 3dbf2660c24d..0cad0a4328ce 100644 --- a/arch/sparc64/kernel/binfmt_aout32.c +++ b/arch/sparc64/kernel/binfmt_aout32.c @@ -102,12 +102,12 @@ static int aout32_core_dump(long signr, struct pt_regs *regs, struct file *file) /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. */ if ((dump.u_dsize+dump.u_ssize) > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. */ if ((dump.u_ssize) > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ @@ -218,7 +218,7 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. 
*/ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c index ace96ee27879..1bf38a5eaff7 100644 --- a/arch/sparc64/kernel/sys_sunos32.c +++ b/arch/sparc64/kernel/sys_sunos32.c @@ -142,7 +142,7 @@ asmlinkage int sunos_brk(u32 baddr) } /* Check against rlimit and stack.. */ retval = -ENOMEM; - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (brk - current->mm->end_code > rlim) diff --git a/arch/sparc64/solaris/fs.c b/arch/sparc64/solaris/fs.c index a237c68acb41..d7c99fa89661 100644 --- a/arch/sparc64/solaris/fs.c +++ b/arch/sparc64/solaris/fs.c @@ -600,23 +600,23 @@ asmlinkage int solaris_ulimit(int cmd, int val) { switch (cmd) { case 1: /* UL_GETFSIZE - in 512B chunks */ - return current->rlim[RLIMIT_FSIZE].rlim_cur >> 9; + return current->signal->rlim[RLIMIT_FSIZE].rlim_cur >> 9; case 2: /* UL_SETFSIZE */ if ((unsigned long)val > (LONG_MAX>>9)) return -ERANGE; val <<= 9; - lock_kernel(); - if (val > current->rlim[RLIMIT_FSIZE].rlim_max) { + task_lock(current->group_leader); + if (val > current->signal->rlim[RLIMIT_FSIZE].rlim_max) { if (!capable(CAP_SYS_RESOURCE)) { - unlock_kernel(); + task_unlock(current->group_leader); return -EPERM; } - current->rlim[RLIMIT_FSIZE].rlim_max = val; + current->signal->rlim[RLIMIT_FSIZE].rlim_max = val; } - current->rlim[RLIMIT_FSIZE].rlim_cur = val; - unlock_kernel(); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = val; + task_unlock(current->group_leader); return 0; case 3: /* UL_GMEMLIM */ - return current->rlim[RLIMIT_DATA].rlim_cur; + return current->signal->rlim[RLIMIT_DATA].rlim_cur; case 4: /* UL_GDESLIM */ return NR_OPEN; } diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index 040e56f759e7..5c7a2c77fd21 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -168,12 +168,12 @@ static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. */ if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. */ if ((dump.u_ssize+1) * PAGE_SIZE > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ @@ -281,7 +281,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 3d99e70d205a..81dad9b3e757 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -118,22 +118,22 @@ static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) if we wrote the stack, but not the data area. 
*/ #ifdef __sparc__ if ((dump.u_dsize+dump.u_ssize) > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_dsize = 0; #else if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_dsize = 0; #endif /* Make sure we have enough room to write the stack and data areas. */ #ifdef __sparc__ if ((dump.u_ssize) > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_ssize = 0; #else if ((dump.u_ssize+1) * PAGE_SIZE > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_ssize = 0; #endif @@ -278,7 +278,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5f492733d625..055c9a0e183e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1314,7 +1314,7 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file) struct vm_area_struct *vma; struct elfhdr *elf = NULL; off_t offset = 0, dataoff; - unsigned long limit = current->rlim[RLIMIT_CORE].rlim_cur; + unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; int numnote; struct memelfnote *notes = NULL; struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 64fea54db0d6..fb440602cfc1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -486,7 +486,7 @@ static int load_flat_file(struct linux_binprm * bprm, * size limits imposed on them by creating programs with large * arrays in the data or bss. 
*/ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (data_len + bss_len > rlim) diff --git a/fs/buffer.c b/fs/buffer.c index 81f31297b81e..01c725823f19 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2232,7 +2232,7 @@ int generic_cont_expand(struct inode *inode, loff_t size) int err; err = -EFBIG; - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && size > (loff_t)limit) { send_sig(SIGXFSZ, current, 0); goto out; diff --git a/fs/exec.c b/fs/exec.c index bdf32393b3c5..55f7e05979f2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -377,7 +377,7 @@ int setup_arg_pages(struct linux_binprm *bprm, int executable_stack) bprm->p = PAGE_SIZE * i - offset; /* Limit stack size to 1GB */ - stack_base = current->rlim[RLIMIT_STACK].rlim_max; + stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; if (stack_base > (1 << 30)) stack_base = 1 << 30; stack_base = PAGE_ALIGN(STACK_TOP - stack_base); @@ -1393,7 +1393,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) current->signal->group_exit_code = exit_code; coredump_wait(mm); - if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) + if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) goto fail_unlock; /* diff --git a/fs/fcntl.c b/fs/fcntl.c index eee115d6a224..0287312d03b3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -86,7 +86,7 @@ static int locate_fd(struct files_struct *files, int error; error = -EINVAL; - if (orig_start >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out; repeat: @@ -105,7 +105,7 @@ repeat: } error = -EMFILE; - if (newfd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out; error = expand_files(files, newfd); @@ -161,7 +161,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) if (newfd == oldfd) goto out_unlock; err = -EBADF; - if (newfd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out_unlock; get_file(file); /* We are now finished with oldfd */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 21c3c4d396be..d6862fe4df4b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -545,7 +545,7 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, { ssize_t retval = -EINVAL; loff_t *ppos = &iocb->ki_pos; - unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = (struct nfs_open_context *) file->private_data; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 8a06919c7af3..fe03e31f20b9 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -180,7 +180,6 @@ nfsd(struct svc_rqst *rqstp) /* Lock module and set up kernel thread */ lock_kernel(); daemonize("nfsd"); - current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* After daemonize() this kernel thread shares current->fs * with the init process. We need to create files with a diff --git a/fs/open.c b/fs/open.c index 8e539937ceca..817bef771912 100644 --- a/fs/open.c +++ b/fs/open.c @@ -852,7 +852,7 @@ repeat: * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. 
*/ - if (fd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out; /* Do we need to expand the fdset array? */ diff --git a/fs/proc/array.c b/fs/proc/array.c index 272908775622..d7ee3a838e3f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -313,6 +313,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) struct mm_struct *mm; unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0, cutime = 0, cstime = 0; + unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; state = *get_task_state(task); @@ -347,6 +348,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) cmaj_flt = task->signal->cmaj_flt; cutime = task->signal->cutime; cstime = task->signal->cstime; + rsslim = task->signal->rlim[RLIMIT_RSS].rlim_cur; } read_unlock(&tasklist_lock); @@ -393,7 +395,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) start_time, vsize, mm ? mm->rss : 0, /* you might want to shift this left 3 */ - task->rlim[RLIMIT_RSS].rlim_cur, + rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, mm ? mm->start_stack : 0, diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9937c8df8d7c..803d8efb1c4a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,6 +50,7 @@ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ .signal = {{0}}}, \ .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ + .rlim = INIT_RLIMITS, \ } #define INIT_SIGHAND(sighand) { \ @@ -96,7 +97,6 @@ extern struct group_info init_groups; .cap_inheritable = CAP_INIT_INH_SET, \ .cap_permitted = CAP_FULL_SET, \ .keep_capabilities = 0, \ - .rlim = INIT_RLIMITS, \ .user = INIT_USER, \ .comm = "swapper", \ .thread = INIT_THREAD, \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 65ff5b5e896a..158ee1c501f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -540,7 +540,7 @@ static inline int can_do_mlock(void) { if (capable(CAP_IPC_LOCK)) return 1; - if (current->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) + if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) return 1; return 0; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 8810b551082a..389a70ebb189 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -312,6 +312,17 @@ struct signal_struct { unsigned long utime, stime, cutime, cstime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + + /* + * We don't bother to synchronize most readers of this at all, + * because there is no reader checking a limit that actually needs + * to get both rlim_cur and rlim_max atomically, and either one + * alone is a single word that can safely be read normally. + * getrlimit/setrlimit use task_lock(current->group_leader) to + * protect this instead of the siglock, because they really + * have no need to disable irqs. + */ + struct rlimit rlim[RLIM_NLIMITS]; }; /* @@ -518,8 +529,6 @@ struct task_struct { kernel_cap_t cap_effective, cap_inheritable, cap_permitted; unsigned keep_capabilities:1; struct user_struct *user; -/* limits */ - struct rlimit rlim[RLIM_NLIMITS]; unsigned short used_math; char comm[16]; /* file system info */ diff --git a/include/linux/security.h b/include/linux/security.h index 983d7c2265bc..a1dee9a60587 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -582,7 +582,7 @@ struct swap_info_struct; * @task_setrlimit: * Check permission before setting the resource limits of the current * process for @resource to @new_rlim. 
The old resource limit values can - * be examined by dereferencing (current->rlim + resource). + * be examined by dereferencing (current->signal->rlim + resource). * @resource contains the resource whose limit is being set. * @new_rlim contains the new limits for @resource. * Return 0 if permission is granted. diff --git a/ipc/mqueue.c b/ipc/mqueue.c index aee2696c30b1..5cfcd4b68cb3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -145,7 +145,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, spin_lock(&mq_lock); if (u->mq_bytes + mq_bytes < u->mq_bytes || u->mq_bytes + mq_bytes > - p->rlim[RLIMIT_MSGQUEUE].rlim_cur) { + p->signal->rlim[RLIMIT_MSGQUEUE].rlim_cur) { spin_unlock(&mq_lock); goto out_inode; } diff --git a/kernel/acct.c b/kernel/acct.c index fb6989a34f6e..52d49907c0f7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -488,11 +488,11 @@ static void do_acct_process(long exitcode, struct file *file) /* * Accounting records are not subject to resource limits. */ - flim = current->rlim[RLIMIT_FSIZE].rlim_cur; - current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; file->f_op->write(file, (char *)&ac, sizeof(acct_t), &file->f_pos); - current->rlim[RLIMIT_FSIZE].rlim_cur = flim; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); } diff --git a/kernel/exit.c b/kernel/exit.c index 6ec1f96fa92b..e018bf6169d2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -237,7 +237,8 @@ void reparent_to_init(void) /* rt_priority? */ /* signals? */ security_task_reparent_to_init(current); - memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + memcpy(current->signal->rlim, init_task.signal->rlim, + sizeof(current->signal->rlim)); atomic_inc(&(INIT_USER->__count)); switch_uid(INIT_USER); @@ -761,7 +762,6 @@ static void exit_notify(struct task_struct *tsk) */ tsk->it_virt_value = 0; tsk->it_prof_value = 0; - tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY; write_unlock_irq(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index 70f604c3937b..6fd57c2d22cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -249,8 +249,8 @@ void __init fork_init(unsigned long mempages) if(max_threads < 20) max_threads = 20; - init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; - init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -872,6 +872,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + task_lock(current->group_leader); + memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); + task_unlock(current->group_leader); + return 0; } @@ -941,7 +945,7 @@ static task_t *copy_process(unsigned long clone_flags, retval = -EAGAIN; if (atomic_read(&p->user->processes) >= - p->rlim[RLIMIT_NPROC].rlim_cur) { + p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->user != &root_user) goto bad_fork_free; diff --git a/kernel/signal.c b/kernel/signal.c index df2cd4216838..44c0223cca3e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -269,7 +269,7 @@ static struct sigqueue *__sigqueue_alloc(void) struct sigqueue *q = NULL; if 
(atomic_read(¤t->user->sigpending) < - current->rlim[RLIMIT_SIGPENDING].rlim_cur) + current->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); if (q) { INIT_LIST_HEAD(&q->list); @@ -764,7 +764,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ if (atomic_read(&t->user->sigpending) < - t->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); if (q) { diff --git a/kernel/sys.c b/kernel/sys.c index 571bba8c8989..e2b57d6f6038 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -649,7 +649,7 @@ static int set_user(uid_t new_ruid, int dumpclear) return -EAGAIN; if (atomic_read(&new_user->processes) >= - current->rlim[RLIMIT_NPROC].rlim_cur && + current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != &root_user) { free_uid(new_user); return -EAGAIN; @@ -1496,9 +1496,13 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) { if (resource >= RLIM_NLIMITS) return -EINVAL; - else - return copy_to_user(rlim, current->rlim + resource, sizeof(*rlim)) - ? -EFAULT : 0; + else { + struct rlimit value; + task_lock(current->group_leader); + value = current->signal->rlim[resource]; + task_unlock(current->group_leader); + return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + } } #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT @@ -1513,7 +1517,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r if (resource >= RLIM_NLIMITS) return -EINVAL; - memcpy(&x, current->rlim + resource, sizeof(*rlim)); + task_lock(current->group_leader); + x = current->signal->rlim[resource]; + task_unlock(current->group_leader); if(x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; if(x.rlim_max > 0x7FFFFFFF) @@ -1534,21 +1540,20 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) return -EFAULT; if (new_rlim.rlim_cur > new_rlim.rlim_max) return -EINVAL; - old_rlim = current->rlim + resource; - if (((new_rlim.rlim_cur > old_rlim->rlim_max) || - (new_rlim.rlim_max > old_rlim->rlim_max)) && + old_rlim = current->signal->rlim + resource; + if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE) { - if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN) + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) return -EPERM; - } retval = security_task_setrlimit(resource, &new_rlim); if (retval) return retval; + task_lock(current->group_leader); *old_rlim = new_rlim; + task_unlock(current->group_leader); return 0; } diff --git a/kernel/timer.c b/kernel/timer.c index 15182647509d..9616e707154e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -805,12 +805,13 @@ static inline void do_process_times(struct task_struct *p, psecs = (p->utime += user); psecs += (p->stime += system); - if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_cur) { + if (!unlikely(p->state & (TASK_DEAD|TASK_ZOMBIE)) && + psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) { /* Send SIGXCPU every second.. */ if (!(psecs % HZ)) send_sig(SIGXCPU, p, 1); /* and SIGKILL when we go over max.. 
*/ - if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_max) + if (psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_max) send_sig(SIGKILL, p, 1); } } diff --git a/mm/filemap.c b/mm/filemap.c index 272c3e0a6fed..ba5b903ff585 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1804,7 +1804,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) { struct inode *inode = file->f_mapping->host; - unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (unlikely(*pos < 0)) return -EINVAL; diff --git a/mm/memory.c b/mm/memory.c index 0a7013a26019..f10dc9bb7fe2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1236,7 +1236,7 @@ int vmtruncate(struct inode * inode, loff_t offset) goto out_truncate; do_expand: - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) diff --git a/mm/mlock.c b/mm/mlock.c index 873245bc9697..662a52109d36 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -115,7 +115,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) locked = len >> PAGE_SHIFT; locked += current->mm->locked_vm; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; /* check against resource limits */ @@ -176,7 +176,7 @@ asmlinkage long sys_mlockall(int flags) down_write(¤t->mm->mmap_sem); - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; @@ -211,7 +211,7 @@ int user_shm_lock(size_t size, struct user_struct *user) spin_lock(&shmlock_user_lock); locked = size >> PAGE_SHIFT; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) goto out; diff --git a/mm/mmap.c b/mm/mmap.c index 8ccbb00c208f..7e2f336cfbbf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -136,7 +136,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk) } /* Check against rlimit.. */ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) goto out; @@ -833,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += len; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -905,7 +905,7 @@ munmap_back: /* Check against address space limit. 
*/ if ((mm->total_vm << PAGE_SHIFT) + len - > current->rlim[RLIMIT_AS].rlim_cur) + > current->signal->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; if (accountable && (!(flags & MAP_NORESERVE) || @@ -1350,9 +1350,9 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address) return -ENOMEM; } - if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || + if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > - current->rlim[RLIMIT_AS].rlim_cur) { + current->signal->rlim[RLIMIT_AS].rlim_cur) { anon_vma_unlock(vma); vm_unacct_memory(grow); return -ENOMEM; @@ -1412,9 +1412,9 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || + if (vma->vm_end - address > current->signal->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > - current->rlim[RLIMIT_AS].rlim_cur) { + current->signal->rlim[RLIMIT_AS].rlim_cur) { anon_vma_unlock(vma); vm_unacct_memory(grow); return -ENOMEM; @@ -1760,7 +1760,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (mm->def_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += len; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -1779,7 +1779,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) /* Check against address space limits *after* clearing old maps... */ if ((mm->total_vm << PAGE_SHIFT) + len - > current->rlim[RLIMIT_AS].rlim_cur) + > current->signal->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) diff --git a/mm/mremap.c b/mm/mremap.c index 0b0c0e27ecb5..558830585bac 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -327,7 +327,7 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = current->mm->locked_vm << PAGE_SHIFT; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += new_len - old_len; ret = -EAGAIN; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) @@ -335,7 +335,7 @@ unsigned long do_mremap(unsigned long addr, } ret = -ENOMEM; if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) - > current->rlim[RLIMIT_AS].rlim_cur) + > current->signal->rlim[RLIMIT_AS].rlim_cur) goto out; if (vma->vm_flags & VM_ACCOUNT) { diff --git a/mm/nommu.c b/mm/nommu.c index dd4b66b8c9a6..9cac9ae08bb0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -57,7 +57,7 @@ int vmtruncate(struct inode *inode, loff_t offset) goto out_truncate; do_expand: - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f844e402b53c..349d54dc71c0 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1909,8 +1909,8 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) PROCESS__RLIMITINH, NULL, NULL); if (rc) { for (i = 0; i < RLIM_NLIMITS; i++) { - rlim = current->rlim + i; - initrlim = init_task.rlim+i; + rlim = current->signal->rlim + i; + initrlim = init_task.signal->rlim+i; rlim->rlim_cur = min(rlim->rlim_max,initrlim->rlim_cur); } 
} @@ -2699,7 +2699,7 @@ static int selinux_task_setnice(struct task_struct *p, int nice) static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim) { - struct rlimit *old_rlim = current->rlim + resource; + struct rlimit *old_rlim = current->signal->rlim + resource; int rc; rc = secondary_ops->task_setrlimit(resource, new_rlim); -- cgit v1.2.3 From 04bff08854d97652f0c31b0b1a577a7e9704c895 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 18 Oct 2004 08:53:22 -0700 Subject: [PATCH] add WCONTINUED support to wait4 syscall POSIX specifies the new WCONTINUED flag for waitpid, not just for waitid. I overlooked this addition when I implemented waitid. The real work was already done to support waitid, but waitpid needs to report the results Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 79 +++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e018bf6169d2..426d3ae722ba 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1228,6 +1228,58 @@ bail_ref: return retval; } +/* + * Handle do_wait work for one task in a live, non-stopped state. + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_continued(task_t *p, int noreap, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int retval; + pid_t pid; + uid_t uid; + + if (unlikely(!p->signal)) + return 0; + + if (p->signal->stop_state >= 0) + return 0; + + spin_lock_irq(&p->sighand->siglock); + if (p->signal->stop_state >= 0) { /* Re-check with the lock held. */ + spin_unlock_irq(&p->sighand->siglock); + return 0; + } + if (!noreap) + p->signal->stop_state = 0; + spin_unlock_irq(&p->sighand->siglock); + + pid = p->pid; + uid = p->uid; + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (!infop) { + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + put_task_struct(p); + if (!retval && stat_addr) + retval = put_user(0xffff, stat_addr); + if (!retval) + retval = p->pid; + } else { + retval = wait_noreap_copyout(p, pid, uid, + CLD_CONTINUED, SIGCONT, + infop, ru); + BUG_ON(retval == 0); + } + + return retval; +} + + static long do_wait(pid_t pid, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1290,27 +1342,11 @@ repeat: check_continued: if (!unlikely(options & WCONTINUED)) continue; - if (unlikely(!p->signal)) - continue; - spin_lock_irq(&p->sighand->siglock); - if (p->signal->stop_state < 0) { - pid_t pid; - uid_t uid; - - if (!(options & WNOWAIT)) - p->signal->stop_state = 0; - spin_unlock_irq(&p->sighand->siglock); - pid = p->pid; - uid = p->uid; - get_task_struct(p); - read_unlock(&tasklist_lock); - retval = wait_noreap_copyout(p, pid, - uid, CLD_CONTINUED, - SIGCONT, infop, ru); - BUG_ON(retval == 0); + retval = wait_task_continued( + p, (options & WNOWAIT), + infop, stat_addr, ru); + if (retval != 0) /* He released the lock. 
*/ goto end; - } - spin_unlock_irq(&p->sighand->siglock); break; } } @@ -1412,7 +1448,8 @@ asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, { long ret; - if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) + if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); -- cgit v1.2.3 From cfc4f957ff7bffa6aaf7e4029ee9c8689b43c366 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 18 Oct 2004 08:53:35 -0700 Subject: [PATCH] fix PTRACE_ATTACH race with real parent's wait calls There is a race between PTRACE_ATTACH and the real parent calling wait. For a moment, the task is put in PT_PTRACED but with its parent still pointing to its real_parent. In this circumstance, if the real parent calls wait without the WUNTRACED flag, he can see a stopped child status, which wait should never return without WUNTRACED when the caller is not using ptrace. Here it is not the caller that is using ptrace, but some third party. This patch avoids this race condition by adding the PT_ATTACHED flag to distinguish a real parent from a ptrace_attach parent when PT_PTRACED is set, and then having wait use this flag to confirm that things are in order and not consider the child ptraced when its ->ptrace flags are set but its parent links have not yet been switched. (ptrace_check_attach also uses it similarly to rule out a possible race with a bogus ptrace call by the real parent during ptrace_attach.) While looking into this, I noticed that every arch's sys_execve has: current->ptrace &= ~PT_DTRACE; with no locking at all. So, if an exec happens in a race with PTRACE_ATTACH, you could wind up with ->ptrace not having PT_PTRACED set because this store clobbered it. That will cause later BUG hits because the parent links indicate ptracedness but the flag is not set. The patch corrects all the places I found to use task_lock around diddling ->ptrace when it's possible to be racing with ptrace_attach. (The ptrace operation code itself doesn't have this issue because it already excludes anyone else being in ptrace_attach.) Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/process.c | 2 ++ arch/m32r/kernel/process.c | 5 ++++- arch/parisc/hpux/fs.c | 5 ++++- arch/parisc/kernel/process.c | 5 ++++- arch/parisc/kernel/sys_parisc32.c | 5 ++++- arch/ppc/kernel/process.c | 5 ++++- arch/ppc64/kernel/process.c | 5 ++++- arch/ppc64/kernel/sys_ppc32.c | 5 ++++- arch/s390/kernel/compat_linux.c | 2 ++ arch/s390/kernel/process.c | 2 ++ arch/sh/kernel/process.c | 5 ++++- arch/sh64/kernel/process.c | 5 ++++- arch/sparc/kernel/process.c | 5 ++++- arch/sparc64/kernel/process.c | 2 ++ arch/sparc64/kernel/sys_sparc32.c | 2 ++ arch/um/kernel/exec_kern.c | 2 ++ arch/x86_64/ia32/sys_ia32.c | 5 ++++- arch/x86_64/kernel/process.c | 5 ++++- include/linux/ptrace.h | 1 + kernel/exit.c | 20 ++++++++++++++++++-- kernel/ptrace.c | 5 +++-- 21 files changed, 82 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 0095fa1dda6f..45df20c2e77d 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -656,7 +656,9 @@ asmlinkage int sys_execve(struct pt_regs regs) (char __user * __user *) regs.edx, ®s); if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); /* Make sure we don't return using sysenter.. 
*/ set_thread_flag(TIF_IRET); } diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 9e7de27a8e0d..c301e0e61173 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -335,8 +335,11 @@ asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv, c goto out; error = do_execve(filename, uargv, uenvp, ®s); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: return error; diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c index 0800eb3eade6..d7c80edf4489 100644 --- a/arch/parisc/hpux/fs.c +++ b/arch/parisc/hpux/fs.c @@ -43,8 +43,11 @@ int hpux_execve(struct pt_regs *regs) error = do_execve(filename, (char **) regs->gr[25], (char **)regs->gr[24], regs); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index d7365b958f7e..320fca55fa1a 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -363,8 +363,11 @@ asmlinkage int sys_execve(struct pt_regs *regs) goto out; error = do_execve(filename, (char **) regs->gr[25], (char **) regs->gr[24], regs); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index e78332541b18..2b42313c1017 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -80,8 +80,11 @@ asmlinkage int sys32_execve(struct pt_regs *regs) goto out; error = compat_do_execve(filename, compat_ptr(regs->gr[25]), compat_ptr(regs->gr[24]), regs); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c index d9ab6a7de95c..94298ebecabe 100644 --- a/arch/ppc/kernel/process.c +++ b/arch/ppc/kernel/process.c @@ -598,8 +598,11 @@ int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, preempt_enable(); error = do_execve(filename, (char __user *__user *) a1, (char __user *__user *) a2, regs); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: return error; diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index 8211337074cb..1529ba6db632 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -512,8 +512,11 @@ int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, error = do_execve(filename, (char __user * __user *) a1, (char __user * __user *) a2, regs); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: diff --git a/arch/ppc64/kernel/sys_ppc32.c b/arch/ppc64/kernel/sys_ppc32.c index 710b7cd9ec47..6a514b3d977c 100644 --- a/arch/ppc64/kernel/sys_ppc32.c +++ b/arch/ppc64/kernel/sys_ppc32.c @@ -621,8 +621,11 @@ long sys32_execve(unsigned long a0, unsigned long a1, unsigned long a2, error = compat_do_execve(filename, compat_ptr(a1), compat_ptr(a2), regs); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: diff --git a/arch/s390/kernel/compat_linux.c 
b/arch/s390/kernel/compat_linux.c index 5c0a63aff939..01f57183d9ba 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -751,7 +751,9 @@ sys32_execve(struct pt_regs regs) compat_ptr(regs.gprs[4]), ®s); if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); current->thread.fp_regs.fpc=0; __asm__ __volatile__ ("sr 0,0\n\t" diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 5d56e77c74ee..566b34f4c394 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -340,7 +340,9 @@ asmlinkage long sys_execve(struct pt_regs regs) error = do_execve(filename, (char __user * __user *) regs.gprs[3], (char __user * __user *) regs.gprs[4], ®s); if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); current->thread.fp_regs.fpc = 0; if (MACHINE_HAS_IEEE) asm volatile("sfpc %0,%0" : : "d" (0)); diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index c9a43c8df32f..11019fa5ab86 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -481,8 +481,11 @@ asmlinkage int sys_execve(char *ufilename, char **uargv, (char __user * __user *)uargv, (char __user * __user *)uenvp, ®s); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: return error; diff --git a/arch/sh64/kernel/process.c b/arch/sh64/kernel/process.c index 13cec35796ab..68f764389781 100644 --- a/arch/sh64/kernel/process.c +++ b/arch/sh64/kernel/process.c @@ -862,8 +862,11 @@ asmlinkage int sys_execve(char *ufilename, char **uargv, (char __user * __user *)uargv, (char __user * __user *)uenvp, pregs); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); out: unlock_kernel(); diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index 1dc918135eb3..f32eaf5b6256 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -670,8 +670,11 @@ asmlinkage int sparc_execve(struct pt_regs *regs) (char __user * __user *)regs->u_regs[base + UREG_I2], regs); putname(filename); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } out: return error; } diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index f3e3c657e9cb..56fecbdae8bc 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -829,7 +829,9 @@ asmlinkage int sparc_execve(struct pt_regs *regs) current_thread_info()->xfsr[0] = 0; current_thread_info()->fpsaved[0] = 0; regs->tstate &= ~TSTATE_PEF; + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); } out: return error; diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index b81f15521e86..28014dc74194 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -1274,7 +1274,9 @@ asmlinkage long sparc32_execve(struct pt_regs *regs) current_thread_info()->xfsr[0] = 0; current_thread_info()->fpsaved[0] = 0; regs->tstate &= ~TSTATE_PEF; + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); } out: return error; diff --git a/arch/um/kernel/exec_kern.c b/arch/um/kernel/exec_kern.c index a5f21283d2cd..f6c84475bb83 100644 --- a/arch/um/kernel/exec_kern.c +++ b/arch/um/kernel/exec_kern.c @@ -43,7 +43,9 @@ static int execve1(char *file, char **argv, char **env) 
#endif error = do_execve(file, argv, env, ¤t->thread.regs); if (error == 0){ + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); set_cmdline(current_cmd()); } return(error); diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 50465345140a..a854fb32963a 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -1135,8 +1135,11 @@ asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, if (IS_ERR(filename)) return error; error = compat_do_execve(filename, argv, envp, regs); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); return error; } diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 6e835be5f26a..73dfb27a5352 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -542,8 +542,11 @@ long sys_execve(char __user *name, char __user * __user *argv, if (IS_ERR(filename)) return error; error = do_execve(filename, argv, envp, ®s); - if (error == 0) + if (error == 0) { + task_lock(current); current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } putname(filename); return error; } diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 53132cd80429..f18247125091 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -63,6 +63,7 @@ #define PT_TRACE_EXEC 0x00000080 #define PT_TRACE_VFORK_DONE 0x00000100 #define PT_TRACE_EXIT 0x00000200 +#define PT_ATTACHED 0x00000400 /* parent != real_parent */ #define PT_TRACE_MASK 0x000003f4 diff --git a/kernel/exit.c b/kernel/exit.c index 426d3ae722ba..ca9a9e21c444 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1280,6 +1280,22 @@ static int wait_task_continued(task_t *p, int noreap, } +static inline int my_ptrace_child(struct task_struct *p) +{ + if (!(p->ptrace & PT_PTRACED)) + return 0; + if (!(p->ptrace & PT_ATTACHED)) + return 1; + /* + * This child was PTRACE_ATTACH'd. We should be seeing it only if + * we are the attacher. If we are the real parent, this is a race + * inside ptrace_attach. It is waiting for the tasklist_lock, + * which we have to switch the parent links, but has already set + * the flags in p->ptrace. 
+ */ + return (p->parent != p->real_parent); +} + static long do_wait(pid_t pid, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1308,12 +1324,12 @@ repeat: switch (p->state) { case TASK_TRACED: - if (!(p->ptrace & PT_PTRACED)) + if (!my_ptrace_child(p)) continue; /*FALLTHROUGH*/ case TASK_STOPPED: if (!(options & WUNTRACED) && - !(p->ptrace & PT_PTRACED)) + !my_ptrace_child(p)) continue; retval = wait_task_stopped(p, ret == 2, (options & WNOWAIT), diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b14b4a467729..09ba057222c3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -82,7 +82,8 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ read_lock(&tasklist_lock); if ((child->ptrace & PT_PTRACED) && child->parent == current && - child->signal != NULL) { + (!(child->ptrace & PT_ATTACHED) || child->real_parent != current) + && child->signal != NULL) { ret = 0; spin_lock_irq(&child->sighand->siglock); if (child->state == TASK_STOPPED) { @@ -131,7 +132,7 @@ int ptrace_attach(struct task_struct *task) goto bad; /* Go */ - task->ptrace |= PT_PTRACED; + task->ptrace |= PT_PTRACED | PT_ATTACHED; if (capable(CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; task_unlock(task); -- cgit v1.2.3 From 40e39ce0f4eceee04555c45ae9918a017cd1686c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Oct 2004 08:53:48 -0700 Subject: [PATCH] softirqs: fix latency of softirq processing The attached patch fixes a local_bh_enable() buglet: we first enabled softirqs then did we do local_softirq_pending() - often this is preemptible code. So this task could be preempted and there's no guarantee that softirq processing will occur (except the periodic timer tick). The race window is small but existent. This could result in packet processing latencies or timer expiration latencies - hard to detect and annoying bugs. The fix is to invoke softirqs with softirqs enabled but preemption still disabled. Patch is against 2.6.9-rc2-mm1. Signed-off-by: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 4a3da9be9f26..0e1e1356e35a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -137,11 +137,17 @@ EXPORT_SYMBOL(do_softirq); void local_bh_enable(void) { - __local_bh_enable(); WARN_ON(irqs_disabled()); - if (unlikely(!in_interrupt() && - local_softirq_pending())) + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + preempt_count() -= SOFTIRQ_OFFSET - 1; + + if (unlikely(!in_interrupt() && local_softirq_pending())) invoke_softirq(); + + dec_preempt_count(); preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable); -- cgit v1.2.3 From 09b9135c6e9950c0f12e3e6993ae52ab1baf0476 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Oct 2004 08:54:02 -0700 Subject: [PATCH] add missing linux/syscalls.h includes I found that the prototypes for sys_waitid and sys_fcntl in don't match the implementation. In order to keep all prototypes in sync in the future, now include the header from each file implementing any syscall. 
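As a minimal illustration of the idea (hypothetical names, not part of this patch): when the file that defines a syscall also includes the header that declares it, the compiler flags any drift between declaration and definition instead of letting a stale prototype linger unnoticed.

/* example_syscalls.h -- hypothetical declaration header */
long sys_example(unsigned int fd, unsigned long arg);

/* example.c -- hypothetical implementation */
#include "example_syscalls.h"	/* makes the declaration visible here */

/* If this definition drifts from the header (say, 'int fd' instead of
 * 'unsigned int fd'), the compiler now reports conflicting types for
 * sys_example rather than silently accepting two different prototypes. */
long sys_example(unsigned int fd, unsigned long arg)
{
	return 0;
}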
Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pci/syscall.c | 1 + fs/aio.c | 1 + fs/buffer.c | 1 + fs/dcache.c | 1 + fs/dcookies.c | 1 + fs/fcntl.c | 3 ++- fs/filesystems.c | 1 + fs/ioctl.c | 1 + fs/locks.c | 1 + fs/namei.c | 1 + fs/namespace.c | 1 + fs/nfsctl.c | 1 + fs/open.c | 1 + fs/quota.c | 1 + fs/read_write.c | 1 + fs/readdir.c | 1 + fs/select.c | 1 + fs/stat.c | 1 + fs/super.c | 1 + fs/xattr.c | 1 + init/do_mounts_devfs.c | 1 - ipc/mqueue.c | 1 + ipc/msg.c | 1 + ipc/sem.c | 1 + ipc/shm.c | 1 + kernel/acct.c | 1 + kernel/capability.c | 1 + kernel/exec_domain.c | 1 + kernel/exit.c | 1 + kernel/itimer.c | 1 + kernel/panic.c | 1 - kernel/posix-timers.c | 1 + kernel/printk.c | 1 + kernel/sched.c | 1 + kernel/signal.c | 1 + kernel/sys.c | 5 +++++ kernel/sysctl.c | 1 + kernel/time.c | 1 + kernel/timer.c | 1 + mm/fadvise.c | 1 + mm/filemap.c | 1 + mm/fremap.c | 1 + mm/madvise.c | 1 + mm/mincore.c | 1 + mm/mlock.c | 1 + mm/mprotect.c | 1 + mm/mremap.c | 1 + mm/msync.c | 1 + mm/nommu.c | 1 + mm/swapfile.c | 1 + 50 files changed, 53 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index e9654640de4b..913a1e44652a 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -11,6 +11,7 @@ #include #include #include +#include #include diff --git a/fs/aio.c b/fs/aio.c index 0cd2c6a10e73..23d5f00f2bee 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -14,6 +14,7 @@ #include #include #include +#include #define DEBUG 0 diff --git a/fs/buffer.c b/fs/buffer.c index 01c725823f19..4ec2acb57946 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/fs/dcache.c b/fs/dcache.c index f938ad50ffc6..ec5668a4136e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include diff --git a/fs/dcookies.c b/fs/dcookies.c index 8ecfe5ecb3c3..581aac959cd3 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/fs/fcntl.c b/fs/fcntl.c index 0287312d03b3..ee380e7b9569 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -4,6 +4,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include #include #include #include @@ -362,7 +363,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, return err; } -asmlinkage long sys_fcntl(int fd, unsigned int cmd, unsigned long arg) +asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) { struct file *filp; long err = -EBADF; diff --git a/fs/filesystems.c b/fs/filesystems.c index 276be78d5f32..af0fd4973880 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -6,6 +6,7 @@ * table of configured filesystems */ +#include #include #include #include diff --git a/fs/ioctl.c b/fs/ioctl.c index c84ce99576a8..88e5b995021a 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include diff --git a/fs/locks.c b/fs/locks.c index 47efea9a624b..efdea3c64a41 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -122,6 +122,7 @@ #include #include #include +#include #include #include diff --git a/fs/namei.c b/fs/namei.c index 32244e239583..1d8914a76146 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/namespace.c b/fs/namespace.c index 2c5cc697a18d..961b6ae00458 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -9,6 +9,7 @@ */ #include +#include 
#include #include #include diff --git a/fs/nfsctl.c b/fs/nfsctl.c index aaf9f5640eb3..0b14938b5b62 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include /* diff --git a/fs/open.c b/fs/open.c index 817bef771912..95266c0bafb5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -22,6 +22,7 @@ #include #include #include +#include #include diff --git a/fs/quota.c b/fs/quota.c index 3dd9be6b2377..57e4f261e660 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -13,6 +13,7 @@ #include #include #include +#include /* Check validity of quotactl */ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) diff --git a/fs/read_write.c b/fs/read_write.c index d85431d5d09e..d445440b0f48 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/fs/readdir.c b/fs/readdir.c index fbea474bdb78..de97df7f5e6d 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/fs/select.c b/fs/select.c index dfff0ad1baf5..e0a87cb4f733 100644 --- a/fs/select.c +++ b/fs/select.c @@ -14,6 +14,7 @@ * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ +#include #include #include #include diff --git a/fs/stat.c b/fs/stat.c index 6bda9bfae2fb..b8a0e5110ab2 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/fs/super.c b/fs/super.c index 6f8960dbef68..1bcd6eff30a5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -32,6 +32,7 @@ #include /* for fsync_super() */ #include #include +#include #include #include /* for the emergency remount stuff */ #include diff --git a/fs/xattr.c b/fs/xattr.c index 24bd5427d4f4..a71900b5349a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -13,6 +13,7 @@ #include #include #include +#include #include /* diff --git a/init/do_mounts_devfs.c b/init/do_mounts_devfs.c index e1be9c28f9ef..cc526474690a 100644 --- a/init/do_mounts_devfs.c +++ b/init/do_mounts_devfs.c @@ -2,7 +2,6 @@ #include #include #include -#include #include "do_mounts.h" diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 5cfcd4b68cb3..71634b6a08fd 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/ipc/msg.c b/ipc/msg.c index a992a1fdb4ec..93d46e9352f6 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include "util.h" diff --git a/ipc/sem.c b/ipc/sem.c index 32acee680759..06f756db359a 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/ipc/shm.c b/ipc/shm.c index 530fb1f6ae1d..f5ca90c8addb 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/kernel/acct.c b/kernel/acct.c index 52d49907c0f7..15ab0324139d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include /* sector_div */ diff --git a/kernel/capability.c b/kernel/capability.c index 7e864e2ccf6a..7800a5066c0f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -10,6 +10,7 @@ #include #include #include +#include #include unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 5fba35304459..ad3e5d54e119 100644 --- 
a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index ca9a9e21c444..031e17486eee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/kernel/itimer.c b/kernel/itimer.c index 6918cb7460a8..95fbf1c6becf 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/kernel/panic.c b/kernel/panic.c index b3abe97f88a6..fce7f4030d0a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7ba8941b834b..c2dc4f89da41 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include diff --git a/kernel/printk.c b/kernel/printk.c index 18dba205c171..c02ec626f384 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -30,6 +30,7 @@ #include #include #include +#include #include diff --git a/kernel/sched.c b/kernel/sched.c index 01e232255050..77fd68ef15a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 44c0223cca3e..9f6bd72bdd82 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index e2b57d6f6038..a95e3900dc1e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -24,6 +24,11 @@ #include #include +/* Don't include this - it breaks ia64's cond_syscall() implementation */ +#if 0 +#include +#endif + #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99a0af0ed9a8..469cf0c2f26e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include diff --git a/kernel/time.c b/kernel/time.c index 167a72a078ea..0b4797fa3d30 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/kernel/timer.c b/kernel/timer.c index 9616e707154e..66486fb30dab 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/mm/fadvise.c b/mm/fadvise.c index f479387a5486..57264d74b8bf 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -15,6 +15,7 @@ #include #include #include +#include #include diff --git a/mm/filemap.c b/mm/filemap.c index ba5b903ff585..3935097dc5cb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * This is needed for the following functions: * - try_to_release_page diff --git a/mm/fremap.c b/mm/fremap.c index fcd615827159..1a1c8489a668 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index 0439c560e0b4..875a871b73ba 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -7,6 +7,7 @@ #include #include +#include /* diff --git a/mm/mincore.c b/mm/mincore.c index 280abef57c5f..54ec41221049 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/mm/mlock.c b/mm/mlock.c index 662a52109d36..9cdfcf0036fe 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -7,6 +7,7 @@ #include 
#include +#include static int mlock_fixup(struct vm_area_struct * vma, diff --git a/mm/mprotect.c b/mm/mprotect.c index 67d02dc0ea04..befda287dff3 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index 558830585bac..cd8af8a8ba42 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/mm/msync.c b/mm/msync.c index 2aaad0418e62..52f7b1bd8433 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/mm/nommu.c b/mm/nommu.c index 9cac9ae08bb0..68e6b32dea6a 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index b7ecb1bb2014..55bb488aa10f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 0105467f1b4a7545f2a2c38a82e0b283bb3bc9b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Oct 2004 08:55:12 -0700 Subject: [PATCH] fix the prof=schedule feature Fix mismerge of the "prof=schedule" feature. Without this patch the output is a boring empty profile. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 4 ++-- kernel/sched.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index cab14764cea0..1c4375fad923 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -22,14 +22,14 @@ static int __init profile_setup(char * str) int par; if (!strncmp(str, "schedule", 8)) { - prof_on = 2; + prof_on = SCHED_PROFILING; printk(KERN_INFO "kernel schedule profiling enabled\n"); if (str[7] == ',') str += 8; } if (get_option(&str,&par)) { prof_shift = par; - prof_on = 1; + prof_on = CPU_PROFILING; printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", prof_shift); } diff --git a/kernel/sched.c b/kernel/sched.c index 77fd68ef15a5..1d0b0c13e93e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2647,6 +2647,7 @@ asmlinkage void __sched schedule(void) dump_stack(); } } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); need_resched: preempt_disable(); @@ -3223,7 +3224,6 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) policy != SCHED_NORMAL) goto out_unlock; } - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); /* * Valid priorities for SCHED_FIFO and SCHED_RR are -- cgit v1.2.3 From 2dbc57298d84fe37d855c8bfa3b280dfd47ded4b Mon Sep 17 00:00:00 2001 From: Gregory Kurz Date: Mon, 18 Oct 2004 08:55:24 -0700 Subject: [PATCH] fork() bug invalidates file descriptors Take a process P1 that spawns a thread T (aka. a clone with CLONE_FILES). If P1 forks another process P2 (aka. not a clone) while T is blocked in a open() that should return file descriptor FD, then FD will be unusable in P2. This leads to strange behaviors in the context of P2: close(FD) returns EBADF, while dup2(a_valid_fd, FD) returns EBUSY and of course FD is never returned again by any syscall... testcase: #include #include #include #include #include #include #include #include #include #define FIFO "/tmp/bug_fifo" #define FD 0 /* * This program is meant to show that calling fork() while a clone spawned * with CLONE_FILES is blocked in open() makes a fd number unusable in the * child. 
* * * Parent Clone Child * | * clone(CLONE_FILES)- --- kernel/fork.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6fd57c2d22cc..bd33d8a507d7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -762,8 +762,17 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; - if (f) + if (f) { get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. + */ + FD_CLR(open_files - i, newf->open_fds); + } *new_fds++ = f; } spin_unlock(&oldf->file_lock); -- cgit v1.2.3 From 653853bcaf83a235f3109d00f4ae4af39f9a5872 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Oct 2004 08:55:37 -0700 Subject: [PATCH] generic irq subsystem: core The main goal of this patch is to consolidate all the different but still fundamentally similar arch/*/kernel/irq.c code into the kernel/irq/ subsystem. There are 4 new files in the kernel/irq/ directory: - handle.c: core bits: __do_IRQ() and handle_IRQ_event(), callable from arch-specific irq.c code. - manage.c: the main driver apis - spurious.c: the handling of buggy interrupt sources. - autoprobe.c: probing of interrupts - older code but still in use. - proc.c: /proc/irq/ code. - internals.h for irq-core-internal interfaces not visible to drivers nor arch PIC code. An architecture enables the generic hardirq code by defining CONFIG_GENERIC_HARDIRQS in its arch Kconfig. People doing this conversion should check out the x86/x64/ppc/ppc64 patches for details - the conversion is quite straightforward but every converted function (i.e. every function removed from the arch irq.c) _must_ be matched to the generic version and if there is any detail that the generic code should do it has to be added to the generic code. All of the currently converted 4 architectures were converted like that, and the generic code was extended/fixed along the way. Other changes related to this patchset: - clean up the irq include files (linux/irq.h, linux/interrupt.h, linux/hardirq.h) and consolidate asm-*/[hard]irq.h. Note, to keep all non-touched architectures in an untouched state this consolidation is done carefully and strictly under CONFIG_GENERIC_HARDIRQS. Once the consolidation is done we can do a couple of final cleanups to reach the following logical splitup of 3 include files: linux/interrupt.h: driver-visible APIs and details linux/irq.h: core irq and arch-PIC code, internals asm-*/irq.h: arch PIC and irq delivery details the following include files will likely vanish: linux/hardirq.h merges into linux/irq.h asm-*/hardirq.h: merges into asm-*/irq.h asm-*/hw_irq.h: merges into asm-*/irq.h Christoph would like to do these once the current wave of cleanups gets in. 
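As a rough sketch of what a converted architecture is left with (illustrative only, not code from this patch): the arch keeps its PIC drivers and low-level entry stub, defines CONFIG_GENERIC_HARDIRQS, and hands every interrupt to the generic layer via __do_IRQ(), bracketed by the irq_enter()/irq_exit() helpers this patch introduces.

#include <linux/irq.h>
#include <linux/hardirq.h>

/* Hypothetical arch-level entry point, called from the low-level
 * interrupt assembly stub once the vector has been decoded. */
asmlinkage unsigned int do_IRQ(unsigned int irq, struct pt_regs *regs)
{
	irq_enter();		/* account the hardirq in preempt_count() */
	__do_IRQ(irq, regs);	/* generic ack, dispatch and end via irq_desc[irq] */
	irq_exit();		/* run pending softirqs if this was the outermost irq */
	return 1;
}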
Signed-off-by: Ingo Molnar Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hardirq.h | 42 ++++++ include/linux/interrupt.h | 9 ++ include/linux/irq.h | 16 ++- kernel/Makefile | 1 + kernel/irq/Makefile | 4 + kernel/irq/autoprobe.c | 188 +++++++++++++++++++++++++ kernel/irq/handle.c | 204 +++++++++++++++++++++++++++ kernel/irq/internals.h | 18 +++ kernel/irq/manage.c | 347 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/proc.c | 155 +++++++++++++++++++++ kernel/irq/spurious.c | 96 +++++++++++++ 11 files changed, 1079 insertions(+), 1 deletion(-) create mode 100644 kernel/irq/Makefile create mode 100644 kernel/irq/autoprobe.c create mode 100644 kernel/irq/handle.c create mode 100644 kernel/irq/internals.h create mode 100644 kernel/irq/manage.c create mode 100644 kernel/irq/proc.c create mode 100644 kernel/irq/spurious.c (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 7e1aad3947e7..97343be12ad8 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -5,6 +5,40 @@ #include #include +#ifdef CONFIG_GENERIC_HARDIRQS +/* + * We put the hardirq and softirq counter into the preemption + * counter. The bitmask has the following meaning: + * + * - bits 0-7 are the preemption count (max preemption depth: 256) + * - bits 8-15 are the softirq count (max # of softirqs: 256) + * - bits 16-27 are the hardirq count (max # of hardirqs: 4096) + * + * - ( bit 26 is the PREEMPT_ACTIVE flag. ) + * + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 + * HARDIRQ_MASK: 0x0fff0000 + */ + +#define PREEMPT_BITS 8 +#define SOFTIRQ_BITS 8 +#define HARDIRQ_BITS 12 + +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) + +/* + * The hardirq mask has to be large enough to have + * space for potentially all IRQ sources in the system + * nesting on a single CPU: + */ +#if (1 << HARDIRQ_BITS) < NR_IRQS +# error HARDIRQ_BITS is too low! +#endif +#endif /* CONFIG_GENERIC_HARDIRQS */ + #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) @@ -43,4 +77,12 @@ extern void synchronize_irq(unsigned int irq); # define synchronize_irq(irq) barrier() #endif +#ifdef CONFIG_GENERIC_HARDIRQS +#define nmi_enter() (preempt_count() += HARDIRQ_OFFSET) +#define nmi_exit() (preempt_count() -= HARDIRQ_OFFSET) + +#define irq_enter() (preempt_count() += HARDIRQ_OFFSET) +extern void irq_exit(void); +#endif + #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c7bf37959009..2b0f7331f9c3 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -40,6 +40,8 @@ struct irqaction { const char *name; void *dev_id; struct irqaction *next; + int irq; + struct proc_dir_entry *dir; }; extern irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs); @@ -48,6 +50,13 @@ extern int request_irq(unsigned int, unsigned long, const char *, void *); extern void free_irq(unsigned int, void *); + +#ifdef CONFIG_GENERIC_HARDIRQS +extern void disable_irq_nosync(unsigned int irq); +extern void disable_irq(unsigned int irq); +extern void enable_irq(unsigned int irq); +#endif + /* * Temporary defines for UP kernels, until all code gets fixed. 
*/ diff --git a/include/linux/irq.h b/include/linux/irq.h index 5bc740d9bc47..012315f5665c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -13,6 +13,7 @@ #if !defined(CONFIG_ARCH_S390) +#include #include #include #include @@ -71,7 +72,20 @@ extern irq_desc_t irq_desc [NR_IRQS]; #include /* the arch dependent stuff */ -extern int setup_irq(unsigned int , struct irqaction * ); +extern int setup_irq(unsigned int irq, struct irqaction * new); + +#ifdef CONFIG_GENERIC_HARDIRQS +extern cpumask_t irq_affinity[NR_IRQS]; + +extern asmlinkage int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, + struct irqaction *action); +extern asmlinkage unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs); +extern void note_interrupt(unsigned int irq, irq_desc_t *desc, int action_ret); +extern void report_bad_irq(unsigned int irq, irq_desc_t *desc, int action_ret); +extern int can_request_irq(unsigned int irq, unsigned long irqflags); + +extern void init_irq_proc(void); +#endif extern hw_irq_controller no_irq_type; /* needed in every arch ? */ diff --git a/kernel/Makefile b/kernel/Makefile index 5e65d330ca29..fd79e9348117 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile new file mode 100644 index 000000000000..66d3be6e4da8 --- /dev/null +++ b/kernel/irq/Makefile @@ -0,0 +1,4 @@ + +obj-y := autoprobe.o handle.o manage.o spurious.o +obj-$(CONFIG_PROC_FS) += proc.o + diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c new file mode 100644 index 000000000000..16818726cd21 --- /dev/null +++ b/kernel/irq/autoprobe.c @@ -0,0 +1,188 @@ +/* + * linux/kernel/irq/autoprobe.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the interrupt probing code and driver APIs. + */ + +#include +#include +#include + +/* + * Autodetection depends on the fact that any interrupt that + * comes in on to an unassigned handler will get stuck with + * "IRQ_WAITING" cleared and the interrupt disabled. + */ +static DECLARE_MUTEX(probe_sem); + +/** + * probe_irq_on - begin an interrupt autodetect + * + * Commence probing for an interrupt. The interrupts are scanned + * and a mask of potential interrupt lines is returned. + * + */ +unsigned long probe_irq_on(void) +{ + unsigned long val, delay; + irq_desc_t *desc; + unsigned int i; + + down(&probe_sem); + /* + * something may have generated an irq long ago and we want to + * flush such a longstanding irq before considering it as spurious. + */ + for (i = NR_IRQS-1; i > 0; i--) { + desc = irq_desc + i; + + spin_lock_irq(&desc->lock); + if (!irq_desc[i].action) + irq_desc[i].handler->startup(i); + spin_unlock_irq(&desc->lock); + } + + /* Wait for longstanding interrupts to trigger. 
*/ + for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) + /* about 20ms delay */ barrier(); + + /* + * enable any unassigned irqs + * (we must startup again here because if a longstanding irq + * happened in the previous stage, it may have masked itself) + */ + for (i = NR_IRQS-1; i > 0; i--) { + desc = irq_desc + i; + + spin_lock_irq(&desc->lock); + if (!desc->action) { + desc->status |= IRQ_AUTODETECT | IRQ_WAITING; + if (desc->handler->startup(i)) + desc->status |= IRQ_PENDING; + } + spin_unlock_irq(&desc->lock); + } + + /* + * Wait for spurious interrupts to trigger + */ + for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) + /* about 100ms delay */ barrier(); + + /* + * Now filter out any obviously spurious interrupts + */ + val = 0; + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + /* It triggered already - consider it spurious. */ + if (!(status & IRQ_WAITING)) { + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } else + if (i < 32) + val |= 1 << i; + } + spin_unlock_irq(&desc->lock); + } + + return val; +} + +EXPORT_SYMBOL(probe_irq_on); + +/** + * probe_irq_mask - scan a bitmap of interrupt lines + * @val: mask of interrupts to consider + * + * Scan the interrupt lines and return a bitmap of active + * autodetect interrupts. The interrupt probe logic state + * is then returned to its previous value. + * + * Note: we need to scan all the irq's even though we will + * only return autodetect irq numbers - just so that we reset + * them all to a known state. + */ +unsigned int probe_irq_mask(unsigned long val) +{ + unsigned int mask; + int i; + + mask = 0; + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + if (i < 16 && !(status & IRQ_WAITING)) + mask |= 1 << i; + + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } + spin_unlock_irq(&desc->lock); + } + up(&probe_sem); + + return mask & val; +} + +/** + * probe_irq_off - end an interrupt autodetect + * @val: mask of potential interrupts (unused) + * + * Scans the unused interrupt lines and returns the line which + * appears to have triggered the interrupt. If no interrupt was + * found then zero is returned. If more than one interrupt is + * found then minus the first candidate is returned to indicate + * their is doubt. + * + * The interrupt probe logic state is returned to its previous + * value. + * + * BUGS: When used in a module (which arguably shouldn't happen) + * nothing prevents two IRQ probe callers from overlapping. The + * results of this are non-optimal. 
+ */ +int probe_irq_off(unsigned long val) +{ + int i, irq_found = 0, nr_irqs = 0; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + if (!(status & IRQ_WAITING)) { + if (!nr_irqs) + irq_found = i; + nr_irqs++; + } + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } + spin_unlock_irq(&desc->lock); + } + up(&probe_sem); + + if (nr_irqs > 1) + irq_found = -irq_found; + return irq_found; +} + +EXPORT_SYMBOL(probe_irq_off); + diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c new file mode 100644 index 000000000000..58ec49133f78 --- /dev/null +++ b/kernel/irq/handle.c @@ -0,0 +1,204 @@ +/* + * linux/kernel/irq/handle.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the core interrupt handling code. + */ + +#include +#include +#include +#include +#include + +#include "internals.h" + +/* + * Linux has a controller-independent interrupt architecture. + * Every controller has a 'controller-template', that is used + * by the main code to do the right thing. Each driver-visible + * interrupt source is transparently wired to the apropriate + * controller. Thus drivers need not be aware of the + * interrupt-controller. + * + * The code is designed to be easily extended with new/different + * interrupt controllers, without having to do assembly magic or + * having to touch the generic code. + * + * Controller mappings for all interrupt sources: + */ +irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { + [0 ... NR_IRQS-1] = { + .handler = &no_irq_type, + .lock = SPIN_LOCK_UNLOCKED + } +}; + +/* + * Generic 'no controller' code + */ +static void end_none(unsigned int irq) { } +static void enable_none(unsigned int irq) { } +static void disable_none(unsigned int irq) { } +static void shutdown_none(unsigned int irq) { } +static unsigned int startup_none(unsigned int irq) { return 0; } + +static void ack_none(unsigned int irq) +{ + /* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themself. + */ + ack_bad_irq(irq); +} + +struct hw_interrupt_type no_irq_type = { + typename: "none", + startup: startup_none, + shutdown: shutdown_none, + enable: enable_none, + disable: disable_none, + ack: ack_none, + end: end_none, + set_affinity: NULL +}; + +/* + * Special, empty irq handler: + */ +irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) +{ + return IRQ_NONE; +} + +/* + * Exit an interrupt context. Process softirqs if needed and possible: + */ +void irq_exit(void) +{ + preempt_count() -= IRQ_EXIT_OFFSET; + if (!in_interrupt() && local_softirq_pending()) + do_softirq(); + preempt_enable_no_resched(); +} + +/* + * Have got an event to handle: + */ +asmlinkage int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, + struct irqaction *action) +{ + int ret, retval = 0, status = 0; + + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + + do { + ret = action->handler(irq, action->dev_id, regs); + if (ret == IRQ_HANDLED) + status |= action->flags; + retval |= ret; + action = action->next; + } while (action); + + if (status & SA_SAMPLE_RANDOM) + add_interrupt_randomness(irq); + local_irq_disable(); + + return retval; +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). 
+ */ +asmlinkage unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) +{ + irq_desc_t *desc = irq_desc + irq; + struct irqaction * action; + unsigned int status; + + kstat_this_cpu.irqs[irq]++; + if (desc->status & IRQ_PER_CPU) { + irqreturn_t action_ret; + + /* + * No locking required for CPU-local interrupts: + */ + desc->handler->ack(irq); + action_ret = handle_IRQ_event(irq, regs, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + desc->handler->end(irq); + return 1; + } + + spin_lock(&desc->lock); + desc->handler->ack(irq); + /* + * REPLAY is when Linux resends an IRQ that was dropped earlier + * WAITING is used by probe to mark irqs that are being tested + */ + status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); + status |= IRQ_PENDING; /* we _want_ to handle it */ + + /* + * If the IRQ is disabled for whatever reason, we cannot + * use the action we have. + */ + action = NULL; + if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { + action = desc->action; + status &= ~IRQ_PENDING; /* we commit to handling */ + status |= IRQ_INPROGRESS; /* we are handling it */ + } + desc->status = status; + + /* + * If there is no IRQ handler or it was disabled, exit early. + * Since we set PENDING, if another processor is handling + * a different instance of this same irq, the other processor + * will take care of it. + */ + if (unlikely(!action)) + goto out; + + /* + * Edge triggered interrupts need to remember + * pending events. + * This applies to any hw interrupts that allow a second + * instance of the same irq to arrive while we are in do_IRQ + * or in the handler. But the code here only handles the _second_ + * instance of the irq, not the third or fourth. So it is mostly + * useful for irq hardware that does not mask cleanly in an + * SMP environment. + */ + for (;;) { + irqreturn_t action_ret; + + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + + spin_lock(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + if (likely(!(desc->status & IRQ_PENDING))) + break; + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; + +out: + /* + * The ->end() handler has to deal with interrupts which got + * disabled while the handler was running. + */ + desc->handler->end(irq); + spin_unlock(&desc->lock); + + return 1; +} + diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h new file mode 100644 index 000000000000..46feba630266 --- /dev/null +++ b/kernel/irq/internals.h @@ -0,0 +1,18 @@ +/* + * IRQ subsystem internal functions and variables: + */ + +extern int noirqdebug; + +#ifdef CONFIG_PROC_FS +extern void register_irq_proc(unsigned int irq); +extern void register_handler_proc(unsigned int irq, struct irqaction *action); +extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); +#else +static inline void register_irq_proc(unsigned int irq) { } +static inline void register_handler_proc(unsigned int irq, + struct irqaction *action) { } +static inline void unregister_handler_proc(unsigned int irq, + struct irqaction *action) { } +#endif + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c new file mode 100644 index 000000000000..a164fe243338 --- /dev/null +++ b/kernel/irq/manage.c @@ -0,0 +1,347 @@ +/* + * linux/kernel/irq/manage.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains driver APIs to the irq subsystem. 
+ */ + +#include +#include +#include +#include + +#include "internals.h" + +#ifdef CONFIG_SMP + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); +} + +EXPORT_SYMBOL(synchronize_irq); + +#endif + +/** + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and Enables are + * nested. + * Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. + * + * This function may be called from IRQ context. + */ +void disable_irq_nosync(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->depth++) { + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +EXPORT_SYMBOL(disable_irq_nosync); + +/** + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are + * nested. + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void disable_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + + disable_irq_nosync(irq); + if (desc->action) + synchronize_irq(irq); +} + +EXPORT_SYMBOL(disable_irq); + +/** + * enable_irq - enable handling of an irq + * @irq: Interrupt to enable + * + * Undoes the effect of one call to disable_irq(). If this + * matches the last disable, processing of interrupts on this + * IRQ line is re-enabled. + * + * This function may be called from IRQ context. + */ +void enable_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + switch (desc->depth) { + case 0: + WARN_ON(1); + break; + case 1: { + unsigned int status = desc->status & ~IRQ_DISABLED; + + desc->status = status; + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status = status | IRQ_REPLAY; + hw_resend_irq(desc->handler,irq); + } + desc->handler->enable(irq); + /* fall-through */ + } + default: + desc->depth--; + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +EXPORT_SYMBOL(enable_irq); + +/* + * Internal function that tells the architecture code whether a + * particular irq has been exclusively allocated or is available + * for driver use. + */ +int can_request_irq(unsigned int irq, unsigned long irqflags) +{ + struct irqaction *action; + + if (irq >= NR_IRQS) + return 0; + + action = irq_desc[irq].action; + if (action) + if (irqflags & action->flags & SA_SHIRQ) + action = NULL; + + return !action; +} + +/* + * Internal function to register an irqaction - typically used to + * allocate special interrupts that are part of the architecture. 
+ */ +int setup_irq(unsigned int irq, struct irqaction * new) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *old, **p; + unsigned long flags; + int shared = 0; + + if (desc->handler == &no_irq_type) + return -ENOSYS; + /* + * Some drivers like serial.c use request_irq() heavily, + * so we have to be careful not to interfere with a + * running system. + */ + if (new->flags & SA_SAMPLE_RANDOM) { + /* + * This function might sleep, we want to call it first, + * outside of the atomic block. + * Yes, this might clear the entropy pool if the wrong + * driver is attempted to be loaded, without actually + * installing a new handler, but is this really a problem, + * only the sysadmin is able to do this. + */ + rand_initialize_irq(irq); + } + + /* + * The following block of code has to be executed atomically + */ + spin_lock_irqsave(&desc->lock,flags); + p = &desc->action; + if ((old = *p) != NULL) { + /* Can't share interrupts unless both agree to */ + if (!(old->flags & new->flags & SA_SHIRQ)) { + spin_unlock_irqrestore(&desc->lock,flags); + return -EBUSY; + } + + /* add new interrupt at end of irq queue */ + do { + p = &old->next; + old = *p; + } while (old); + shared = 1; + } + + *p = new; + + if (!shared) { + desc->depth = 0; + desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | + IRQ_WAITING | IRQ_INPROGRESS); + if (desc->handler->startup) + desc->handler->startup(irq); + else + desc->handler->enable(irq); + } + spin_unlock_irqrestore(&desc->lock,flags); + + new->irq = irq; + register_irq_proc(irq); + new->dir = NULL; + register_handler_proc(irq, new); + + return 0; +} + +/** + * free_irq - free an interrupt + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the + * interrupt line is no longer in use by any driver it is disabled. + * On a shared IRQ the caller must ensure the interrupt is disabled + * on the card it drives before calling this function. The function + * does not return until any executing interrupts for this IRQ + * have completed. + * + * This function must not be called from interrupt context. + */ +void free_irq(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc; + struct irqaction **p; + unsigned long flags; + + if (irq >= NR_IRQS) + return; + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock,flags); + p = &desc->action; + for (;;) { + struct irqaction * action = *p; + + if (action) { + struct irqaction **pp = p; + + p = &action->next; + if (action->dev_id != dev_id) + continue; + + /* Found it - now remove it from the list of entries */ + *pp = action->next; + if (!desc->action) { + desc->status |= IRQ_DISABLED; + if (desc->handler->shutdown) + desc->handler->shutdown(irq); + else + desc->handler->disable(irq); + } + spin_unlock_irqrestore(&desc->lock,flags); + unregister_handler_proc(irq, action); + + /* Make sure it's not being used on another CPU */ + synchronize_irq(irq); + kfree(action); + return; + } + printk(KERN_ERR "Trying to free free IRQ%d\n",irq); + spin_unlock_irqrestore(&desc->lock,flags); + return; + } +} + +EXPORT_SYMBOL(free_irq); + +/** + * request_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. 
From the point this + * call is made your handler function may be invoked. Since + * your handler function must clear any interrupt the board + * raises, you must take care both to initialise your hardware + * and to set up the interrupt handler in the right order. + * + * Dev_id must be globally unique. Normally the address of the + * device data structure is used as the cookie. Since the handler + * receives this value it makes sense to use it. + * + * If your interrupt is shared you must pass a non NULL dev_id + * as this is required when freeing the interrupt. + * + * Flags: + * + * SA_SHIRQ Interrupt is shared + * SA_INTERRUPT Disable local interrupts while processing + * SA_SAMPLE_RANDOM The interrupt can be used for entropy + * + */ +int request_irq(unsigned int irq, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, const char * devname, void *dev_id) +{ + struct irqaction * action; + int retval; + + /* + * Sanity-check: shared interrupts must pass in a real dev-ID, + * otherwise we'll have trouble later trying to figure out + * which interrupt is which (messes up the interrupt freeing + * logic etc). + */ + if ((irqflags & SA_SHIRQ) && !dev_id) + return -EINVAL; + if (irq >= NR_IRQS) + return -EINVAL; + if (!handler) + return -EINVAL; + + action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = irqflags; + cpus_clear(action->mask); + action->name = devname; + action->next = NULL; + action->dev_id = dev_id; + + retval = setup_irq(irq, action); + if (retval) + kfree(action); + + return retval; +} + +EXPORT_SYMBOL(request_irq); + diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c new file mode 100644 index 000000000000..e29a833e065f --- /dev/null +++ b/kernel/irq/proc.c @@ -0,0 +1,155 @@ +/* + * linux/kernel/irq/proc.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the /proc/irq/ handling code. + */ + +#include +#include +#include + +static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; + +#ifdef CONFIG_SMP + +/* + * The /proc/irq//smp_affinity values: + */ +static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; + +cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; + +static int irq_affinity_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); + + if (count - len < 2) + return -EINVAL; + len += sprintf(page + len, "\n"); + return len; +} + +static int irq_affinity_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int irq = (int)(long)data, full_count = count, err; + cpumask_t new_value, tmp; + + if (!irq_desc[irq].handler->set_affinity) + return -EIO; + + err = cpumask_parse(buffer, count, new_value); + if (err) + return err; + + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. 
+ */ + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) + return -EINVAL; + + irq_affinity[irq] = new_value; + irq_desc[irq].handler->set_affinity(irq, + cpumask_of_cpu(first_cpu(new_value))); + + return full_count; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_dir[irq] || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_dir[irq]); +} + +#undef MAX_NAMELEN + +#define MAX_NAMELEN 10 + +void register_irq_proc(unsigned int irq) +{ + char name [MAX_NAMELEN]; + + if (!root_irq_dir || + (irq_desc[irq].handler == &no_irq_type) || + irq_dir[irq]) + return; + + memset(name, 0, MAX_NAMELEN); + sprintf(name, "%d", irq); + + /* create /proc/irq/1234 */ + irq_dir[irq] = proc_mkdir(name, root_irq_dir); + +#ifdef CONFIG_SMP + { + struct proc_dir_entry *entry; + + /* create /proc/irq//smp_affinity */ + entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); + + if (entry) { + entry->nlink = 1; + entry->data = (void *)(long)irq; + entry->read_proc = irq_affinity_read_proc; + entry->write_proc = irq_affinity_write_proc; + } + smp_affinity_entry[irq] = entry; + } +#endif +} + +#undef MAX_NAMELEN + +void unregister_handler_proc(unsigned int irq, struct irqaction *action) +{ + if (action->dir) + remove_proc_entry(action->dir->name, irq_dir[irq]); +} + +void init_irq_proc(void) +{ + int i; + + /* create /proc/irq */ + root_irq_dir = proc_mkdir("irq", NULL); + if (!root_irq_dir) + return; + + /* + * Create entries for all existing IRQs. + */ + for (i = 0; i < NR_IRQS; i++) + register_irq_proc(i); +} + diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c new file mode 100644 index 000000000000..3081611f9a23 --- /dev/null +++ b/kernel/irq/spurious.c @@ -0,0 +1,96 @@ +/* + * linux/kernel/irq/spurious.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains spurious interrupt handling. + */ + +#include +#include +#include +#include + +/* + * If 99,900 of the previous 100,000 interrupts have not been handled + * then assume that the IRQ is stuck in some manner. Drop a diagnostic + * and try to turn the IRQ off. 
+ * + * (The other 100-of-100,000 interrupts may have been a correctly + * functioning device sharing an IRQ with the failing one) + * + * Called under desc->lock + */ + +static void +__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + struct irqaction *action; + + if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { + printk(KERN_ERR "irq event %d: bogus return value %x\n", + irq, action_ret); + } else { + printk(KERN_ERR "irq %d: nobody cared!\n", irq); + } + dump_stack(); + printk(KERN_ERR "handlers:\n"); + action = desc->action; + while (action) { + printk(KERN_ERR "[<%p>]", action->handler); + print_symbol(" (%s)", + (unsigned long)action->handler); + printk("\n"); + action = action->next; + } +} + +void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + static int count = 100; + + if (count > 0) { + count--; + __report_bad_irq(irq, desc, action_ret); + } +} + +void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + if (action_ret != IRQ_HANDLED) { + desc->irqs_unhandled++; + if (action_ret != IRQ_NONE) + report_bad_irq(irq, desc, action_ret); + } + + desc->irq_count++; + if (desc->irq_count < 100000) + return; + + desc->irq_count = 0; + if (desc->irqs_unhandled > 99900) { + /* + * The interrupt is stuck + */ + __report_bad_irq(irq, desc, action_ret); + /* + * Now kill the IRQ + */ + printk(KERN_EMERG "Disabling IRQ #%d\n", irq); + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + desc->irqs_unhandled = 0; +} + +int noirqdebug; + +static int __init noirqdebug_setup(char *str) +{ + noirqdebug = 1; + printk(KERN_INFO "IRQ lockup detection disabled\n"); + return 1; +} + +__setup("noirqdebug", noirqdebug_setup); + -- cgit v1.2.3 From 41b12ce3c47df492fe4591e8099d304ace7c809d Mon Sep 17 00:00:00 2001 From: "Suresh B. Siddha" Date: Mon, 18 Oct 2004 09:02:26 -0700 Subject: [PATCH] Disable SW irqbalance/irqaffinity for E7520/E7320/E7525 v2 As part of the workaround for the "Interrupt message re-ordering across hub interface" errata (page #16 in http://developer.intel.com/design/chipsets/specupdt/30288402.pdf), BIOS may enable hardware IRQ balancing for E7520/E7320/E7525(revision ID 0x9 and below) based platforms. Add pci quirks to disable SW irqbalance/affinity on those platforms. Move balanced_irq_init() to late_initcall so that kirqd will be started after pci quirks. 
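
The late_initcall() move works because of initcall ordering: the PCI final fixups (and therefore quirk_intel_irqbalance) are applied from a device-level initcall, and late_initcall() routines run only after every device initcall has completed. A minimal sketch of the idea, not part of this patch and with an illustrative function name:

    #include <linux/init.h>

    /* Illustrative only: __initcall() registers at the device-initcall
     * level, which is also where the PCI final fixups (and hence
     * quirk_intel_irqbalance) are applied; late_initcall() routines run
     * strictly after all device initcalls, so code here can rely on any
     * flags the quirk may have set. */
    static int __init start_after_quirks(void)
    {
            /* start kirqd, or skip it if the quirk disabled balancing */
            return 0;
    }
    late_initcall(start_after_quirks);
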
Signed-off-by: Suresh Siddha Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 4 ++-- drivers/pci/quirks.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-i386/irq.h | 4 ++++ include/linux/irq.h | 2 ++ include/linux/pci_ids.h | 2 ++ kernel/irq/proc.c | 3 ++- kernel/irq/spurious.c | 2 +- 7 files changed, 61 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 9b65953b2e51..48fa9cc14f65 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -636,7 +636,7 @@ failed: return 0; } -static int __init irqbalance_disable(char *str) +int __init irqbalance_disable(char *str) { irqbalance_disabled = 1; return 0; @@ -653,7 +653,7 @@ static inline void move_irq(int irq) } } -__initcall(balanced_irq_init); +late_initcall(balanced_irq_init); #else /* !CONFIG_IRQBALANCE */ static inline void move_irq(int irq) { } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 164f738df6a7..faaf5d04c05b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -18,6 +18,7 @@ #include #include #include +#include #undef DEBUG @@ -978,11 +979,58 @@ static void __init quirk_intel_ide_combined(struct pci_dev *pdev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_intel_ide_combined ); #endif /* CONFIG_SCSI_SATA */ +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) + +void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +{ + u8 config, rev; + u32 word; + extern struct pci_raw_ops *raw_pci_ops; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * Disable SW irqbalance/affinity on those platforms. + */ + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + if (rev > 0x9) + return; + + printk(KERN_INFO "Intel E7520/7320/7525 detected."); + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + config |= 0x2; + pci_write_config_byte(dev, 0xf4, config); + + /* read xTPR register */ + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + + if (!(word & (1 << 13))) { + printk(KERN_INFO "Disabling irq balancing and affinity\n"); +#ifdef CONFIG_IRQBALANCE + irqbalance_disable(""); +#endif + noirqdebug_setup(""); + no_irq_affinity = 1; + } + + config &= ~0x2; + /* disable access to config space*/ + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +#endif + int pciehp_msi_quirk; static void __devinit quirk_pciehp_msi(struct pci_dev *pdev) { pciehp_msi_quirk = 1; +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) + quirk_intel_irqbalance(pdev); +#endif } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SMCH, quirk_pciehp_msi ); diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 3204b45544f3..05b9e61b0a72 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -34,4 +34,8 @@ extern void release_vm86_irqs(struct task_struct *); # define irq_ctx_init(cpu) do { } while (0) #endif +#ifdef CONFIG_IRQBALANCE +extern int irqbalance_disable(char *str); +#endif + #endif /* _ASM_IRQ_H */ diff --git a/include/linux/irq.h b/include/linux/irq.h index 012315f5665c..fa3677deff3c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -76,6 +76,8 @@ extern int setup_irq(unsigned int irq, struct irqaction * new); 
#ifdef CONFIG_GENERIC_HARDIRQS extern cpumask_t irq_affinity[NR_IRQS]; +extern int no_irq_affinity; +extern int noirqdebug_setup(char *str); extern asmlinkage int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index eed506d0196d..271f3eb29a60 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2205,6 +2205,8 @@ #define PCI_DEVICE_ID_INTEL_82855GM_HB 0x3580 #define PCI_DEVICE_ID_INTEL_82855GM_IG 0x3582 #define PCI_DEVICE_ID_INTEL_SMCH 0x3590 +#define PCI_DEVICE_ID_INTEL_E7320_MCH 0x3592 +#define PCI_DEVICE_ID_INTEL_E7525_MCH 0x359e #define PCI_DEVICE_ID_INTEL_80310 0x530d #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 #define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010 diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e29a833e065f..2ddbe8404c9c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -32,13 +32,14 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off, return len; } +int no_irq_affinity; static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { unsigned int irq = (int)(long)data, full_count = count, err; cpumask_t new_value, tmp; - if (!irq_desc[irq].handler->set_affinity) + if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) return -EIO; err = cpumask_parse(buffer, count, new_value); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3081611f9a23..f6297c306905 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -85,7 +85,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) int noirqdebug; -static int __init noirqdebug_setup(char *str) +int __init noirqdebug_setup(char *str) { noirqdebug = 1; printk(KERN_INFO "IRQ lockup detection disabled\n"); -- cgit v1.2.3 From f871dbd2296a9851d0806c6d0097b4dec6f43f48 Mon Sep 17 00:00:00 2001 From: Vladimir Grouzdev Date: Mon, 18 Oct 2004 09:08:10 -0700 Subject: [PATCH] xtime value may become incorrect The xtime value may become incorrect when the update_wall_time(ticks) function is called with "ticks" > 1. In such a case, the xtime variable is updated multiple times inside the loop but it is normalized only once outside of the loop. This bug was reported at: http://bugme.osdl.org/show_bug.cgi?id=3403 Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 66486fb30dab..0a390a67c0f2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -790,13 +790,12 @@ static void update_wall_time(unsigned long ticks) do { ticks--; update_wall_time_one_tick(); + if (xtime.tv_nsec >= 1000000000) { + xtime.tv_nsec -= 1000000000; + xtime.tv_sec++; + second_overflow(); + } } while (ticks); - - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; - xtime.tv_sec++; - second_overflow(); - } } static inline void do_process_times(struct task_struct *p, -- cgit v1.2.3 From e4dbd222c7ae76cfd0cd68196fa9574322c3c3bf Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:08:22 -0700 Subject: [PATCH] sched: trivial sched changes The following patches properly intergrate sched domains and cpu hotplug (using Nathan's code), by having sched-domains *always* only represent online CPUs, and having hotplug notifier to keep them up to date. 
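
As a rough sketch of that mechanism (not taken from these patches; the callback body and names are illustrative), a subsystem keeps per-cpu state in step with cpu_online_map by registering a CPU hotplug notifier:

    #include <linux/init.h>
    #include <linux/cpu.h>
    #include <linux/notifier.h>

    static int example_cpu_callback(struct notifier_block *nfb,
                                    unsigned long action, void *hcpu)
    {
            switch (action) {
            case CPU_ONLINE:
                    /* (long)hcpu is now in cpu_online_map: rebuild state */
                    break;
            case CPU_DEAD:
                    /* that cpu is gone: tear its state down */
                    break;
            }
            return NOTIFY_OK;
    }

    static struct notifier_block example_cpu_nb = {
            .notifier_call = example_cpu_callback,
    };

    static int __init example_register(void)
    {
            register_cpu_notifier(&example_cpu_nb);
            return 0;
    }
    __initcall(example_register);
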
Then tackle Jesse's domain setup problem: the disjoint top-level domains were completely broken. The group-list builder thingy simply can't handle distinct sets of groups containing the same CPUs. The code is ugly and specific enough that I'm re-introducing the arch overridable domains. I doubt we'll get a proliferation of implementations, because the current generic code can do the job for everyone but SGI. I'd rather take a look at it again down the track if we need to rather than try to shoehorn this into the generic code. Nathan and I have tested the hotplug work. He's happy with it. I've tested the disjoint domain stuff (copied it to i386 for the test), and it does the right thing on the NUMAQ. I've asked Jesse to test it as well, but it should be fine - maybe just help me out and run a test compile on ia64 ;) This really gets sched domains into much better shape. Without further ado, the patches. This patch: Make a definition static and slightly sanitize ifdefs. Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1d0b0c13e93e..089394b00552 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -310,9 +310,7 @@ struct sched_group { /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. This should be read only (except for setup). Although - * it will need to be written to at cpu hot(un)plug time, perhaps the - * cpucontrol semaphore will provide enough exclusion? + * single CPU. This is read only (except for setup, hotplug CPU). */ unsigned long cpu_power; }; @@ -4248,7 +4246,8 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) * in arch code. That defines the number of nearby nodes in a node's top * level scheduling domain. */ -#if defined(CONFIG_NUMA) && defined(SD_NODES_PER_DOMAIN) +#ifdef CONFIG_NUMA +#ifdef SD_NODES_PER_DOMAIN /** * find_next_best_node - find the next node to include in a sched_domain * @node: node whose sched_domain we're building @@ -4295,7 +4294,7 @@ static int __init find_next_best_node(int node, unsigned long *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -cpumask_t __init sched_domain_node_span(int node) +static cpumask_t __init sched_domain_node_span(int node) { int i; cpumask_t span; @@ -4314,12 +4313,13 @@ cpumask_t __init sched_domain_node_span(int node) return span; } -#else /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */ -cpumask_t __init sched_domain_node_span(int node) +#else /* SD_NODES_PER_DOMAIN */ +static cpumask_t __init sched_domain_node_span(int node) { return cpu_possible_map; } -#endif /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */ +#endif /* SD_NODES_PER_DOMAIN */ +#endif /* CONFIG_NUMA */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -- cgit v1.2.3 From d13d28de21d913aacd3c91e76e307fa2eb7835d8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:08:34 -0700 Subject: [PATCH] sched: add CPU_DOWN_PREPARE notifier Add a CPU_DOWN_PREPARE hotplug CPU notifier. This is needed so we can dettach all sched-domains before a CPU goes down, thus we can build domains from online cpumasks, and not have to check for the possibility of a CPU coming up or going down. 
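
A hedged sketch of how a notifier consumer would use the new event (not from this patch; the two helpers are hypothetical stubs): cpu_down() now calls the chain with CPU_DOWN_PREPARE before it starts migrating tasks away, and a NOTIFY_BAD return makes the offline fail with -EINVAL, as the kernel/cpu.c hunk below shows.

    #include <linux/cpu.h>
    #include <linux/notifier.h>

    /* hypothetical helpers, defined only so the sketch is complete */
    static int example_cpu_busy(unsigned int cpu) { return 0; }
    static void example_quiesce_cpu(unsigned int cpu) { }

    static int example_down_callback(struct notifier_block *nfb,
                                     unsigned long action, void *hcpu)
    {
            unsigned int cpu = (unsigned long)hcpu;

            switch (action) {
            case CPU_DOWN_PREPARE:
                    if (example_cpu_busy(cpu))      /* hypothetical check */
                            return NOTIFY_BAD;      /* veto: cpu_down() aborts */
                    example_quiesce_cpu(cpu);       /* hypothetical quiesce step */
                    break;
            case CPU_DEAD:
                    /* the cpu really is gone; release its resources */
                    break;
            }
            return NOTIFY_OK;
    }
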
Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/notifier.h | 9 +++++---- kernel/cpu.c | 9 +++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 900d43e88005..00c996f08851 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -64,10 +64,11 @@ extern int notifier_call_chain(struct notifier_block **n, unsigned long val, voi #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ -#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ -#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ -#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ -#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */ +#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ +#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ +#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ +#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ +#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */ #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 64b8ed3cc8ea..84e78a6560bc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -124,6 +124,15 @@ int cpu_down(unsigned int cpu) goto out; } + err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + (void *)(long)cpu); + if (err == NOTIFY_BAD) { + printk("%s: attempt to take down CPU %u failed\n", + __FUNCTION__, cpu); + err = -EINVAL; + goto out; + } + /* Ensure that we are not runnable on dying cpu */ old_allowed = current->cpus_allowed; tmp = CPU_MASK_ALL; -- cgit v1.2.3 From 5927637cb22349ee77bc4a718e894bdfe22a7420 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:08:46 -0700 Subject: [PATCH] sched: integrate cpu hotplug and sched domains Register a cpu hotplug notifier which reinitializes the scheduler domains hierarchy. The notifier temporarily attaches all running cpus to a "dummy" domain (like we currently do during boot) to avoid balancing. It then calls arch_init_sched_domains which rebuilds the "real" domains and reattaches the cpus to them. Also change __init attributes to __devinit where necessary. Signed-off-by: Nathan Lynch Alterations from Nick Piggin: * Detach all domains in CPU_UP|DOWN_PREPARE notifiers. Reinitialise and reattach in CPU_ONLINE|DEAD|UP_CANCELED. This ensures the domains as seen from the scheduler won't become out of synch with the cpu_online_map. * This allows us to remove runtime cpu_online verifications. Do that. * Dummy domains are __devinitdata. * Remove the hackery in arch_init_sched_domains to work around the fact that the domains used to work with cpu_possible maps, but node_to_cpumask returned a cpu_online map. 
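
The __init to __devinit changes listed above matter because __init code and __initdata are discarded once boot completes, while the notifier added here re-runs domain setup at arbitrary later times. Roughly, as a paraphrase of the init.h convention of this era rather than a quote, the annotations behave like this:

    /* Sketch: with CONFIG_HOTPLUG the __dev* variants keep the code and
     * data resident after boot; without it they decay to the discardable
     * __init/__initdata sections. */
    #ifdef CONFIG_HOTPLUG
    #define __devinit
    #define __devinitdata
    #else
    #define __devinit       __init
    #define __devinitdata   __initdata
    #endif

So arch_init_sched_domains() and the helpers it calls stay resident on hotplug-capable configurations, which is what lets update_sched_domains() invoke them from the notifier long after boot.
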
Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 175 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 104 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 089394b00552..aa0fdc14b3d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1087,8 +1087,7 @@ static int wake_idle(int cpu, task_t *p) if (!(sd->flags & SD_WAKE_IDLE)) return cpu; - cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); + cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) @@ -1640,8 +1639,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, min_cpu = UINT_MAX; min_load = ULONG_MAX; - cpus_and(mask, sd->span, cpu_online_map); - cpus_and(mask, mask, p->cpus_allowed); + cpus_and(mask, sd->span, p->cpus_allowed); for_each_cpu_mask(i, mask) { load = target_load(i); @@ -1893,7 +1891,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; do { - cpumask_t tmp; unsigned long load; int local_group; int i, nr_cpus = 0; @@ -1902,11 +1899,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Tally up the load of all CPUs in the group */ avg_load = 0; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i); @@ -2025,13 +2019,11 @@ out_balanced: */ static runqueue_t *find_busiest_queue(struct sched_group *group) { - cpumask_t tmp; unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; - cpus_and(tmp, group->cpumask, cpu_online_map); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { load = source_load(i); if (load > max_load) { @@ -2232,18 +2224,13 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu) group = sd->groups; do { - cpumask_t tmp; runqueue_t *rq; int push_cpu = 0; if (group == busy_group) goto next_group; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (!cpus_weight(tmp)) - goto next_group; - - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { if (!idle_cpu(i)) goto next_group; push_cpu = i; @@ -2512,7 +2499,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) */ spin_unlock(&this_rq->lock); - cpus_and(sibling_map, sd->span, cpu_online_map); + sibling_map = sd->span; for_each_cpu_mask(i, sibling_map) spin_lock(&cpu_rq(i)->lock); @@ -2557,7 +2544,7 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) * wake_sleeping_dependent(): */ spin_unlock(&this_rq->lock); - cpus_and(sibling_map, sd->span, cpu_online_map); + sibling_map = sd->span; for_each_cpu_mask(i, sibling_map) spin_lock(&cpu_rq(i)->lock); cpu_clear(this_cpu, sibling_map); @@ -4209,7 +4196,10 @@ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(kernel_flag); #ifdef CONFIG_SMP -/* Attach the domain 'sd' to 'cpu' as its base domain */ +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. 
+ */ static void cpu_attach_domain(struct sched_domain *sd, int cpu) { migration_req_t req; @@ -4217,8 +4207,6 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) runqueue_t *rq = cpu_rq(cpu); int local = 1; - lock_cpu_hotplug(); - spin_lock_irqsave(&rq->lock, flags); if (cpu == smp_processor_id() || !cpu_online(cpu)) { @@ -4237,8 +4225,6 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) wake_up_process(rq->migration_thread); wait_for_completion(&req.done); } - - unlock_cpu_hotplug(); } /* @@ -4258,7 +4244,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) * * Should use nodemask_t. */ -static int __init find_next_best_node(int node, unsigned long *used_nodes) +static int __devinit find_next_best_node(int node, unsigned long *used_nodes) { int i, n, val, min_val, best_node = 0; @@ -4294,7 +4280,7 @@ static int __init find_next_best_node(int node, unsigned long *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static cpumask_t __init sched_domain_node_span(int node) +static cpumask_t __devinit sched_domain_node_span(int node) { int i; cpumask_t span; @@ -4314,7 +4300,7 @@ static cpumask_t __init sched_domain_node_span(int node) return span; } #else /* SD_NODES_PER_DOMAIN */ -static cpumask_t __init sched_domain_node_span(int node) +static cpumask_t __devinit sched_domain_node_span(int node) { return cpu_possible_map; } @@ -4324,7 +4310,7 @@ static cpumask_t __init sched_domain_node_span(int node) #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -__init static int cpu_to_cpu_group(int cpu) +static int __devinit cpu_to_cpu_group(int cpu) { return cpu; } @@ -4332,7 +4318,7 @@ __init static int cpu_to_cpu_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -__init static int cpu_to_phys_group(int cpu) +static int __devinit cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -4345,7 +4331,7 @@ __init static int cpu_to_phys_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group sched_group_nodes[MAX_NUMNODES]; -__init static int cpu_to_node_group(int cpu) +static int __devinit cpu_to_node_group(int cpu) { return cpu_to_node(cpu); } @@ -4355,9 +4341,9 @@ __init static int cpu_to_node_group(int cpu) static struct sched_group sched_group_isolated[NR_CPUS]; /* cpus with isolated domains */ -cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE; +cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; -__init static int cpu_to_isolated_group(int cpu) +static int __devinit cpu_to_isolated_group(int cpu) { return cpu; } @@ -4387,7 +4373,7 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -__init static void init_sched_build_groups(struct sched_group groups[], +static void __devinit init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -4421,10 +4407,16 @@ __init static void init_sched_build_groups(struct sched_group groups[], last->next = first; } -__init static void arch_init_sched_domains(void) +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. 
+ */ +static void __devinit arch_init_sched_domains(void) { int i; cpumask_t cpu_default_map; + cpumask_t cpu_isolated_online_map; + + cpus_and(cpu_isolated_online_map, cpu_isolated_map, cpu_online_map); /* * Setup mask for cpus without special case scheduling requirements. @@ -4432,10 +4424,10 @@ __init static void arch_init_sched_domains(void) * exclude other special cases in the future. */ cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_possible_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); /* Set up domains */ - for_each_cpu(i) { + for_each_online_cpu(i) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); @@ -4447,7 +4439,7 @@ __init static void arch_init_sched_domains(void) * Unlike those of other cpus, the domains and groups are * single level, and span a single cpu. */ - if (cpu_isset(i, cpu_isolated_map)) { + if (cpu_isset(i, cpu_isolated_online_map)) { #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); #else @@ -4478,11 +4470,7 @@ __init static void arch_init_sched_domains(void) sd = &per_cpu(phys_domains, i); group = cpu_to_phys_group(i); *sd = SD_CPU_INIT; -#ifdef CONFIG_NUMA sd->span = nodemask; -#else - sd->span = cpu_possible_map; -#endif sd->parent = p; sd->groups = &sched_group_phys[group]; @@ -4500,7 +4488,7 @@ __init static void arch_init_sched_domains(void) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu(i) { + for_each_online_cpu(i) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); if (i != first_cpu(this_sibling_map)) @@ -4512,15 +4500,12 @@ __init static void arch_init_sched_domains(void) #endif /* Set up isolated groups */ - for_each_cpu_mask(i, cpu_isolated_map) { - cpumask_t mask; - cpus_clear(mask); - cpu_set(i, mask); + for_each_cpu_mask(i, cpu_isolated_online_map) { + cpumask_t mask = cpumask_of_cpu(i); init_sched_build_groups(sched_group_isolated, mask, &cpu_to_isolated_group); } -#ifdef CONFIG_NUMA /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -4532,10 +4517,6 @@ __init static void arch_init_sched_domains(void) init_sched_build_groups(sched_group_phys, nodemask, &cpu_to_phys_group); } -#else - init_sched_build_groups(sched_group_phys, cpu_possible_map, - &cpu_to_phys_group); -#endif #ifdef CONFIG_NUMA /* Set up node groups */ @@ -4568,7 +4549,7 @@ __init static void arch_init_sched_domains(void) } /* Attach the domains */ - for_each_cpu(i) { + for_each_online_cpu(i) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -4579,21 +4560,25 @@ __init static void arch_init_sched_domains(void) } } +static void __devinit arch_destroy_sched_domains(void) +{ + /* Do nothing: everything is statically allocated. */ +} + #undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG void sched_domain_debug(void) { int i; - for_each_cpu(i) { + for_each_online_cpu(i) { runqueue_t *rq = cpu_rq(i); struct sched_domain *sd; int level = 0; sd = rq->sd; - printk(KERN_DEBUG "CPU%d: %s\n", - i, (cpu_online(i) ? 
" online" : "offline")); + printk(KERN_DEBUG "CPU%d:\n", i); do { int j; @@ -4659,10 +4644,60 @@ void sched_domain_debug(void) #define sched_domain_debug() {} #endif +#ifdef CONFIG_SMP +/* Initial dummy domain for early boot and for hotplug cpu */ +static __devinitdata struct sched_domain sched_domain_dummy; +static __devinitdata struct sched_group sched_group_dummy; +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Force a reinitialization of the sched domains hierarchy. The domains + * and groups cannot be updated in place without racing with the balancing + * code, so we temporarily attach all running cpus to a "dummy" domain + * which will prevent rebalancing while the sched domains are recalculated. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int i; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + for_each_online_cpu(i) + cpu_attach_domain(&sched_domain_dummy, i); + arch_destroy_sched_domains(); + return NOTIFY_OK; + + case CPU_UP_CANCELED: + case CPU_ONLINE: + case CPU_DEAD: + /* + * Fall through and re-initialise the domains. + */ + break; + default: + return NOTIFY_DONE; + } + + /* The hotplug lock is already held by cpu_up/cpu_down */ + arch_init_sched_domains(); + + sched_domain_debug(); + + return NOTIFY_OK; +} +#endif + void __init sched_init_smp(void) { + lock_cpu_hotplug(); arch_init_sched_domains(); sched_domain_debug(); + unlock_cpu_hotplug(); + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); } #else void __init sched_init_smp(void) @@ -4686,20 +4721,18 @@ void __init sched_init(void) #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ - static struct sched_domain sched_domain_init; - static struct sched_group sched_group_init; - - memset(&sched_domain_init, 0, sizeof(struct sched_domain)); - sched_domain_init.span = CPU_MASK_ALL; - sched_domain_init.groups = &sched_group_init; - sched_domain_init.last_balance = jiffies; - sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ - sched_domain_init.busy_factor = 1; - - memset(&sched_group_init, 0, sizeof(struct sched_group)); - sched_group_init.cpumask = CPU_MASK_ALL; - sched_group_init.next = &sched_group_init; - sched_group_init.cpu_power = SCHED_LOAD_SCALE; + + memset(&sched_domain_dummy, 0, sizeof(struct sched_domain)); + sched_domain_dummy.span = CPU_MASK_ALL; + sched_domain_dummy.groups = &sched_group_dummy; + sched_domain_dummy.last_balance = jiffies; + sched_domain_dummy.balance_interval = INT_MAX; /* Don't balance */ + sched_domain_dummy.busy_factor = 1; + + memset(&sched_group_dummy, 0, sizeof(struct sched_group)); + sched_group_dummy.cpumask = CPU_MASK_ALL; + sched_group_dummy.next = &sched_group_dummy; + sched_group_dummy.cpu_power = SCHED_LOAD_SCALE; #endif for (i = 0; i < NR_CPUS; i++) { @@ -4712,7 +4745,7 @@ void __init sched_init(void) rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP - rq->sd = &sched_domain_init; + rq->sd = &sched_domain_dummy; rq->cpu_load = 0; rq->active_balance = 0; rq->push_cpu = 0; -- cgit v1.2.3 From bfbcbef1125ed2e2f2ea7674eb30231c9039c75f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 18 Oct 2004 09:08:58 -0700 Subject: [PATCH] sched: arch_destroy_sched_domains warning fix kernel/sched.c:4114: warning: `arch_destroy_sched_domains' defined but not used Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git 
a/kernel/sched.c b/kernel/sched.c index aa0fdc14b3d4..e357737c730d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4560,10 +4560,12 @@ static void __devinit arch_init_sched_domains(void) } } +#ifdef CONFIG_HOTPLUG_CPU static void __devinit arch_destroy_sched_domains(void) { /* Do nothing: everything is statically allocated. */ } +#endif #undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG -- cgit v1.2.3 From c8a5254120a0a1a304f49db7dced0a00168e4753 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:09:10 -0700 Subject: [PATCH] sched: sched add load balance flag Introduce SD_LOAD_BALANCE flag for domains where we don't want to do load balancing (so we don't have to set up meaningless spans and groups). Use this for the initial dummy domain, and just leave isolated CPUs on the dummy domain. Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/processor.h | 3 +- kernel/sched.c | 92 +++++++++++--------------------------------- 2 files changed, 25 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h index 9e1b61be5c3b..7e0d3322c1fc 100644 --- a/include/asm-ia64/processor.h +++ b/include/asm-ia64/processor.h @@ -346,7 +346,8 @@ struct task_struct; .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ - .flags = SD_BALANCE_EXEC \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_EXEC \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 10, \ diff --git a/kernel/sched.c b/kernel/sched.c index e357737c730d..de42a845d1be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -297,12 +297,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); #ifdef CONFIG_SMP #define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ -#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 2 /* Balance on exec */ -#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */ +#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. 
*/ +#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 4 /* Balance on exec */ +#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */ struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -2291,8 +2292,12 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, this_rq->cpu_load = (old_load + this_load) / 2; for_each_domain(this_cpu, sd) { - unsigned long interval = sd->balance_interval; + unsigned long interval; + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; if (idle != IDLE) interval *= sd->busy_factor; @@ -4337,16 +4342,8 @@ static int __devinit cpu_to_node_group(int cpu) } #endif -/* Groups for isolated scheduling domains */ -static struct sched_group sched_group_isolated[NR_CPUS]; - /* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; - -static int __devinit cpu_to_isolated_group(int cpu) -{ - return cpu; -} +static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4414,9 +4411,6 @@ static void __devinit arch_init_sched_domains(void) { int i; cpumask_t cpu_default_map; - cpumask_t cpu_isolated_online_map; - - cpus_and(cpu_isolated_online_map, cpu_isolated_map, cpu_online_map); /* * Setup mask for cpus without special case scheduling requirements. @@ -4426,36 +4420,16 @@ static void __devinit arch_init_sched_domains(void) cpus_complement(cpu_default_map, cpu_isolated_map); cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); - /* Set up domains */ - for_each_online_cpu(i) { + /* + * Set up domains. Isolated domains just stay on the dummy domain. + */ + for_each_cpu_mask(i, cpu_default_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(nodemask, nodemask, cpu_default_map); - /* - * Set up isolated domains. - * Unlike those of other cpus, the domains and groups are - * single level, and span a single cpu. 
- */ - if (cpu_isset(i, cpu_isolated_online_map)) { -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); -#else - sd = &per_cpu(phys_domains, i); -#endif - group = cpu_to_isolated_group(i); - *sd = SD_CPU_INIT; - cpu_set(i, sd->span); - sd->balance_interval = INT_MAX; /* Don't balance */ - sd->flags = 0; /* Avoid WAKE_ */ - sd->groups = &sched_group_isolated[group]; - printk(KERN_INFO "Setting up cpu %d isolated.\n", i); - /* Single level, so continue with next cpu */ - continue; - } - #ifdef CONFIG_NUMA sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); @@ -4499,13 +4473,6 @@ static void __devinit arch_init_sched_domains(void) } #endif - /* Set up isolated groups */ - for_each_cpu_mask(i, cpu_isolated_online_map) { - cpumask_t mask = cpumask_of_cpu(i); - init_sched_build_groups(sched_group_isolated, mask, - &cpu_to_isolated_group); - } - /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -4647,9 +4614,12 @@ void sched_domain_debug(void) #endif #ifdef CONFIG_SMP -/* Initial dummy domain for early boot and for hotplug cpu */ -static __devinitdata struct sched_domain sched_domain_dummy; -static __devinitdata struct sched_group sched_group_dummy; +/* + * Initial dummy domain for early boot and for hotplug cpu. Being static, + * it is initialized to zero, so all balancing flags are cleared which is + * what we want. + */ +static struct sched_domain sched_domain_dummy; #endif #ifdef CONFIG_HOTPLUG_CPU @@ -4721,22 +4691,6 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; -#ifdef CONFIG_SMP - /* Set up an initial dummy domain for early boot */ - - memset(&sched_domain_dummy, 0, sizeof(struct sched_domain)); - sched_domain_dummy.span = CPU_MASK_ALL; - sched_domain_dummy.groups = &sched_group_dummy; - sched_domain_dummy.last_balance = jiffies; - sched_domain_dummy.balance_interval = INT_MAX; /* Don't balance */ - sched_domain_dummy.busy_factor = 1; - - memset(&sched_group_dummy, 0, sizeof(struct sched_group)); - sched_group_dummy.cpumask = CPU_MASK_ALL; - sched_group_dummy.next = &sched_group_dummy; - sched_group_dummy.cpu_power = SCHED_LOAD_SCALE; -#endif - for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; -- cgit v1.2.3 From 3f9434fcc91eae13d540224dc791e4d44711c7e5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:09:23 -0700 Subject: [PATCH] sched: remove disjoint NUMA domains setup Remove the disjoint NUMA domains setup code. It was broken. Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 84 +--------------------------------------------------------- 1 file changed, 1 insertion(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index de42a845d1be..24bc1faf92f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4232,86 +4232,6 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) } } -/* - * To enable disjoint top-level NUMA domains, define SD_NODES_PER_DOMAIN - * in arch code. That defines the number of nearby nodes in a node's top - * level scheduling domain. - */ -#ifdef CONFIG_NUMA -#ifdef SD_NODES_PER_DOMAIN -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. 
- */ -static int __devinit find_next_best_node(int node, unsigned long *used_nodes) -{ - int i, n, val, min_val, best_node = 0; - - min_val = INT_MAX; - - for (i = 0; i < numnodes; i++) { - /* Start at @node */ - n = (node + i) % numnodes; - - /* Skip already used nodes */ - if (test_bit(n, used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, i); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - set_bit(best_node, used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. - */ -static cpumask_t __devinit sched_domain_node_span(int node) -{ - int i; - cpumask_t span; - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - - cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); - - for (i = 0; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); - cpumask_t nodemask; - - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); - } - - return span; -} -#else /* SD_NODES_PER_DOMAIN */ -static cpumask_t __devinit sched_domain_node_span(int node) -{ - return cpu_possible_map; -} -#endif /* SD_NODES_PER_DOMAIN */ -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; @@ -4434,9 +4354,7 @@ static void __devinit arch_init_sched_domains(void) sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - /* FIXME: should be multilevel, in arch code */ - sd->span = sched_domain_node_span(i); - cpus_and(sd->span, sd->span, cpu_default_map); + sd->span = cpu_default_map; sd->groups = &sched_group_nodes[group]; #endif -- cgit v1.2.3 From d01e5f9376acd6c07d742089dc7785737f5174f9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:09:35 -0700 Subject: [PATCH] sched: make domain setup overridable Allow sched domain setup to be overridden by arch code. This functionality is needed again. From: Paul Jackson Builds of 2.6.9-rc1-mm5 ia64 NUMA configs fail, with many complaints that SD_NODE_INIT is defined twice, in asm/processor.h and linux/sched.h. I guess that the preprocessor conditionals were wrong when Nick added the per-arch override ability again of SD_NODE_INIT were wrong. At least this change lets me rebuild ia64 again. Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 148 ++++++++++++++++++++++++++++++++ kernel/sched.c | 231 +++++++++++--------------------------------------- 2 files changed, 196 insertions(+), 183 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 389a70ebb189..7f6608ed3ce8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -406,6 +406,154 @@ struct sched_info { extern struct file_operations proc_schedstat_operations; #endif +enum idle_type +{ + SCHED_IDLE, + NOT_IDLE, + NEWLY_IDLE, + MAX_IDLE_TYPES +}; + +/* + * sched-domains (multiprocessor balancing) declarations: + */ +#ifdef CONFIG_SMP +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ + +#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. 
*/ +#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 4 /* Balance on exec */ +#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */ + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; + + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This is read only (except for setup, hotplug CPU). + */ + unsigned long cpu_power; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + int flags; /* See SD_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. */ + unsigned int nr_balance_failed; /* initialise to 0 */ + +#ifdef CONFIG_SCHEDSTATS + /* load_balance() stats */ + unsigned long lb_cnt[MAX_IDLE_TYPES]; + unsigned long lb_failed[MAX_IDLE_TYPES]; + unsigned long lb_imbalance[MAX_IDLE_TYPES]; + unsigned long lb_nobusyg[MAX_IDLE_TYPES]; + unsigned long lb_nobusyq[MAX_IDLE_TYPES]; + + /* sched_balance_exec() stats */ + unsigned long sbe_attempts; + unsigned long sbe_pushed; + + /* try_to_wake_up() stats */ + unsigned long ttwu_wake_affine; + unsigned long ttwu_wake_balance; +#endif +}; + +#ifdef ARCH_HAS_SCHED_DOMAIN +/* Useful helpers that arch setup code may use. 
Defined in kernel/sched.c */ +extern cpumask_t cpu_isolated_map; +extern void init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)); +extern void cpu_attach_domain(struct sched_domain *sd, int cpu); +#endif + +#ifndef ARCH_HAS_SCHED_TUNE +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_WAKE_IDLE +/* Common values for SMT siblings */ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .per_cpu_gain = 25, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ + | SD_SHARE_CPUPOWER, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + +/* Common values for CPUs */ +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000/2), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT) +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_EXEC \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif +#endif /* ARCH_HAS_SCHED_TUNE */ +#endif /* CONFIG_SMP */ + + struct io_context; /* See blkdev.h */ void exit_io_context(void); diff --git a/kernel/sched.c b/kernel/sched.c index 24bc1faf92f9..970a913f64b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -184,16 +184,6 @@ static unsigned int task_timeslice(task_t *p) #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) -enum idle_type -{ - IDLE, - NOT_IDLE, - NEWLY_IDLE, - MAX_IDLE_TYPES -}; - -struct sched_domain; - /* * These are the runqueue data structures: */ @@ -291,139 +281,6 @@ struct runqueue { static DEFINE_PER_CPU(struct runqueue, runqueues); -/* - * sched-domains (multiprocessor balancing) declarations: - */ -#ifdef CONFIG_SMP -#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ - -#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ -#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 4 /* Balance on exec */ -#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */ - -struct sched_group { - struct sched_group *next; /* Must be a circular list */ - cpumask_t cpumask; - - /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. This is read only (except for setup, hotplug CPU). 
- */ - unsigned long cpu_power; -}; - -struct sched_domain { - /* These fields must be setup */ - struct sched_domain *parent; /* top domain must be null terminated */ - struct sched_group *groups; /* the balancing groups of the domain */ - cpumask_t span; /* span of all CPUs in this domain */ - unsigned long min_interval; /* Minimum balance interval ms */ - unsigned long max_interval; /* Maximum balance interval ms */ - unsigned int busy_factor; /* less balancing by factor if busy */ - unsigned int imbalance_pct; /* No balance until over watermark */ - unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ - unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ - int flags; /* See SD_* */ - - /* Runtime fields. */ - unsigned long last_balance; /* init to jiffies. units in jiffies */ - unsigned int balance_interval; /* initialise to 1. units in ms. */ - unsigned int nr_balance_failed; /* initialise to 0 */ - -#ifdef CONFIG_SCHEDSTATS - /* load_balance() stats */ - unsigned long lb_cnt[MAX_IDLE_TYPES]; - unsigned long lb_failed[MAX_IDLE_TYPES]; - unsigned long lb_imbalance[MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[MAX_IDLE_TYPES]; - - /* sched_balance_exec() stats */ - unsigned long sbe_attempts; - unsigned long sbe_pushed; - - /* try_to_wake_up() stats */ - unsigned long ttwu_wake_affine; - unsigned long ttwu_wake_balance; -#endif -}; - -#ifndef ARCH_HAS_SCHED_TUNE -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_WAKE_IDLE -/* Common values for SMT siblings */ -#define SD_SIBLING_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 8, \ - .imbalance_pct = 110, \ - .cache_hot_time = 0, \ - .cache_nice_tries = 0, \ - .per_cpu_gain = 25, \ - .flags = SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ - | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} -#endif - -/* Common values for CPUs */ -#define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_hot_time = cache_decay_ticks*1000000 ? 
: (5*1000000/2),\ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - -/* Arch can override this macro in processor.h */ -#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT) -#define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_BALANCE_EXEC \ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} -#endif -#endif /* ARCH_HAS_SCHED_TUNE */ -#endif - - #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) @@ -502,7 +359,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) seq_printf(seq, " %lu %lu", rq->pt_gained[itype], rq->pt_lost[itype]); seq_printf(seq, "\n"); @@ -514,7 +371,8 @@ static int show_schedstat(struct seq_file *seq, void *v) cpumask_scnprintf(mask_str, NR_CPUS, sd->span); seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) { + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; + itype++) { seq_printf(seq, " %lu %lu %lu %lu %lu", sd->lb_cnt[itype], sd->lb_failed[itype], @@ -2006,7 +1864,7 @@ nextgroup: out_balanced: if (busiest && (idle == NEWLY_IDLE || - (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) { + (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { *imbalance = 1; return busiest; } @@ -2248,7 +2106,7 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu) if (unlikely(busiest == rq)) goto next_group; double_lock_balance(busiest, rq); - if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) { + if (move_tasks(rq, push_cpu, busiest, 1, sd, SCHED_IDLE)) { schedstat_inc(busiest, alb_lost); schedstat_inc(rq, alb_gained); } else { @@ -2298,7 +2156,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, continue; interval = sd->balance_interval; - if (idle != IDLE) + if (idle != SCHED_IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ @@ -2400,7 +2258,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - rebalance_tick(cpu, rq, IDLE); + rebalance_tick(cpu, rq, SCHED_IDLE); return; } if (TASK_NICE(p) > 0) @@ -4205,7 +4063,7 @@ EXPORT_SYMBOL(kernel_flag); * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. 
*/ -static void cpu_attach_domain(struct sched_domain *sd, int cpu) +void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) { migration_req_t req; unsigned long flags; @@ -4232,38 +4090,8 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) } } -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; -static int __devinit cpu_to_cpu_group(int cpu) -{ - return cpu; -} -#endif - -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; -static int __devinit cpu_to_phys_group(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - return first_cpu(cpu_sibling_map[cpu]); -#else - return cpu; -#endif -} - -#ifdef CONFIG_NUMA - -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int __devinit cpu_to_node_group(int cpu) -{ - return cpu_to_node(cpu); -} -#endif - /* cpus with isolated domains */ -static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4290,7 +4118,7 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -static void __devinit init_sched_build_groups(struct sched_group groups[], +void __devinit init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -4324,6 +4152,41 @@ static void __devinit init_sched_build_groups(struct sched_group groups[], last->next = first; } + +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __devinit arch_init_sched_domains(void); +extern void __devinit arch_destroy_sched_domains(void); +#else +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int __devinit cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int __devinit cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA + +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static int __devinit cpu_to_node_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ @@ -4452,9 +4315,11 @@ static void __devinit arch_destroy_sched_domains(void) } #endif +#endif /* ARCH_HAS_SCHED_DOMAIN */ + #undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG -void sched_domain_debug(void) +static void sched_domain_debug(void) { int i; -- cgit v1.2.3 From 8dac7706958cbaff568685bba0156380b6131b22 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:10:00 -0700 Subject: [PATCH] sched: fix domain debug for isolcpus Fix an oops in the domain debug code when isolated CPUs are specified. 
Introduced by 5/8 "sched add load balance flag" Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 970a913f64b9..27e6b1b4fac3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4344,7 +4344,17 @@ static void sched_domain_debug(void) printk(KERN_DEBUG); for (j = 0; j < level + 1; j++) printk(" "); - printk("domain %d: span %s\n", level, str); + printk("domain %d: ", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not balance"); + if (sd->parent) + printk(" ERROR !SD_LOAD_BALANCE domain has parent"); + printk("\n"); + break; + } + + printk("span %s\n", str); if (!cpu_isset(i, sd->span)) printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); -- cgit v1.2.3 From 71da3667be80d30121df3972caa0bf5684228379 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:10:25 -0700 Subject: [PATCH] sched: hotplug add a CPU_DOWN_FAILED notifier Introduce CPU_DOWN_FAILED notifier, so we can cope with a failure after a CPU_DOWN_PREPARE notice. This fixes 3/8 "add CPU_DOWN_PREPARE notifier" to be useful Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/notifier.h | 3 ++- kernel/cpu.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 00c996f08851..9303a003e9ab 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -68,7 +68,8 @@ extern int notifier_call_chain(struct notifier_block **n, unsigned long val, voi #define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ #define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ -#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */ +#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ +#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 84e78a6560bc..719ba42ff5ee 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -141,6 +141,11 @@ int cpu_down(unsigned int cpu) p = __stop_machine_run(take_cpu_down, NULL, cpu); if (IS_ERR(p)) { + /* CPU didn't die: tell everyone. Can't complain. */ + if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + (void *)(long)cpu) == NOTIFY_BAD) + BUG(); + err = PTR_ERR(p); goto out_allowed; } -- cgit v1.2.3 From 5e641ebaf52e812733ad27b7ebf91291ebe53e0b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 18 Oct 2004 09:10:37 -0700 Subject: [PATCH] sched: use CPU_DOWN_FAILED notifier Use CPU_DOWN_FAILED notifier in the sched-domains hotplug code. 
This goes with 4/8 "integrate cpu hotplug and sched domains" Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 27e6b1b4fac3..c223ff02fc52 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4436,6 +4436,7 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_OK; case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: case CPU_ONLINE: case CPU_DEAD: /* -- cgit v1.2.3 From c07990160b13946faf43d716a8902f6f7cf55a5e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 18 Oct 2004 09:11:02 -0700 Subject: [PATCH] sched: print preempt count Better debugging output when the CPU scheduler detects atomicity errors. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c223ff02fc52..2e67de4b356a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2491,7 +2491,9 @@ asmlinkage void __sched schedule(void) */ if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { if (unlikely(in_atomic())) { - printk(KERN_ERR "bad: scheduling while atomic!\n"); + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); dump_stack(); } } -- cgit v1.2.3 From 95a2f6d75d878126b883e87a5fd028e971851c93 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Mon, 18 Oct 2004 09:11:14 -0700 Subject: [PATCH] CPU Scheduler: fix potential error in runqueue nr_uninterruptible count Problem: In the function try_to_wake_up(), when the runqueue's nr_uninterruptible field is decremented it's possible (on SMP systems) that the pointer no longer points to the runqueue that the task being woken was on when it went to sleep. This would cause the wrong runqueue's field to be decremented and the correct one tp remain unchanged. Fix: Save a pointer to the old runqueue at the beginning of the function and use it when decrementing nr_uninterruptible. Signed-off-by: Peter Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2e67de4b356a..3b498fa4f3d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -981,14 +981,14 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) int cpu, this_cpu, success = 0; unsigned long flags; long old_state; - runqueue_t *rq; + runqueue_t *rq, *old_rq; #ifdef CONFIG_SMP unsigned long load, this_load; struct sched_domain *sd; int new_cpu; #endif - rq = task_rq_lock(p, &flags); + old_rq = rq = task_rq_lock(p, &flags); schedstat_inc(rq, ttwu_cnt); old_state = p->state; if (!(old_state & state)) @@ -1083,7 +1083,7 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; + old_rq->nr_uninterruptible--; /* * Tasks on involuntary sleep don't earn * sleep_avg beyond just interactive state. -- cgit v1.2.3 From 7a9a3e863d834b3c9abf46b7921654f16343b5bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Oct 2004 09:11:52 -0700 Subject: [PATCH] sched: fix SCHED_SMT & numa=fake=2 lockup This patch fixes an interaction between the numa=fake= feature, the domain setup code and cpu_siblings_map[]. The bug leads to a bootup crash when using numa=fake=2 on a 2-way/4-way SMP+HT box. 
When SCHED_SMT is turned on the domains-setup code relies on siblings not spanning multiple domains (which makes perfect sense). But numa=fake=2 creates an assymetric 1101/0010 splitup between CPUs, which results in two siblings being on different nodes. The patch adds a check_siblings_map() function that checks the sibling maps and fixes them up if they violate this rule. (it also prints a warning in that case.) The patch also turns SCHED_DOMAIN_DEBUG back on - had this been enabled we'd have noticed this bug much earlier. From: Badari Pulavarty arch/x86_64/mm/numa.c: In function `numa_setup': arch/x86_64/mm/numa.c:332: error: `numa_fake' undeclared (first use in this function) arch/x86_64/mm/numa.c:332: error: (Each undeclared identifier is reported only once arch/x86_64/mm/numa.c:332: error: for each function it appears in.) Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/mm/numa.c | 2 ++ kernel/sched.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index e6746a26b7a8..df85a94d3ba6 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -264,11 +264,13 @@ __init int numa_setup(char *opt) { if (!strcmp(opt,"off")) numa_off = 1; +#ifdef CONFIG_NUMA_EMU if(!strncmp(opt, "fake=", 5)) { numa_fake = simple_strtoul(opt+5,NULL,0); ; if (numa_fake >= MAX_NUMNODES) numa_fake = MAX_NUMNODES; } +#endif return 1; } diff --git a/kernel/sched.c b/kernel/sched.c index 3b498fa4f3d7..5dc237eb32a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4189,6 +4189,30 @@ static int __devinit cpu_to_node_group(int cpu) } #endif +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) +/* + * The domains setup code relies on siblings not spanning + * multiple nodes. Make sure the architecture has a proper + * siblings map: + */ +static void check_sibling_maps(void) +{ + int i, j; + + for_each_online_cpu(i) { + for_each_cpu_mask(j, cpu_sibling_map[i]) { + if (cpu_to_node(i) != cpu_to_node(j)) { + printk(KERN_INFO "warning: CPU %d siblings map " + "to different node - isolating " + "them.\n", i); + cpu_sibling_map[i] = cpumask_of_cpu(i); + break; + } + } + } +} +#endif + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ @@ -4197,6 +4221,9 @@ static void __devinit arch_init_sched_domains(void) int i; cpumask_t cpu_default_map; +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to @@ -4319,7 +4346,7 @@ static void __devinit arch_destroy_sched_domains(void) #endif /* ARCH_HAS_SCHED_DOMAIN */ -#undef SCHED_DOMAIN_DEBUG +#define SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(void) { -- cgit v1.2.3 From d3069b4dd0767b4e24debdb21b632b2f8dd72474 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Oct 2004 09:12:06 -0700 Subject: [PATCH] fix & clean up zombie/dead task handling & preemption This patch fixes all the preempt-after-task->state-is-TASK_DEAD problems we had. Right now, the moment procfs does a down() that sleeps in proc_pid_flush() [it could] our TASK_DEAD state is zapped and we might be back to TASK_RUNNING to and we trigger this assert: schedule(); BUG(); /* Avoid "noreturn function does return". 
*/ for (;;) ; I have split out TASK_ZOMBIE and TASK_DEAD into a separate p->exit_state field, to allow the detaching of exit-signal/parent/wait-handling from descheduling a dead task. Dead-task freeing is done via PF_DEAD. Tested the patch on x86 SMP and UP, but all architectures should work fine. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 2 +- arch/arm/kernel/ptrace.c | 2 +- arch/arm26/kernel/ptrace.c | 2 +- arch/cris/arch-v10/kernel/ptrace.c | 2 +- arch/h8300/kernel/ptrace.c | 2 +- arch/i386/kernel/ptrace.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/kernel/ptrace.c | 2 +- arch/m32r/kernel/ptrace.c | 2 +- arch/m68k/kernel/ptrace.c | 2 +- arch/m68knommu/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace32.c | 2 +- arch/parisc/kernel/ptrace.c | 2 +- arch/ppc/kernel/ptrace.c | 2 +- arch/ppc64/kernel/ptrace.c | 2 +- arch/ppc64/kernel/ptrace32.c | 2 +- arch/s390/kernel/ptrace.c | 2 +- arch/sh/kernel/ptrace.c | 2 +- arch/sh64/kernel/ptrace.c | 2 +- arch/sparc/kernel/ptrace.c | 2 +- arch/sparc64/kernel/ptrace.c | 2 +- arch/um/kernel/ptrace.c | 2 +- arch/um/kernel/tt/process_kern.c | 6 +-- arch/v850/kernel/ptrace.c | 2 +- arch/x86_64/kernel/ptrace.c | 2 +- fs/exec.c | 8 ++-- fs/proc/array.c | 14 +++---- include/linux/sched.h | 5 ++- kernel/exit.c | 85 +++++++++++++++++++++----------------- kernel/fork.c | 3 +- kernel/power/process.c | 4 +- kernel/ptrace.c | 2 +- kernel/sched.c | 12 +++--- kernel/signal.c | 4 +- kernel/timer.c | 2 +- 36 files changed, 104 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index 9c21fbd4f34f..d00583161574 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -355,7 +355,7 @@ do_sys_ptrace(long request, long pid, long addr, long data, */ case PTRACE_KILL: ret = 0; - if (child->state == TASK_ZOMBIE) + if (child->exit_state == EXIT_ZOMBIE) break; child->exit_code = SIGKILL; /* make sure single-step breakpoint is gone. */ diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 7aeb80f8551a..c82f70b5d66c 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -677,7 +677,7 @@ static int do_ptrace(int request, struct task_struct *child, long addr, long dat /* make sure single-step breakpoint is gone. */ child->ptrace &= ~PT_SINGLESTEP; ptrace_cancel_bpt(child); - if (child->state != TASK_ZOMBIE) { + if (child->exit_state != EXIT_ZOMBIE) { child->exit_code = SIGKILL; wake_up_process(child); } diff --git a/arch/arm26/kernel/ptrace.c b/arch/arm26/kernel/ptrace.c index e2ffb2007465..8597ab3574fe 100644 --- a/arch/arm26/kernel/ptrace.c +++ b/arch/arm26/kernel/ptrace.c @@ -614,7 +614,7 @@ static int do_ptrace(int request, struct task_struct *child, long addr, long dat /* make sure single-step breakpoint is gone. 
*/ child->ptrace &= ~PT_SINGLESTEP; ptrace_cancel_bpt(child); - if (child->state != TASK_ZOMBIE) { + if (child->exit_state != EXIT_ZOMBIE) { child->exit_code = SIGKILL; wake_up_process(child); } diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c index 3c7e0a51a282..55fd631224b3 100644 --- a/arch/cris/arch-v10/kernel/ptrace.c +++ b/arch/cris/arch-v10/kernel/ptrace.c @@ -176,7 +176,7 @@ sys_ptrace(long request, long pid, long addr, long data) case PTRACE_KILL: ret = 0; - if (child->state == TASK_ZOMBIE) + if (child->exit_state == EXIT_ZOMBIE) break; child->exit_code = SIGKILL; diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c index b60d234d1d27..5f19d774a288 100644 --- a/arch/h8300/kernel/ptrace.c +++ b/arch/h8300/kernel/ptrace.c @@ -192,7 +192,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; h8300_disable_trace(child); diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index e0c9a03739a8..77c48ce67e08 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -390,7 +390,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) long tmp; ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; clear_tsk_thread_flag(child, TIF_SINGLESTEP); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index e86722b7fce7..2685e6ea8bbd 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2593,7 +2593,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) return -EINVAL; } - if (task->state == TASK_ZOMBIE) { + if (task->exit_state == EXIT_ZOMBIE) { DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); return -EBUSY; } diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index d8ff70dfd84b..1e515083e23f 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1422,7 +1422,7 @@ sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data, * sigkill. Perhaps it should be put in the status * that it wants to exit. */ - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ goto out_tsk; child->exit_code = SIGKILL; diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index ab4137a3398a..6d58f972b35c 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -713,7 +713,7 @@ do_ptrace(long request, struct task_struct *child, long addr, long data) ret = 0; unregister_all_debug_traps(child); invalidate_cache(); - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c index b043a7f9409f..0beb53333ba3 100644 --- a/arch/m68k/kernel/ptrace.c +++ b/arch/m68k/kernel/ptrace.c @@ -277,7 +277,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) long tmp; ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. 
*/ diff --git a/arch/m68knommu/kernel/ptrace.c b/arch/m68knommu/kernel/ptrace.c index 773dc91e46e9..15cf79080b15 100644 --- a/arch/m68knommu/kernel/ptrace.c +++ b/arch/m68knommu/kernel/ptrace.c @@ -264,7 +264,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) long tmp; ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. */ diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 6e8511390a57..1ce7bd150940 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -277,7 +277,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index df7a51cceeb0..799919d9beaa 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -262,7 +262,7 @@ asmlinkage int sys32_ptrace(int request, int pid, int addr, int data) */ case PTRACE_KILL: ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 598aadd36aae..a9200a3ae6d8 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -303,7 +303,7 @@ long sys_ptrace(long request, pid_t pid, long addr, long data) * that it wants to exit. */ DBG(("sys_ptrace(KILL)\n")); - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ goto out_tsk; child->exit_code = SIGKILL; goto out_wake_notrap; diff --git a/arch/ppc/kernel/ptrace.c b/arch/ppc/kernel/ptrace.c index 5cf5624e90d1..118d97ed10fd 100644 --- a/arch/ppc/kernel/ptrace.c +++ b/arch/ppc/kernel/ptrace.c @@ -377,7 +377,7 @@ int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. */ diff --git a/arch/ppc64/kernel/ptrace.c b/arch/ppc64/kernel/ptrace.c index aeead8375a5f..0a4011f58f2f 100644 --- a/arch/ppc64/kernel/ptrace.c +++ b/arch/ppc64/kernel/ptrace.c @@ -182,7 +182,7 @@ int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. */ diff --git a/arch/ppc64/kernel/ptrace32.c b/arch/ppc64/kernel/ptrace32.c index e0ab44dc1fc4..ee81b1b776cc 100644 --- a/arch/ppc64/kernel/ptrace32.c +++ b/arch/ppc64/kernel/ptrace32.c @@ -314,7 +314,7 @@ int sys32_ptrace(long request, long pid, unsigned long addr, unsigned long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. 
*/ diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 26b89e55d9c1..1acb2d8932b1 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -626,7 +626,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data) * perhaps it should be put in the status that it wants to * exit. */ - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ return 0; child->exit_code = SIGKILL; /* make sure the single step bit is not set. */ diff --git a/arch/sh/kernel/ptrace.c b/arch/sh/kernel/ptrace.c index 017826912cc5..d14810a459d9 100644 --- a/arch/sh/kernel/ptrace.c +++ b/arch/sh/kernel/ptrace.c @@ -217,7 +217,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/sh64/kernel/ptrace.c b/arch/sh64/kernel/ptrace.c index c48bc4a27b44..f27c696b3e00 100644 --- a/arch/sh64/kernel/ptrace.c +++ b/arch/sh64/kernel/ptrace.c @@ -257,7 +257,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/sparc/kernel/ptrace.c b/arch/sparc/kernel/ptrace.c index 589652ea05b8..62b8fd1dba7f 100644 --- a/arch/sparc/kernel/ptrace.c +++ b/arch/sparc/kernel/ptrace.c @@ -567,7 +567,7 @@ asmlinkage void do_ptrace(struct pt_regs *regs) * exit. */ case PTRACE_KILL: { - if (child->state == TASK_ZOMBIE) { /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) { /* already dead */ pt_succ_return(regs, 0); goto out_tsk; } diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c index fd73a3ab8f1f..721ab5ba859f 100644 --- a/arch/sparc64/kernel/ptrace.c +++ b/arch/sparc64/kernel/ptrace.c @@ -559,7 +559,7 @@ asmlinkage void do_ptrace(struct pt_regs *regs) * exit. */ case PTRACE_KILL: { - if (child->state == TASK_ZOMBIE) { /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) { /* already dead */ pt_succ_return(regs, 0); goto out_tsk; } diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 40912d18cb2f..b4efb669be41 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -158,7 +158,7 @@ int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/um/kernel/tt/process_kern.c b/arch/um/kernel/tt/process_kern.c index 6224afcd84d7..d547435ee5a5 100644 --- a/arch/um/kernel/tt/process_kern.c +++ b/arch/um/kernel/tt/process_kern.c @@ -65,7 +65,7 @@ void *switch_to_tt(void *prev, void *next, void *last) panic("write of switch_pipe failed, err = %d", -err); reading = 1; - if((from->state == TASK_ZOMBIE) || (from->state == TASK_DEAD)) + if((from->exit_state == EXIT_ZOMBIE) || (from->exit_state == EXIT_DEAD)) os_kill_process(os_getpid(), 0); err = os_read_file(from->thread.mode.tt.switch_pipe[0], &c, sizeof(c)); @@ -80,8 +80,8 @@ void *switch_to_tt(void *prev, void *next, void *last) * in case it has not already killed itself. 
*/ prev_sched = current->thread.prev_sched; - if((prev_sched->state == TASK_ZOMBIE) || - (prev_sched->state == TASK_DEAD)) + if((prev_sched->exit_state == EXIT_ZOMBIE) || + (prev_sched->exit_state == EXIT_DEAD)) os_kill_process(prev_sched->thread.mode.tt.extern_pid, 1); /* This works around a nasty race with 'jail'. If we are switching diff --git a/arch/v850/kernel/ptrace.c b/arch/v850/kernel/ptrace.c index c0ee8f0b7f4c..ab60fa953e74 100644 --- a/arch/v850/kernel/ptrace.c +++ b/arch/v850/kernel/ptrace.c @@ -232,7 +232,7 @@ int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: rval = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 73ff5494a6c1..ebd621e96787 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -395,7 +395,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data long tmp; ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; clear_tsk_thread_flag(child, TIF_SINGLESTEP); child->exit_code = SIGKILL; diff --git a/fs/exec.c b/fs/exec.c index 125a04e3692b..e715541b2db4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -632,14 +632,14 @@ static inline int de_thread(struct task_struct *tsk) if (current->pid != current->tgid) { struct task_struct *leader = current->group_leader, *parent; struct dentry *proc_dentry1, *proc_dentry2; - unsigned long state, ptrace; + unsigned long exit_state, ptrace; /* * Wait for the thread group leader to be a zombie. * It should already be zombie at this point, most * of the time. 
*/ - while (leader->state != TASK_ZOMBIE) + while (leader->exit_state != EXIT_ZOMBIE) yield(); spin_lock(&leader->proc_lock); @@ -683,7 +683,7 @@ static inline int de_thread(struct task_struct *tsk) list_del(¤t->tasks); list_add_tail(¤t->tasks, &init_task.tasks); current->exit_signal = SIGCHLD; - state = leader->state; + exit_state = leader->exit_state; write_unlock_irq(&tasklist_lock); spin_unlock(&leader->proc_lock); @@ -691,7 +691,7 @@ static inline int de_thread(struct task_struct *tsk) proc_pid_flush(proc_dentry1); proc_pid_flush(proc_dentry2); - if (state != TASK_ZOMBIE) + if (exit_state != EXIT_ZOMBIE) BUG(); release_task(leader); } diff --git a/fs/proc/array.c b/fs/proc/array.c index d05dab0697d3..5c289c73a00e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -137,13 +137,13 @@ static const char *task_state_array[] = { static inline const char * get_task_state(struct task_struct *tsk) { - unsigned int state = tsk->state & (TASK_RUNNING | - TASK_INTERRUPTIBLE | - TASK_UNINTERRUPTIBLE | - TASK_ZOMBIE | - TASK_DEAD | - TASK_STOPPED | - TASK_TRACED); + unsigned int state = (tsk->state & (TASK_RUNNING | + TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE | + TASK_STOPPED | + TASK_TRACED)) | + (tsk->exit_state & (EXIT_ZOMBIE | + EXIT_DEAD)); const char **p = &task_state_array[0]; while (state) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 69648ed7fd2b..dc3f297a726d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -108,8 +108,8 @@ extern unsigned long nr_iowait(void); #define TASK_UNINTERRUPTIBLE 2 #define TASK_STOPPED 4 #define TASK_TRACED 8 -#define TASK_ZOMBIE 16 -#define TASK_DEAD 32 +#define EXIT_ZOMBIE 16 +#define EXIT_DEAD 32 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -564,6 +564,7 @@ struct task_struct { /* task state */ struct linux_binfmt *binfmt; + long exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ /* ??? */ diff --git a/kernel/exit.c b/kernel/exit.c index 031e17486eee..55d853392524 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -76,7 +76,7 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) { + if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { BUG_ON(leader->exit_signal == -1); do_notify_parent(leader, leader->exit_signal); /* @@ -158,7 +158,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task - || p->state >= TASK_ZOMBIE + || p->exit_state >= EXIT_ZOMBIE || p->real_parent->pid == 1) continue; if (process_group(p->real_parent) != pgrp @@ -519,7 +519,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re * Make sure we're not reparenting to ourselves and that * the parent is not a zombie. */ - BUG_ON(p == reaper || reaper->state >= TASK_ZOMBIE); + BUG_ON(p == reaper || reaper->state >= EXIT_ZOMBIE || reaper->exit_state >= EXIT_ZOMBIE); p->real_parent = reaper; if (p->parent == p->real_parent) BUG(); @@ -554,7 +554,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) /* If we'd notified the old parent about this child's death, * also notify the new parent. 
*/ - if (p->state == TASK_ZOMBIE && p->exit_signal != -1 && + if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); else if (p->state == TASK_TRACED) { @@ -602,7 +602,7 @@ static inline void forget_original_parent(struct task_struct * father, reaper = child_reaper; break; } - } while (reaper->state >= TASK_ZOMBIE); + } while (reaper->exit_state >= EXIT_ZOMBIE); /* * There are only two places where our children can be: @@ -628,7 +628,7 @@ static inline void forget_original_parent(struct task_struct * father, } else { /* reparent ptraced task to its real parent */ __ptrace_unlink (p); - if (p->state == TASK_ZOMBIE && p->exit_signal != -1 && + if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); } @@ -639,7 +639,7 @@ static inline void forget_original_parent(struct task_struct * father, * zombie forever since we prevented it from self-reap itself * while it was being traced by us, to be able to see it in wait4. */ - if (unlikely(ptrace && p->state == TASK_ZOMBIE && p->exit_signal == -1)) + if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) list_add(&p->ptrace_list, to_release); } list_for_each_safe(_p, _n, &father->ptrace_children) { @@ -752,10 +752,10 @@ static void exit_notify(struct task_struct *tsk) do_notify_parent(tsk, SIGCHLD); } - state = TASK_ZOMBIE; + state = EXIT_ZOMBIE; if (tsk->exit_signal == -1 && tsk->ptrace == 0) - state = TASK_DEAD; - tsk->state = state; + state = EXIT_DEAD; + tsk->exit_state = state; /* * Clear these here so that update_process_times() won't try to deliver @@ -773,7 +773,7 @@ static void exit_notify(struct task_struct *tsk) } /* If the process is dead, release it - nobody will wait for it */ - if (state == TASK_DEAD) + if (state == EXIT_DEAD) release_task(tsk); /* PF_DEAD causes final put_task_struct after we schedule. */ @@ -830,6 +830,8 @@ asmlinkage NORET_TYPE void do_exit(long code) mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif + + BUG_ON(!(current->flags & PF_DEAD)); schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -973,7 +975,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, } /* - * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold + * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. @@ -992,7 +994,7 @@ static int wait_task_zombie(task_t *p, int noreap, int exit_code = p->exit_code; int why, status; - if (unlikely(p->state != TASK_ZOMBIE)) + if (unlikely(p->exit_state != EXIT_ZOMBIE)) return 0; if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) return 0; @@ -1013,9 +1015,9 @@ static int wait_task_zombie(task_t *p, int noreap, * Try to move the task's state to DEAD * only one thread is allowed to do this: */ - state = xchg(&p->state, TASK_DEAD); - if (state != TASK_ZOMBIE) { - BUG_ON(state != TASK_DEAD); + state = xchg(&p->exit_state, EXIT_DEAD); + if (state != EXIT_ZOMBIE) { + BUG_ON(state != EXIT_DEAD); return 0; } if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) { @@ -1060,7 +1062,7 @@ static int wait_task_zombie(task_t *p, int noreap, /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to TASK_DEAD. 
+ * thread can reap it because we set its state to EXIT_DEAD. */ read_unlock(&tasklist_lock); @@ -1092,7 +1094,8 @@ static int wait_task_zombie(task_t *p, int noreap, if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (retval) { - p->state = TASK_ZOMBIE; + // TODO: is this safe? + p->exit_state = EXIT_ZOMBIE; return retval; } retval = p->pid; @@ -1101,7 +1104,8 @@ static int wait_task_zombie(task_t *p, int noreap, /* Double-check with lock held. */ if (p->real_parent != p->parent) { __ptrace_unlink(p); - p->state = TASK_ZOMBIE; + // TODO: is this safe? + p->exit_state = EXIT_ZOMBIE; /* * If this is not a detached task, notify the parent. * If it's still not detached after that, don't release @@ -1172,13 +1176,13 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, /* * This uses xchg to be atomic with the thread resuming and setting * it. It must also be done with the write lock held to prevent a - * race with the TASK_ZOMBIE case. + * race with the EXIT_ZOMBIE case. */ exit_code = xchg(&p->exit_code, 0); - if (unlikely(p->state >= TASK_ZOMBIE)) { + if (unlikely(p->exit_state >= EXIT_ZOMBIE)) { /* * The task resumed and then died. Let the next iteration - * catch it in TASK_ZOMBIE. Note that exit_code might + * catch it in EXIT_ZOMBIE. Note that exit_code might * already be zero here if it resumed and did _exit(0). * The task itself is dead and won't touch exit_code again; * other processors in this function are locked out. @@ -1339,23 +1343,28 @@ repeat: if (retval != 0) /* He released the lock. */ goto end; break; - case TASK_ZOMBIE: - /* - * Eligible but we cannot release it yet: - */ - if (ret == 2) - goto check_continued; - if (!likely(options & WEXITED)) - continue; - retval = wait_task_zombie( - p, (options & WNOWAIT), - infop, stat_addr, ru); - if (retval != 0) /* He released the lock. */ - goto end; - break; - case TASK_DEAD: - continue; default: + // case EXIT_DEAD: + if (p->exit_state == EXIT_DEAD) + continue; + // case EXIT_ZOMBIE: + if (p->exit_state == EXIT_ZOMBIE) { + /* + * Eligible but we cannot release + * it yet: + */ + if (ret == 2) + goto check_continued; + if (!likely(options & WEXITED)) + continue; + retval = wait_task_zombie( + p, (options & WNOWAIT), + infop, stat_addr, ru); + /* He released the lock. */ + if (retval != 0) + goto end; + break; + } check_continued: if (!unlikely(options & WCONTINUED)) continue; diff --git a/kernel/fork.c b/kernel/fork.c index bd33d8a507d7..3020dccc548f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,7 +86,7 @@ EXPORT_SYMBOL(free_task); void __put_task_struct(struct task_struct *tsk) { - WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE))); + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); @@ -1062,6 +1062,7 @@ static task_t *copy_process(unsigned long clone_flags, /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? 
-1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; + p->exit_state = 0; /* Perform scheduler related setup */ sched_fork(p); diff --git a/kernel/power/process.c b/kernel/power/process.c index bda013de59a5..78d92dc6a1ed 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -23,8 +23,8 @@ static inline int freezeable(struct task_struct * p) { if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->state == TASK_ZOMBIE) || - (p->state == TASK_DEAD) || + (p->exit_state == EXIT_ZOMBIE) || + (p->exit_state == EXIT_DEAD) || (p->state == TASK_STOPPED) || (p->state == TASK_TRACED)) return 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 09ba057222c3..c3ac348e2368 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -163,7 +163,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) write_lock_irq(&tasklist_lock); __ptrace_unlink(child); /* .. and wake it up. */ - if (child->state != TASK_ZOMBIE) + if (child->exit_state != EXIT_ZOMBIE) wake_up_process(child); write_unlock_irq(&tasklist_lock); diff --git a/kernel/sched.c b/kernel/sched.c index 5dc237eb32a5..8e5e2af64509 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1329,10 +1329,10 @@ static void finish_task_switch(task_t *prev) /* * A task struct has one reference for the use as "current". - * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls - * schedule one last time. The schedule call will never return, + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, * and the scheduled task must drop that reference. - * The test for TASK_ZOMBIE must occur while the runqueue locks are + * The test for EXIT_ZOMBIE must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. @@ -2489,7 +2489,7 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { if (unlikely(in_atomic())) { printk(KERN_ERR "scheduling while atomic: " "%s/0x%08x/%d\n", @@ -2531,6 +2531,8 @@ need_resched: spin_lock_irq(&rq->lock); + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; /* * if entering off of a kernel preemption go straight * to picking the next task. @@ -3920,7 +3922,7 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk) struct runqueue *rq = cpu_rq(dead_cpu); /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(tsk->state != TASK_ZOMBIE && tsk->state != TASK_DEAD); + BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. 
*/ BUG_ON(tsk->flags & PF_DEAD); diff --git a/kernel/signal.c b/kernel/signal.c index 9f6bd72bdd82..f67390806d73 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -914,7 +914,7 @@ __group_complete_signal(int sig, struct task_struct *p) * Don't bother zombies and stopped tasks (but * SIGKILL will punch through stopped state) */ - mask = TASK_DEAD | TASK_ZOMBIE | TASK_TRACED; + mask = EXIT_DEAD | EXIT_ZOMBIE | TASK_TRACED; if (sig != SIGKILL) mask |= TASK_STOPPED; @@ -1070,7 +1070,7 @@ void zap_other_threads(struct task_struct *p) /* * Don't bother with already dead threads */ - if (t->state & (TASK_ZOMBIE|TASK_DEAD)) + if (t->exit_state & (EXIT_ZOMBIE|EXIT_DEAD)) continue; /* diff --git a/kernel/timer.c b/kernel/timer.c index 0a390a67c0f2..e3c9b5fcd52f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -805,7 +805,7 @@ static inline void do_process_times(struct task_struct *p, psecs = (p->utime += user); psecs += (p->stime += system); - if (!unlikely(p->state & (TASK_DEAD|TASK_ZOMBIE)) && + if (p->signal && !unlikely(p->state & (EXIT_DEAD|EXIT_ZOMBIE)) && psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) { /* Send SIGXCPU every second.. */ if (!(psecs % HZ)) -- cgit v1.2.3 From c1e08b84eb04f3485c9353e7d909d75f65946039 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 18 Oct 2004 09:58:48 -0700 Subject: Don't use obsolete gcc named initializer syntax. The proper C99 syntax is much preferred. --- kernel/irq/handle.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 58ec49133f78..6f0ebc254e3e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -54,14 +54,14 @@ static void ack_none(unsigned int irq) } struct hw_interrupt_type no_irq_type = { - typename: "none", - startup: startup_none, - shutdown: shutdown_none, - enable: enable_none, - disable: disable_none, - ack: ack_none, - end: end_none, - set_affinity: NULL + .typename = "none", + .startup = startup_none, + .shutdown = shutdown_none, + .enable = enable_none, + .disable = disable_none, + .ack = ack_none, + .end = end_none, + .set_affinity = NULL }; /* -- cgit v1.2.3
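
For readers unfamiliar with the two initializer spellings touched by the last patch: the "typename:" form is a pre-C99 gcc extension that newer compilers warn about, while the ".typename =" form is standard C99. The sketch below contrasts the two; the struct and functions in it are made up for illustration and are not the kernel's hw_interrupt_type.

/*
 * Illustration only: a hypothetical ops struct, not kernel code.
 * Shows the obsolete GNU "name:" initializer spelling next to the
 * C99 designated-initializer spelling that the patch converts to.
 */
#include <stdio.h>

struct ops {
	const char *typename;
	void (*ack)(unsigned int irq);
};

static void ack_none(unsigned int irq) { (void)irq; }

#if 0	/* obsolete GNU extension; kept here only for comparison */
static struct ops no_irq_ops = { typename: "none", ack: ack_none };
#endif

/* C99 designated initializers, the preferred spelling */
static struct ops no_irq_ops = {
	.typename = "none",
	.ack      = ack_none,
};

int main(void)
{
	no_irq_ops.ack(0);
	printf("%s\n", no_irq_ops.typename);
	return 0;
}

Both spellings produce identical object code; the conversion is purely about using the standardized syntax so the initializers keep compiling cleanly on current compilers.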