From 511ca0be9d97abfefbce4cde77b2332c9d8762db Mon Sep 17 00:00:00 2001 From: "John L. Byrne" Date: Wed, 13 Oct 2004 07:26:01 -0700 Subject: [PATCH] fix oops in fork() cleanup path It will oops on an error path if the thread being forked is a process with a NULL mm. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 8c7ba481c9a5..7e73e420441e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1146,7 +1146,8 @@ fork_out: bad_fork_cleanup_namespace: exit_namespace(p); bad_fork_cleanup_mm: - mmput(p->mm); + if (p->mm) + mmput(p->mm); bad_fork_cleanup_signal: exit_signal(p); bad_fork_cleanup_sighand: -- cgit v1.2.3 From b9877c907d56b803b5b0241c2465ce768809fce9 Mon Sep 17 00:00:00 2001 From: Tim Schmielau Date: Wed, 13 Oct 2004 07:27:49 -0700 Subject: [PATCH] Fix reporting of process start times Derive process start times from the posix_clock_monotonic notion of uptime instead of "jiffies", consistent with the earlier change to /proc/uptime itself. (http://linus.bkbits.net:8080/linux-2.5/cset@3ef4851dGg0fxX58R9Zv8SIq9fzNmQ?na%0Av=index.html|src/.|src/fs|src/fs/proc|related/fs/proc/proc_misc.c) Process start times are reported to userspace in units of 1/USER_HZ since boot, thus applications as procps need the value of "uptime" to convert them into absolute time. Currently "uptime" is derived from an ntp-corrected time base, but process start time is derived from the free-running "jiffies" counter. This results in inaccurate, drifting process start times as seen by the user, even if the exported number stays constant, because the users notion of "jiffies" changes in time. 
It's John Stultz's patch anyways, which I only messed up a bit, but since people started trading signed-off lines on lkml: Signed-off-by: Tim Schmielau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 6 +++++- include/linux/acct.h | 23 +++++++++++++++-------- include/linux/sched.h | 2 +- include/linux/times.h | 20 ++++++++++++++++++++ kernel/acct.c | 10 +++++++++- kernel/fork.c | 2 +- mm/oom_kill.c | 19 +++++++++++++------ 7 files changed, 64 insertions(+), 18 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/proc/array.c b/fs/proc/array.c index fc5c7846df32..272908775622 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -360,7 +360,11 @@ int proc_pid_stat(struct task_struct *task, char * buffer) read_unlock(&tasklist_lock); /* Temporary variable needed for gcc-2.96 */ - start_time = jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES); + /* convert timespec -> nsec*/ + start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + + task->start_time.tv_nsec; + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \ diff --git a/include/linux/acct.h b/include/linux/acct.h index b46ce1ac1c6a..a6ab17c49aa1 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -172,17 +172,24 @@ static inline u32 jiffies_to_AHZ(unsigned long x) #endif } -static inline u64 jiffies_64_to_AHZ(u64 x) +static inline u64 nsec_to_AHZ(u64 x) { -#if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0 -#if HZ != AHZ - do_div(x, HZ / AHZ); -#endif -#else - x *= TICK_NSEC; +#if (NSEC_PER_SEC % AHZ) == 0 do_div(x, (NSEC_PER_SEC / AHZ)); +#elif (AHZ % 512) == 0 + x *= AHZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for AHZ <= 1024, + * overflow after 64.99 years. + * exact for AHZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
+ */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (AHZ/2)) + / AHZ)); #endif - return x; + return x; } #endif /* __KERNEL */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 90f5cb645116..8810b551082a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -508,7 +508,7 @@ struct task_struct { struct timer_list real_timer; unsigned long utime, stime; unsigned long nvcsw, nivcsw; /* context switch counts */ - u64 start_time; + struct timespec start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; /* process credentials */ diff --git a/include/linux/times.h b/include/linux/times.h index ff00f334ffaa..0c5aa078dad4 100644 --- a/include/linux/times.h +++ b/include/linux/times.h @@ -55,6 +55,26 @@ static inline u64 jiffies_64_to_clock_t(u64 x) } #endif +static inline u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#elif (USER_HZ % 512) == 0 + x *= USER_HZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
+ */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) + / USER_HZ)); +#endif + return x; +} + struct tms { clock_t tms_utime; clock_t tms_stime; diff --git a/kernel/acct.c b/kernel/acct.c index daf23c4efab4..fb6989a34f6e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -384,6 +384,8 @@ static void do_acct_process(long exitcode, struct file *file) unsigned long vsize; unsigned long flim; u64 elapsed; + u64 run_time; + struct timespec uptime; /* * First check to see if there is enough free_space to continue @@ -401,7 +403,13 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); - elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time); + /* calculate run_time in nsec*/ + do_posix_clock_monotonic_gettime(&uptime); + run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; + run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC + + current->start_time.tv_nsec; + /* convert nsec -> AHZ */ + elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 ac.ac_etime = encode_float(elapsed); #else diff --git a/kernel/fork.c b/kernel/fork.c index 7e73e420441e..70f604c3937b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -992,7 +992,7 @@ static task_t *copy_process(unsigned long clone_flags, p->utime = p->stime = 0; p->lock_depth = -1; /* -1 = no lock */ - p->start_time = get_jiffies_64(); + do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; p->io_context = NULL; p->io_wait = NULL; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 48f6dde410b3..3868e29e85be 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ /** * oom_badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate + * @p: current uptime in seconds * * The formula used is relatively simple and documented inline in the * function. 
The main rationale is that we want to select a good task @@ -41,7 +42,7 @@ * of least surprise ... (be careful when you change it) */ -static unsigned long badness(struct task_struct *p) +static unsigned long badness(struct task_struct *p, unsigned long uptime) { unsigned long points, cpu_time, run_time, s; @@ -56,12 +57,16 @@ static unsigned long badness(struct task_struct *p) points = p->mm->total_vm; /* - * CPU time is in seconds and run time is in minutes. There is no - * particular reason for this other than that it turned out to work - * very well in practice. + * CPU time is in tens of seconds and run time is in thousands + * of seconds. There is no particular reason for this other than + * that it turned out to work very well in practice. */ cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3); - run_time = (get_jiffies_64() - p->start_time) >> (SHIFT_HZ + 10); + + if (uptime >= p->start_time.tv_sec) + run_time = (uptime - p->start_time.tv_sec) >> 10; + else + run_time = 0; s = int_sqrt(cpu_time); if (s) @@ -111,10 +116,12 @@ static struct task_struct * select_bad_process(void) unsigned long maxpoints = 0; struct task_struct *g, *p; struct task_struct *chosen = NULL; + struct timespec uptime; + do_posix_clock_monotonic_gettime(&uptime); do_each_thread(g, p) if (p->pid) { - unsigned long points = badness(p); + unsigned long points = badness(p, uptime.tv_sec); if (points > maxpoints) { chosen = p; maxpoints = points; -- cgit v1.2.3 From 31180071ee5e6cc6ff4d036d655c556f582f74e4 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 18 Oct 2004 08:53:09 -0700 Subject: [PATCH] make rlimit settings per-process instead of per-thread POSIX specifies that the limit settings provided by getrlimit/setrlimit are shared by the whole process, not specific to individual threads. This patch changes the behavior of those calls to comply with POSIX. I've moved the struct rlimit array from task_struct to signal_struct, as it has the correct sharing properties. 
(This reduces kernel memory usage per thread in multithreaded processes by around 100/200 bytes for 32/64 machines respectively.) I took a fairly minimal approach to the locking issues with the newly shared struct rlimit array. It turns out that all the code that is checking limits really just needs to look at one word at a time (one rlim_cur field, usually). It's only the few places like getrlimit itself (and fork), that require atomicity in accessing a whole struct rlimit, so I just used a spin lock for them and no locking for most of the checks. If it turns out that readers of struct rlimit need more atomicity where they are now cheap, or less overhead where they are now atomic (e.g. fork), then seqcount is certainly the right thing to use for them instead of readers using the spin lock. Though it's in signal_struct, I didn't use siglock since the access to rlimits never needs to disable irqs and doesn't overlap with other siglock uses. Instead of adding something new, I overloaded task_lock(task->group_leader) for this; it is used for other things that are not likely to happen simultaneously with limit tweaking. To me that seems preferable to adding a word, but it would be trivial (and arguably cleaner) to add a separate lock for these users (or e.g. just use seqlock, which adds two words but is optimal for readers). Most of the changes here are just the trivial s/->rlim/->signal->rlim/. I stumbled across what must be a long-standing bug, in reparent_to_init. It does: memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); when surely it was intended to be: memcpy(current->rlim, init_task.rlim, sizeof(current->rlim)); As rlim is an array, the * in the sizeof expression gets the size of the first element, so this just changes the first limit (RLIMIT_CPU). This is for kernel threads, where it's clear that resetting all the rlimits is what you want. 
With that fixed, the setting of RLIMIT_FSIZE in nfsd is superfluous since it will now already have been reset to RLIM_INFINITY. The other subtlety is removing: tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY; in exit_notify, which was to avoid a race signalling during self-reaping exit. As the limit is now shared, a dying thread should not change it for others. Instead, I avoid that race by checking current->state before the RLIMIT_CPU check. (Adding one new conditional in that path is now required one way or another, since if not for this check there would also be a new race with self-reaping exit later on clearing current->signal that would have to be checked for.) The one loose end left by this patch is with process accounting. do_acct_process temporarily resets the RLIMIT_FSIZE limit while writing the accounting record. I left this as it was, but it is now changing a limit that might be shared by other threads still running. I left this in a dubious state because it seems to me that processing accounting may already be more generally a dubious state when it comes to NPTL threads. I would think you would want one record per process, with aggregate data about all threads that ever lived in it, not a separate record for each thread. I don't use process accounting myself, but if anyone is interested in testing it out I could provide a patch to change it this way. One final note, this is not 100% to POSIX compliance in regards to rlimits. POSIX specifies that RLIMIT_CPU refers to a whole process in aggregate, not to each individual thread. I will provide patches later on to achieve that change, assuming this patch goes in first. 
Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/mmap.c | 4 ++-- arch/ia64/kernel/perfmon.c | 3 ++- arch/ia64/kernel/sys_ia64.c | 2 +- arch/ia64/mm/fault.c | 4 ++-- arch/ia64/mm/init.c | 2 +- arch/mips/kernel/irixelf.c | 2 +- arch/mips/kernel/sysirix.c | 19 ++++++++++++------- arch/ppc64/mm/mmap.c | 4 ++-- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sunos.c | 2 +- arch/sparc64/kernel/binfmt_aout32.c | 6 +++--- arch/sparc64/kernel/sys_sunos32.c | 2 +- arch/sparc64/solaris/fs.c | 16 ++++++++-------- arch/x86_64/ia32/ia32_aout.c | 6 +++--- fs/binfmt_aout.c | 10 +++++----- fs/binfmt_elf.c | 2 +- fs/binfmt_flat.c | 2 +- fs/buffer.c | 2 +- fs/exec.c | 4 ++-- fs/fcntl.c | 6 +++--- fs/nfs/direct.c | 2 +- fs/nfsd/nfssvc.c | 1 - fs/open.c | 2 +- fs/proc/array.c | 4 +++- include/linux/init_task.h | 2 +- include/linux/mm.h | 2 +- include/linux/sched.h | 13 +++++++++++-- include/linux/security.h | 2 +- ipc/mqueue.c | 2 +- kernel/acct.c | 6 +++--- kernel/exit.c | 4 ++-- kernel/fork.c | 10 +++++++--- kernel/signal.c | 4 ++-- kernel/sys.c | 27 ++++++++++++++++----------- kernel/timer.c | 5 +++-- mm/filemap.c | 2 +- mm/memory.c | 2 +- mm/mlock.c | 6 +++--- mm/mmap.c | 18 +++++++++--------- mm/mremap.c | 4 ++-- mm/nommu.c | 2 +- security/selinux/hooks.c | 6 +++--- 42 files changed, 127 insertions(+), 101 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/i386/mm/mmap.c b/arch/i386/mm/mmap.c index a6270ee14323..c5e0d0119a1e 100644 --- a/arch/i386/mm/mmap.c +++ b/arch/i386/mm/mmap.c @@ -37,7 +37,7 @@ static inline unsigned long mmap_base(struct mm_struct *mm) { - unsigned long gap = current->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -59,7 +59,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) */ if (sysctl_legacy_va_layout || (current->personality & ADDR_COMPAT_LAYOUT) || - current->rlim[RLIMIT_STACK].rlim_cur == 
RLIM_INFINITY) { + current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; mm->unmap_area = arch_unmap_area; diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 204e42037e70..e86722b7fce7 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2287,7 +2287,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur) * return -ENOMEM; */ - if (size > task->rlim[RLIMIT_MEMLOCK].rlim_cur) return -ENOMEM; + if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; /* * We do the easy to undo allocations first. diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 88d8b217f105..73d6773d5609 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -138,7 +138,7 @@ ia64_brk (unsigned long brk) goto out; /* Check against rlimit.. 
*/ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) goto out; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index de73e3c91213..9e4b5c2df50e 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -32,8 +32,8 @@ expand_backing_store (struct vm_area_struct *vma, unsigned long address) unsigned long grow; grow = PAGE_SIZE >> PAGE_SHIFT; - if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur - || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur)) + if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur + || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur)) return -ENOMEM; vma->vm_end += PAGE_SIZE; vma->vm_mm->total_vm += grow; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 87a8016f58c5..3bfbb7dc1f31 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -98,7 +98,7 @@ update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte) inline void ia64_set_rbs_bot (void) { - unsigned long stack_size = current->rlim[RLIMIT_STACK].rlim_max & -16; + unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16; if (stack_size > MAX_USER_STACK_SIZE) stack_size = MAX_USER_STACK_SIZE; diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index dc9f7924e024..4858adc76fb1 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c @@ -1055,7 +1055,7 @@ static int irix_core_dump(long signr, struct pt_regs * regs, struct file *file) struct vm_area_struct *vma; struct elfhdr elf; off_t offset = 0, dataoff; - int limit = current->rlim[RLIMIT_CORE].rlim_cur; + int limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; int numnote = 4; struct memelfnote notes[4]; struct elf_prstatus prstatus; /* NT_PRSTATUS */ diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c 
index 392c73853fa6..4ddbeed9d53e 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -128,16 +128,21 @@ asmlinkage int irix_prctl(struct pt_regs *regs) if (value > RLIM_INFINITY) value = RLIM_INFINITY; if (capable(CAP_SYS_ADMIN)) { - current->rlim[RLIMIT_STACK].rlim_max = - current->rlim[RLIMIT_STACK].rlim_cur = value; + task_lock(current->group_leader); + current->signal->rlim[RLIMIT_STACK].rlim_max = + current->signal->rlim[RLIMIT_STACK].rlim_cur = value; + task_unlock(current->group_leader); error = value; break; } - if (value > current->rlim[RLIMIT_STACK].rlim_max) { + task_lock(current->group_leader); + if (value > current->signal->rlim[RLIMIT_STACK].rlim_max) { error = -EINVAL; + task_unlock(current->group_leader); break; } - current->rlim[RLIMIT_STACK].rlim_cur = value; + current->signal->rlim[RLIMIT_STACK].rlim_cur = value; + task_unlock(current->group_leader); error = value; break; } @@ -145,7 +150,7 @@ asmlinkage int irix_prctl(struct pt_regs *regs) case PR_GETSTACKSIZE: printk("irix_prctl[%s:%d]: Wants PR_GETSTACKSIZE\n", current->comm, current->pid); - error = current->rlim[RLIMIT_STACK].rlim_cur; + error = current->signal->rlim[RLIMIT_STACK].rlim_cur; break; case PR_MAXPPROCS: @@ -558,7 +563,7 @@ asmlinkage int irix_brk(unsigned long brk) /* * Check against rlimit and stack.. 
*/ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (brk - mm->end_code > rlim) { @@ -2132,7 +2137,7 @@ asmlinkage int irix_ulimit(int cmd, int arg) retval = -EINVAL; goto out; #endif - retval = current->rlim[RLIMIT_NOFILE].rlim_cur; + retval = current->signal->rlim[RLIMIT_NOFILE].rlim_cur; goto out; case 5: diff --git a/arch/ppc64/mm/mmap.c b/arch/ppc64/mm/mmap.c index f90dd1f7ab56..fe65f522aff3 100644 --- a/arch/ppc64/mm/mmap.c +++ b/arch/ppc64/mm/mmap.c @@ -37,7 +37,7 @@ static inline unsigned long mmap_base(void) { - unsigned long gap = current->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -58,7 +58,7 @@ static inline int mmap_is_legacy(void) if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (current->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) return 1; return sysctl_legacy_va_layout; diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 1039196ef053..ebe73afa774b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,7 +37,7 @@ static inline unsigned long mmap_base(void) { - unsigned long gap = current->rlim[RLIMIT_STACK].rlim_cur; + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; @@ -58,7 +58,7 @@ static inline int mmap_is_legacy(void) #endif return sysctl_legacy_va_layout || (current->personality & ADDR_COMPAT_LAYOUT) || - current->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY; + current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY; } /* diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c index b248f057618c..ad049c1f374b 100644 --- a/arch/sparc/kernel/sys_sunos.c +++ b/arch/sparc/kernel/sys_sunos.c @@ -178,7 +178,7 @@ asmlinkage int sunos_brk(unsigned long brk) * Check against rlimit and stack.. 
*/ retval = -ENOMEM; - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (brk - current->mm->end_code > rlim) diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c index 3dbf2660c24d..0cad0a4328ce 100644 --- a/arch/sparc64/kernel/binfmt_aout32.c +++ b/arch/sparc64/kernel/binfmt_aout32.c @@ -102,12 +102,12 @@ static int aout32_core_dump(long signr, struct pt_regs *regs, struct file *file) /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. */ if ((dump.u_dsize+dump.u_ssize) > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. */ if ((dump.u_ssize) > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ @@ -218,7 +218,7 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c index ace96ee27879..1bf38a5eaff7 100644 --- a/arch/sparc64/kernel/sys_sunos32.c +++ b/arch/sparc64/kernel/sys_sunos32.c @@ -142,7 +142,7 @@ asmlinkage int sunos_brk(u32 baddr) } /* Check against rlimit and stack.. 
*/ retval = -ENOMEM; - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (brk - current->mm->end_code > rlim) diff --git a/arch/sparc64/solaris/fs.c b/arch/sparc64/solaris/fs.c index a237c68acb41..d7c99fa89661 100644 --- a/arch/sparc64/solaris/fs.c +++ b/arch/sparc64/solaris/fs.c @@ -600,23 +600,23 @@ asmlinkage int solaris_ulimit(int cmd, int val) { switch (cmd) { case 1: /* UL_GETFSIZE - in 512B chunks */ - return current->rlim[RLIMIT_FSIZE].rlim_cur >> 9; + return current->signal->rlim[RLIMIT_FSIZE].rlim_cur >> 9; case 2: /* UL_SETFSIZE */ if ((unsigned long)val > (LONG_MAX>>9)) return -ERANGE; val <<= 9; - lock_kernel(); - if (val > current->rlim[RLIMIT_FSIZE].rlim_max) { + task_lock(current->group_leader); + if (val > current->signal->rlim[RLIMIT_FSIZE].rlim_max) { if (!capable(CAP_SYS_RESOURCE)) { - unlock_kernel(); + task_unlock(current->group_leader); return -EPERM; } - current->rlim[RLIMIT_FSIZE].rlim_max = val; + current->signal->rlim[RLIMIT_FSIZE].rlim_max = val; } - current->rlim[RLIMIT_FSIZE].rlim_cur = val; - unlock_kernel(); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = val; + task_unlock(current->group_leader); return 0; case 3: /* UL_GMEMLIM */ - return current->rlim[RLIMIT_DATA].rlim_cur; + return current->signal->rlim[RLIMIT_DATA].rlim_cur; case 4: /* UL_GDESLIM */ return NR_OPEN; } diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index 040e56f759e7..5c7a2c77fd21 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -168,12 +168,12 @@ static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. 
*/ if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. */ if ((dump.u_ssize+1) * PAGE_SIZE > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ @@ -281,7 +281,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 3d99e70d205a..81dad9b3e757 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -118,22 +118,22 @@ static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) if we wrote the stack, but not the data area. */ #ifdef __sparc__ if ((dump.u_dsize+dump.u_ssize) > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_dsize = 0; #else if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_dsize = 0; #endif /* Make sure we have enough room to write the stack and data areas. */ #ifdef __sparc__ if ((dump.u_ssize) > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_ssize = 0; #else if ((dump.u_ssize+1) * PAGE_SIZE > - current->rlim[RLIMIT_CORE].rlim_cur) + current->signal->rlim[RLIMIT_CORE].rlim_cur) dump.u_ssize = 0; #endif @@ -278,7 +278,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. 
*/ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5f492733d625..055c9a0e183e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1314,7 +1314,7 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file) struct vm_area_struct *vma; struct elfhdr *elf = NULL; off_t offset = 0, dataoff; - unsigned long limit = current->rlim[RLIMIT_CORE].rlim_cur; + unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; int numnote; struct memelfnote *notes = NULL; struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 64fea54db0d6..fb440602cfc1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -486,7 +486,7 @@ static int load_flat_file(struct linux_binprm * bprm, * size limits imposed on them by creating programs with large * arrays in the data or bss. 
*/ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim >= RLIM_INFINITY) rlim = ~0; if (data_len + bss_len > rlim) diff --git a/fs/buffer.c b/fs/buffer.c index 81f31297b81e..01c725823f19 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2232,7 +2232,7 @@ int generic_cont_expand(struct inode *inode, loff_t size) int err; err = -EFBIG; - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && size > (loff_t)limit) { send_sig(SIGXFSZ, current, 0); goto out; diff --git a/fs/exec.c b/fs/exec.c index bdf32393b3c5..55f7e05979f2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -377,7 +377,7 @@ int setup_arg_pages(struct linux_binprm *bprm, int executable_stack) bprm->p = PAGE_SIZE * i - offset; /* Limit stack size to 1GB */ - stack_base = current->rlim[RLIMIT_STACK].rlim_max; + stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; if (stack_base > (1 << 30)) stack_base = 1 << 30; stack_base = PAGE_ALIGN(STACK_TOP - stack_base); @@ -1393,7 +1393,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) current->signal->group_exit_code = exit_code; coredump_wait(mm); - if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) + if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) goto fail_unlock; /* diff --git a/fs/fcntl.c b/fs/fcntl.c index eee115d6a224..0287312d03b3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -86,7 +86,7 @@ static int locate_fd(struct files_struct *files, int error; error = -EINVAL; - if (orig_start >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out; repeat: @@ -105,7 +105,7 @@ repeat: } error = -EMFILE; - if (newfd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out; error = expand_files(files, newfd); @@ -161,7 +161,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, 
unsigned int newfd) if (newfd == oldfd) goto out_unlock; err = -EBADF; - if (newfd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out_unlock; get_file(file); /* We are now finished with oldfd */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 21c3c4d396be..d6862fe4df4b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -545,7 +545,7 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, { ssize_t retval = -EINVAL; loff_t *ppos = &iocb->ki_pos; - unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = (struct nfs_open_context *) file->private_data; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 8a06919c7af3..fe03e31f20b9 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -180,7 +180,6 @@ nfsd(struct svc_rqst *rqstp) /* Lock module and set up kernel thread */ lock_kernel(); daemonize("nfsd"); - current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* After daemonize() this kernel thread shares current->fs * with the init process. We need to create files with a diff --git a/fs/open.c b/fs/open.c index 8e539937ceca..817bef771912 100644 --- a/fs/open.c +++ b/fs/open.c @@ -852,7 +852,7 @@ repeat: * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ - if (fd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out; /* Do we need to expand the fdset array? 
*/ diff --git a/fs/proc/array.c b/fs/proc/array.c index 272908775622..d7ee3a838e3f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -313,6 +313,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) struct mm_struct *mm; unsigned long long start_time; unsigned long cmin_flt = 0, cmaj_flt = 0, cutime = 0, cstime = 0; + unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; state = *get_task_state(task); @@ -347,6 +348,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) cmaj_flt = task->signal->cmaj_flt; cutime = task->signal->cutime; cstime = task->signal->cstime; + rsslim = task->signal->rlim[RLIMIT_RSS].rlim_cur; } read_unlock(&tasklist_lock); @@ -393,7 +395,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) start_time, vsize, mm ? mm->rss : 0, /* you might want to shift this left 3 */ - task->rlim[RLIMIT_RSS].rlim_cur, + rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, mm ? mm->start_stack : 0, diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9937c8df8d7c..803d8efb1c4a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,6 +50,7 @@ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ .signal = {{0}}}, \ .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ + .rlim = INIT_RLIMITS, \ } #define INIT_SIGHAND(sighand) { \ @@ -96,7 +97,6 @@ extern struct group_info init_groups; .cap_inheritable = CAP_INIT_INH_SET, \ .cap_permitted = CAP_FULL_SET, \ .keep_capabilities = 0, \ - .rlim = INIT_RLIMITS, \ .user = INIT_USER, \ .comm = "swapper", \ .thread = INIT_THREAD, \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 65ff5b5e896a..158ee1c501f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -540,7 +540,7 @@ static inline int can_do_mlock(void) { if (capable(CAP_IPC_LOCK)) return 1; - if (current->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) + if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) return 1; return 0; } diff --git a/include/linux/sched.h 
b/include/linux/sched.h index 8810b551082a..389a70ebb189 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -312,6 +312,17 @@ struct signal_struct { unsigned long utime, stime, cutime, cstime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + + /* + * We don't bother to synchronize most readers of this at all, + * because there is no reader checking a limit that actually needs + * to get both rlim_cur and rlim_max atomically, and either one + * alone is a single word that can safely be read normally. + * getrlimit/setrlimit use task_lock(current->group_leader) to + * protect this instead of the siglock, because they really + * have no need to disable irqs. + */ + struct rlimit rlim[RLIM_NLIMITS]; }; /* @@ -518,8 +529,6 @@ struct task_struct { kernel_cap_t cap_effective, cap_inheritable, cap_permitted; unsigned keep_capabilities:1; struct user_struct *user; -/* limits */ - struct rlimit rlim[RLIM_NLIMITS]; unsigned short used_math; char comm[16]; /* file system info */ diff --git a/include/linux/security.h b/include/linux/security.h index 983d7c2265bc..a1dee9a60587 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -582,7 +582,7 @@ struct swap_info_struct; * @task_setrlimit: * Check permission before setting the resource limits of the current * process for @resource to @new_rlim. The old resource limit values can - * be examined by dereferencing (current->rlim + resource). + * be examined by dereferencing (current->signal->rlim + resource). * @resource contains the resource whose limit is being set. * @new_rlim contains the new limits for @resource. * Return 0 if permission is granted. 
diff --git a/ipc/mqueue.c b/ipc/mqueue.c index aee2696c30b1..5cfcd4b68cb3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -145,7 +145,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, spin_lock(&mq_lock); if (u->mq_bytes + mq_bytes < u->mq_bytes || u->mq_bytes + mq_bytes > - p->rlim[RLIMIT_MSGQUEUE].rlim_cur) { + p->signal->rlim[RLIMIT_MSGQUEUE].rlim_cur) { spin_unlock(&mq_lock); goto out_inode; } diff --git a/kernel/acct.c b/kernel/acct.c index fb6989a34f6e..52d49907c0f7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -488,11 +488,11 @@ static void do_acct_process(long exitcode, struct file *file) /* * Accounting records are not subject to resource limits. */ - flim = current->rlim[RLIMIT_FSIZE].rlim_cur; - current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; file->f_op->write(file, (char *)&ac, sizeof(acct_t), &file->f_pos); - current->rlim[RLIMIT_FSIZE].rlim_cur = flim; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); } diff --git a/kernel/exit.c b/kernel/exit.c index 6ec1f96fa92b..e018bf6169d2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -237,7 +237,8 @@ void reparent_to_init(void) /* rt_priority? */ /* signals? 
*/ security_task_reparent_to_init(current); - memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + memcpy(current->signal->rlim, init_task.signal->rlim, + sizeof(current->signal->rlim)); atomic_inc(&(INIT_USER->__count)); switch_uid(INIT_USER); @@ -761,7 +762,6 @@ static void exit_notify(struct task_struct *tsk) */ tsk->it_virt_value = 0; tsk->it_prof_value = 0; - tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY; write_unlock_irq(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index 70f604c3937b..6fd57c2d22cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -249,8 +249,8 @@ void __init fork_init(unsigned long mempages) if(max_threads < 20) max_threads = 20; - init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; - init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -872,6 +872,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + task_lock(current->group_leader); + memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); + task_unlock(current->group_leader); + return 0; } @@ -941,7 +945,7 @@ static task_t *copy_process(unsigned long clone_flags, retval = -EAGAIN; if (atomic_read(&p->user->processes) >= - p->rlim[RLIMIT_NPROC].rlim_cur) { + p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->user != &root_user) goto bad_fork_free; diff --git a/kernel/signal.c b/kernel/signal.c index df2cd4216838..44c0223cca3e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -269,7 +269,7 @@ static struct sigqueue *__sigqueue_alloc(void) struct sigqueue *q = NULL; if (atomic_read(¤t->user->sigpending) < - 
current->rlim[RLIMIT_SIGPENDING].rlim_cur) + current->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); if (q) { INIT_LIST_HEAD(&q->list); @@ -764,7 +764,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. */ if (atomic_read(&t->user->sigpending) < - t->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); if (q) { diff --git a/kernel/sys.c b/kernel/sys.c index 571bba8c8989..e2b57d6f6038 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -649,7 +649,7 @@ static int set_user(uid_t new_ruid, int dumpclear) return -EAGAIN; if (atomic_read(&new_user->processes) >= - current->rlim[RLIMIT_NPROC].rlim_cur && + current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != &root_user) { free_uid(new_user); return -EAGAIN; @@ -1496,9 +1496,13 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) { if (resource >= RLIM_NLIMITS) return -EINVAL; - else - return copy_to_user(rlim, current->rlim + resource, sizeof(*rlim)) - ? -EFAULT : 0; + else { + struct rlimit value; + task_lock(current->group_leader); + value = current->signal->rlim[resource]; + task_unlock(current->group_leader); + return copy_to_user(rlim, &value, sizeof(*rlim)) ? 
-EFAULT : 0; + } } #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT @@ -1513,7 +1517,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r if (resource >= RLIM_NLIMITS) return -EINVAL; - memcpy(&x, current->rlim + resource, sizeof(*rlim)); + task_lock(current->group_leader); + x = current->signal->rlim[resource]; + task_unlock(current->group_leader); if(x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; if(x.rlim_max > 0x7FFFFFFF) @@ -1534,21 +1540,20 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) return -EFAULT; if (new_rlim.rlim_cur > new_rlim.rlim_max) return -EINVAL; - old_rlim = current->rlim + resource; - if (((new_rlim.rlim_cur > old_rlim->rlim_max) || - (new_rlim.rlim_max > old_rlim->rlim_max)) && + old_rlim = current->signal->rlim + resource; + if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE) { - if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN) + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) return -EPERM; - } retval = security_task_setrlimit(resource, &new_rlim); if (retval) return retval; + task_lock(current->group_leader); *old_rlim = new_rlim; + task_unlock(current->group_leader); return 0; } diff --git a/kernel/timer.c b/kernel/timer.c index 15182647509d..9616e707154e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -805,12 +805,13 @@ static inline void do_process_times(struct task_struct *p, psecs = (p->utime += user); psecs += (p->stime += system); - if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_cur) { + if (!unlikely(p->state & (TASK_DEAD|TASK_ZOMBIE)) && + psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) { /* Send SIGXCPU every second.. */ if (!(psecs % HZ)) send_sig(SIGXCPU, p, 1); /* and SIGKILL when we go over max.. 
*/ - if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_max) + if (psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_max) send_sig(SIGKILL, p, 1); } } diff --git a/mm/filemap.c b/mm/filemap.c index 272c3e0a6fed..ba5b903ff585 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1804,7 +1804,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) { struct inode *inode = file->f_mapping->host; - unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (unlikely(*pos < 0)) return -EINVAL; diff --git a/mm/memory.c b/mm/memory.c index 0a7013a26019..f10dc9bb7fe2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1236,7 +1236,7 @@ int vmtruncate(struct inode * inode, loff_t offset) goto out_truncate; do_expand: - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) diff --git a/mm/mlock.c b/mm/mlock.c index 873245bc9697..662a52109d36 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -115,7 +115,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) locked = len >> PAGE_SHIFT; locked += current->mm->locked_vm; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; /* check against resource limits */ @@ -176,7 +176,7 @@ asmlinkage long sys_mlockall(int flags) down_write(¤t->mm->mmap_sem); - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; @@ -211,7 +211,7 @@ int user_shm_lock(size_t size, struct user_struct *user) spin_lock(&shmlock_user_lock); locked = size >> PAGE_SHIFT; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = 
current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) goto out; diff --git a/mm/mmap.c b/mm/mmap.c index 8ccbb00c208f..7e2f336cfbbf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -136,7 +136,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk) } /* Check against rlimit.. */ - rlim = current->rlim[RLIMIT_DATA].rlim_cur; + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) goto out; @@ -833,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += len; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -905,7 +905,7 @@ munmap_back: /* Check against address space limit. */ if ((mm->total_vm << PAGE_SHIFT) + len - > current->rlim[RLIMIT_AS].rlim_cur) + > current->signal->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; if (accountable && (!(flags & MAP_NORESERVE) || @@ -1350,9 +1350,9 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address) return -ENOMEM; } - if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || + if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > - current->rlim[RLIMIT_AS].rlim_cur) { + current->signal->rlim[RLIMIT_AS].rlim_cur) { anon_vma_unlock(vma); vm_unacct_memory(grow); return -ENOMEM; @@ -1412,9 +1412,9 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || + if (vma->vm_end - address > current->signal->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > - current->rlim[RLIMIT_AS].rlim_cur) { + 
current->signal->rlim[RLIMIT_AS].rlim_cur) { anon_vma_unlock(vma); vm_unacct_memory(grow); return -ENOMEM; @@ -1760,7 +1760,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (mm->def_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += len; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -1779,7 +1779,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) /* Check against address space limits *after* clearing old maps... */ if ((mm->total_vm << PAGE_SHIFT) + len - > current->rlim[RLIMIT_AS].rlim_cur) + > current->signal->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) diff --git a/mm/mremap.c b/mm/mremap.c index 0b0c0e27ecb5..558830585bac 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -327,7 +327,7 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = current->mm->locked_vm << PAGE_SHIFT; - lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += new_len - old_len; ret = -EAGAIN; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) @@ -335,7 +335,7 @@ unsigned long do_mremap(unsigned long addr, } ret = -ENOMEM; if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) - > current->rlim[RLIMIT_AS].rlim_cur) + > current->signal->rlim[RLIMIT_AS].rlim_cur) goto out; if (vma->vm_flags & VM_ACCOUNT) { diff --git a/mm/nommu.c b/mm/nommu.c index dd4b66b8c9a6..9cac9ae08bb0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -57,7 +57,7 @@ int vmtruncate(struct inode *inode, loff_t offset) goto out_truncate; do_expand: - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > 
inode->i_sb->s_maxbytes) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f844e402b53c..349d54dc71c0 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1909,8 +1909,8 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe) PROCESS__RLIMITINH, NULL, NULL); if (rc) { for (i = 0; i < RLIM_NLIMITS; i++) { - rlim = current->rlim + i; - initrlim = init_task.rlim+i; + rlim = current->signal->rlim + i; + initrlim = init_task.signal->rlim+i; rlim->rlim_cur = min(rlim->rlim_max,initrlim->rlim_cur); } } @@ -2699,7 +2699,7 @@ static int selinux_task_setnice(struct task_struct *p, int nice) static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim) { - struct rlimit *old_rlim = current->rlim + resource; + struct rlimit *old_rlim = current->signal->rlim + resource; int rc; rc = secondary_ops->task_setrlimit(resource, new_rlim); -- cgit v1.2.3 From 2dbc57298d84fe37d855c8bfa3b280dfd47ded4b Mon Sep 17 00:00:00 2001 From: Gregory Kurz Date: Mon, 18 Oct 2004 08:55:24 -0700 Subject: [PATCH] fork() bug invalidates file descriptors Take a process P1 that spawns a thread T (aka. a clone with CLONE_FILES). If P1 forks another process P2 (aka. not a clone) while T is blocked in a open() that should return file descriptor FD, then FD will be unusable in P2. This leads to strange behaviors in the context of P2: close(FD) returns EBADF, while dup2(a_valid_fd, FD) returns EBUSY and of course FD is never returned again by any syscall... testcase: #include #include #include #include #include #include #include #include #include #define FIFO "/tmp/bug_fifo" #define FD 0 /* * This program is meant to show that calling fork() while a clone spawned * with CLONE_FILES is blocked in open() makes a fd number unusable in the * child. 
* * * Parent Clone Child * | * clone(CLONE_FILES)- --- kernel/fork.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 6fd57c2d22cc..bd33d8a507d7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -762,8 +762,17 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; - if (f) + if (f) { get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. + */ + FD_CLR(open_files - i, newf->open_fds); + } *new_fds++ = f; } spin_unlock(&oldf->file_lock); -- cgit v1.2.3 From d3069b4dd0767b4e24debdb21b632b2f8dd72474 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Oct 2004 09:12:06 -0700 Subject: [PATCH] fix & clean up zombie/dead task handling & preemption This patch fixes all the preempt-after-task->state-is-TASK_DEAD problems we had. Right now, the moment procfs does a down() that sleeps in proc_pid_flush() [it could] our TASK_DEAD state is zapped and we might be back to TASK_RUNNING to and we trigger this assert: schedule(); BUG(); /* Avoid "noreturn function does return". */ for (;;) ; I have split out TASK_ZOMBIE and TASK_DEAD into a separate p->exit_state field, to allow the detaching of exit-signal/parent/wait-handling from descheduling a dead task. Dead-task freeing is done via PF_DEAD. Tested the patch on x86 SMP and UP, but all architectures should work fine. 
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 2 +- arch/arm/kernel/ptrace.c | 2 +- arch/arm26/kernel/ptrace.c | 2 +- arch/cris/arch-v10/kernel/ptrace.c | 2 +- arch/h8300/kernel/ptrace.c | 2 +- arch/i386/kernel/ptrace.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/kernel/ptrace.c | 2 +- arch/m32r/kernel/ptrace.c | 2 +- arch/m68k/kernel/ptrace.c | 2 +- arch/m68knommu/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace32.c | 2 +- arch/parisc/kernel/ptrace.c | 2 +- arch/ppc/kernel/ptrace.c | 2 +- arch/ppc64/kernel/ptrace.c | 2 +- arch/ppc64/kernel/ptrace32.c | 2 +- arch/s390/kernel/ptrace.c | 2 +- arch/sh/kernel/ptrace.c | 2 +- arch/sh64/kernel/ptrace.c | 2 +- arch/sparc/kernel/ptrace.c | 2 +- arch/sparc64/kernel/ptrace.c | 2 +- arch/um/kernel/ptrace.c | 2 +- arch/um/kernel/tt/process_kern.c | 6 +-- arch/v850/kernel/ptrace.c | 2 +- arch/x86_64/kernel/ptrace.c | 2 +- fs/exec.c | 8 ++-- fs/proc/array.c | 14 +++---- include/linux/sched.h | 5 ++- kernel/exit.c | 85 +++++++++++++++++++++----------------- kernel/fork.c | 3 +- kernel/power/process.c | 4 +- kernel/ptrace.c | 2 +- kernel/sched.c | 12 +++--- kernel/signal.c | 4 +- kernel/timer.c | 2 +- 36 files changed, 104 insertions(+), 91 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index 9c21fbd4f34f..d00583161574 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -355,7 +355,7 @@ do_sys_ptrace(long request, long pid, long addr, long data, */ case PTRACE_KILL: ret = 0; - if (child->state == TASK_ZOMBIE) + if (child->exit_state == EXIT_ZOMBIE) break; child->exit_code = SIGKILL; /* make sure single-step breakpoint is gone. 
*/ diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 7aeb80f8551a..c82f70b5d66c 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -677,7 +677,7 @@ static int do_ptrace(int request, struct task_struct *child, long addr, long dat /* make sure single-step breakpoint is gone. */ child->ptrace &= ~PT_SINGLESTEP; ptrace_cancel_bpt(child); - if (child->state != TASK_ZOMBIE) { + if (child->exit_state != EXIT_ZOMBIE) { child->exit_code = SIGKILL; wake_up_process(child); } diff --git a/arch/arm26/kernel/ptrace.c b/arch/arm26/kernel/ptrace.c index e2ffb2007465..8597ab3574fe 100644 --- a/arch/arm26/kernel/ptrace.c +++ b/arch/arm26/kernel/ptrace.c @@ -614,7 +614,7 @@ static int do_ptrace(int request, struct task_struct *child, long addr, long dat /* make sure single-step breakpoint is gone. */ child->ptrace &= ~PT_SINGLESTEP; ptrace_cancel_bpt(child); - if (child->state != TASK_ZOMBIE) { + if (child->exit_state != EXIT_ZOMBIE) { child->exit_code = SIGKILL; wake_up_process(child); } diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c index 3c7e0a51a282..55fd631224b3 100644 --- a/arch/cris/arch-v10/kernel/ptrace.c +++ b/arch/cris/arch-v10/kernel/ptrace.c @@ -176,7 +176,7 @@ sys_ptrace(long request, long pid, long addr, long data) case PTRACE_KILL: ret = 0; - if (child->state == TASK_ZOMBIE) + if (child->exit_state == EXIT_ZOMBIE) break; child->exit_code = SIGKILL; diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c index b60d234d1d27..5f19d774a288 100644 --- a/arch/h8300/kernel/ptrace.c +++ b/arch/h8300/kernel/ptrace.c @@ -192,7 +192,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; h8300_disable_trace(child); diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 
e0c9a03739a8..77c48ce67e08 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -390,7 +390,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) long tmp; ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; clear_tsk_thread_flag(child, TIF_SINGLESTEP); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index e86722b7fce7..2685e6ea8bbd 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2593,7 +2593,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) return -EINVAL; } - if (task->state == TASK_ZOMBIE) { + if (task->exit_state == EXIT_ZOMBIE) { DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); return -EBUSY; } diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index d8ff70dfd84b..1e515083e23f 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1422,7 +1422,7 @@ sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data, * sigkill. Perhaps it should be put in the status * that it wants to exit. 
*/ - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ goto out_tsk; child->exit_code = SIGKILL; diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index ab4137a3398a..6d58f972b35c 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -713,7 +713,7 @@ do_ptrace(long request, struct task_struct *child, long addr, long data) ret = 0; unregister_all_debug_traps(child); invalidate_cache(); - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c index b043a7f9409f..0beb53333ba3 100644 --- a/arch/m68k/kernel/ptrace.c +++ b/arch/m68k/kernel/ptrace.c @@ -277,7 +277,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) long tmp; ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. */ diff --git a/arch/m68knommu/kernel/ptrace.c b/arch/m68knommu/kernel/ptrace.c index 773dc91e46e9..15cf79080b15 100644 --- a/arch/m68knommu/kernel/ptrace.c +++ b/arch/m68knommu/kernel/ptrace.c @@ -264,7 +264,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) long tmp; ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. 
*/ diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 6e8511390a57..1ce7bd150940 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -277,7 +277,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index df7a51cceeb0..799919d9beaa 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -262,7 +262,7 @@ asmlinkage int sys32_ptrace(int request, int pid, int addr, int data) */ case PTRACE_KILL: ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 598aadd36aae..a9200a3ae6d8 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -303,7 +303,7 @@ long sys_ptrace(long request, pid_t pid, long addr, long data) * that it wants to exit. */ DBG(("sys_ptrace(KILL)\n")); - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ goto out_tsk; child->exit_code = SIGKILL; goto out_wake_notrap; diff --git a/arch/ppc/kernel/ptrace.c b/arch/ppc/kernel/ptrace.c index 5cf5624e90d1..118d97ed10fd 100644 --- a/arch/ppc/kernel/ptrace.c +++ b/arch/ppc/kernel/ptrace.c @@ -377,7 +377,7 @@ int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. 
*/ diff --git a/arch/ppc64/kernel/ptrace.c b/arch/ppc64/kernel/ptrace.c index aeead8375a5f..0a4011f58f2f 100644 --- a/arch/ppc64/kernel/ptrace.c +++ b/arch/ppc64/kernel/ptrace.c @@ -182,7 +182,7 @@ int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. */ diff --git a/arch/ppc64/kernel/ptrace32.c b/arch/ppc64/kernel/ptrace32.c index e0ab44dc1fc4..ee81b1b776cc 100644 --- a/arch/ppc64/kernel/ptrace32.c +++ b/arch/ppc64/kernel/ptrace32.c @@ -314,7 +314,7 @@ int sys32_ptrace(long request, long pid, unsigned long addr, unsigned long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; /* make sure the single step bit is not set. */ diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 26b89e55d9c1..1acb2d8932b1 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -626,7 +626,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data) * perhaps it should be put in the status that it wants to * exit. */ - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ return 0; child->exit_code = SIGKILL; /* make sure the single step bit is not set. 
*/ diff --git a/arch/sh/kernel/ptrace.c b/arch/sh/kernel/ptrace.c index 017826912cc5..d14810a459d9 100644 --- a/arch/sh/kernel/ptrace.c +++ b/arch/sh/kernel/ptrace.c @@ -217,7 +217,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/sh64/kernel/ptrace.c b/arch/sh64/kernel/ptrace.c index c48bc4a27b44..f27c696b3e00 100644 --- a/arch/sh64/kernel/ptrace.c +++ b/arch/sh64/kernel/ptrace.c @@ -257,7 +257,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/sparc/kernel/ptrace.c b/arch/sparc/kernel/ptrace.c index 589652ea05b8..62b8fd1dba7f 100644 --- a/arch/sparc/kernel/ptrace.c +++ b/arch/sparc/kernel/ptrace.c @@ -567,7 +567,7 @@ asmlinkage void do_ptrace(struct pt_regs *regs) * exit. */ case PTRACE_KILL: { - if (child->state == TASK_ZOMBIE) { /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) { /* already dead */ pt_succ_return(regs, 0); goto out_tsk; } diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c index fd73a3ab8f1f..721ab5ba859f 100644 --- a/arch/sparc64/kernel/ptrace.c +++ b/arch/sparc64/kernel/ptrace.c @@ -559,7 +559,7 @@ asmlinkage void do_ptrace(struct pt_regs *regs) * exit. 
*/ case PTRACE_KILL: { - if (child->state == TASK_ZOMBIE) { /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) { /* already dead */ pt_succ_return(regs, 0); goto out_tsk; } diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 40912d18cb2f..b4efb669be41 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -158,7 +158,7 @@ int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: { ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/um/kernel/tt/process_kern.c b/arch/um/kernel/tt/process_kern.c index 6224afcd84d7..d547435ee5a5 100644 --- a/arch/um/kernel/tt/process_kern.c +++ b/arch/um/kernel/tt/process_kern.c @@ -65,7 +65,7 @@ void *switch_to_tt(void *prev, void *next, void *last) panic("write of switch_pipe failed, err = %d", -err); reading = 1; - if((from->state == TASK_ZOMBIE) || (from->state == TASK_DEAD)) + if((from->exit_state == EXIT_ZOMBIE) || (from->exit_state == EXIT_DEAD)) os_kill_process(os_getpid(), 0); err = os_read_file(from->thread.mode.tt.switch_pipe[0], &c, sizeof(c)); @@ -80,8 +80,8 @@ void *switch_to_tt(void *prev, void *next, void *last) * in case it has not already killed itself. */ prev_sched = current->thread.prev_sched; - if((prev_sched->state == TASK_ZOMBIE) || - (prev_sched->state == TASK_DEAD)) + if((prev_sched->exit_state == EXIT_ZOMBIE) || + (prev_sched->exit_state == EXIT_DEAD)) os_kill_process(prev_sched->thread.mode.tt.extern_pid, 1); /* This works around a nasty race with 'jail'. 
If we are switching diff --git a/arch/v850/kernel/ptrace.c b/arch/v850/kernel/ptrace.c index c0ee8f0b7f4c..ab60fa953e74 100644 --- a/arch/v850/kernel/ptrace.c +++ b/arch/v850/kernel/ptrace.c @@ -232,7 +232,7 @@ int sys_ptrace(long request, long pid, long addr, long data) */ case PTRACE_KILL: rval = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; child->exit_code = SIGKILL; wake_up_process(child); diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 73ff5494a6c1..ebd621e96787 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -395,7 +395,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data long tmp; ret = 0; - if (child->state == TASK_ZOMBIE) /* already dead */ + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ break; clear_tsk_thread_flag(child, TIF_SINGLESTEP); child->exit_code = SIGKILL; diff --git a/fs/exec.c b/fs/exec.c index 125a04e3692b..e715541b2db4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -632,14 +632,14 @@ static inline int de_thread(struct task_struct *tsk) if (current->pid != current->tgid) { struct task_struct *leader = current->group_leader, *parent; struct dentry *proc_dentry1, *proc_dentry2; - unsigned long state, ptrace; + unsigned long exit_state, ptrace; /* * Wait for the thread group leader to be a zombie. * It should already be zombie at this point, most * of the time. 
*/ - while (leader->state != TASK_ZOMBIE) + while (leader->exit_state != EXIT_ZOMBIE) yield(); spin_lock(&leader->proc_lock); @@ -683,7 +683,7 @@ static inline int de_thread(struct task_struct *tsk) list_del(&current->tasks); list_add_tail(&current->tasks, &init_task.tasks); current->exit_signal = SIGCHLD; - state = leader->state; + exit_state = leader->exit_state; write_unlock_irq(&tasklist_lock); spin_unlock(&leader->proc_lock); @@ -691,7 +691,7 @@ static inline int de_thread(struct task_struct *tsk) proc_pid_flush(proc_dentry1); proc_pid_flush(proc_dentry2); - if (state != TASK_ZOMBIE) + if (exit_state != EXIT_ZOMBIE) BUG(); release_task(leader); } diff --git a/fs/proc/array.c b/fs/proc/array.c index d05dab0697d3..5c289c73a00e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -137,13 +137,13 @@ static const char *task_state_array[] = { static inline const char * get_task_state(struct task_struct *tsk) { - unsigned int state = tsk->state & (TASK_RUNNING | - TASK_INTERRUPTIBLE | - TASK_UNINTERRUPTIBLE | - TASK_ZOMBIE | - TASK_DEAD | - TASK_STOPPED | - TASK_TRACED); + unsigned int state = (tsk->state & (TASK_RUNNING | + TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE | + TASK_STOPPED | + TASK_TRACED)) | + (tsk->exit_state & (EXIT_ZOMBIE | + EXIT_DEAD)); const char **p = &task_state_array[0]; while (state) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 69648ed7fd2b..dc3f297a726d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -108,8 +108,8 @@ extern unsigned long nr_iowait(void); #define TASK_UNINTERRUPTIBLE 2 #define TASK_STOPPED 4 #define TASK_TRACED 8 -#define TASK_ZOMBIE 16 -#define TASK_DEAD 32 +#define EXIT_ZOMBIE 16 +#define EXIT_DEAD 32 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -564,6 +564,7 @@ struct task_struct { /* task state */ struct linux_binfmt *binfmt; + long exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ /* ???
*/ diff --git a/kernel/exit.c b/kernel/exit.c index 031e17486eee..55d853392524 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -76,7 +76,7 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) { + if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { BUG_ON(leader->exit_signal == -1); do_notify_parent(leader, leader->exit_signal); /* @@ -158,7 +158,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task - || p->state >= TASK_ZOMBIE + || p->exit_state >= EXIT_ZOMBIE || p->real_parent->pid == 1) continue; if (process_group(p->real_parent) != pgrp @@ -519,7 +519,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re * Make sure we're not reparenting to ourselves and that * the parent is not a zombie. */ - BUG_ON(p == reaper || reaper->state >= TASK_ZOMBIE); + BUG_ON(p == reaper || reaper->state >= EXIT_ZOMBIE || reaper->exit_state >= EXIT_ZOMBIE); p->real_parent = reaper; if (p->parent == p->real_parent) BUG(); @@ -554,7 +554,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) /* If we'd notified the old parent about this child's death, * also notify the new parent. 
*/ - if (p->state == TASK_ZOMBIE && p->exit_signal != -1 && + if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); else if (p->state == TASK_TRACED) { @@ -602,7 +602,7 @@ static inline void forget_original_parent(struct task_struct * father, reaper = child_reaper; break; } - } while (reaper->state >= TASK_ZOMBIE); + } while (reaper->exit_state >= EXIT_ZOMBIE); /* * There are only two places where our children can be: @@ -628,7 +628,7 @@ static inline void forget_original_parent(struct task_struct * father, } else { /* reparent ptraced task to its real parent */ __ptrace_unlink (p); - if (p->state == TASK_ZOMBIE && p->exit_signal != -1 && + if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); } @@ -639,7 +639,7 @@ static inline void forget_original_parent(struct task_struct * father, * zombie forever since we prevented it from self-reap itself * while it was being traced by us, to be able to see it in wait4. */ - if (unlikely(ptrace && p->state == TASK_ZOMBIE && p->exit_signal == -1)) + if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) list_add(&p->ptrace_list, to_release); } list_for_each_safe(_p, _n, &father->ptrace_children) { @@ -752,10 +752,10 @@ static void exit_notify(struct task_struct *tsk) do_notify_parent(tsk, SIGCHLD); } - state = TASK_ZOMBIE; + state = EXIT_ZOMBIE; if (tsk->exit_signal == -1 && tsk->ptrace == 0) - state = TASK_DEAD; - tsk->state = state; + state = EXIT_DEAD; + tsk->exit_state = state; /* * Clear these here so that update_process_times() won't try to deliver @@ -773,7 +773,7 @@ static void exit_notify(struct task_struct *tsk) } /* If the process is dead, release it - nobody will wait for it */ - if (state == TASK_DEAD) + if (state == EXIT_DEAD) release_task(tsk); /* PF_DEAD causes final put_task_struct after we schedule. 
*/ @@ -830,6 +830,8 @@ asmlinkage NORET_TYPE void do_exit(long code) mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif + + BUG_ON(!(current->flags & PF_DEAD)); schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -973,7 +975,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, } /* - * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold + * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. @@ -992,7 +994,7 @@ static int wait_task_zombie(task_t *p, int noreap, int exit_code = p->exit_code; int why, status; - if (unlikely(p->state != TASK_ZOMBIE)) + if (unlikely(p->exit_state != EXIT_ZOMBIE)) return 0; if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) return 0; @@ -1013,9 +1015,9 @@ static int wait_task_zombie(task_t *p, int noreap, * Try to move the task's state to DEAD * only one thread is allowed to do this: */ - state = xchg(&p->state, TASK_DEAD); - if (state != TASK_ZOMBIE) { - BUG_ON(state != TASK_DEAD); + state = xchg(&p->exit_state, EXIT_DEAD); + if (state != EXIT_ZOMBIE) { + BUG_ON(state != EXIT_DEAD); return 0; } if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) { @@ -1060,7 +1062,7 @@ static int wait_task_zombie(task_t *p, int noreap, /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to TASK_DEAD. + * thread can reap it because we set its state to EXIT_DEAD. */ read_unlock(&tasklist_lock); @@ -1092,7 +1094,8 @@ static int wait_task_zombie(task_t *p, int noreap, if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (retval) { - p->state = TASK_ZOMBIE; + // TODO: is this safe? 
+ p->exit_state = EXIT_ZOMBIE; return retval; } retval = p->pid; @@ -1101,7 +1104,8 @@ static int wait_task_zombie(task_t *p, int noreap, /* Double-check with lock held. */ if (p->real_parent != p->parent) { __ptrace_unlink(p); - p->state = TASK_ZOMBIE; + // TODO: is this safe? + p->exit_state = EXIT_ZOMBIE; /* * If this is not a detached task, notify the parent. * If it's still not detached after that, don't release @@ -1172,13 +1176,13 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, /* * This uses xchg to be atomic with the thread resuming and setting * it. It must also be done with the write lock held to prevent a - * race with the TASK_ZOMBIE case. + * race with the EXIT_ZOMBIE case. */ exit_code = xchg(&p->exit_code, 0); - if (unlikely(p->state >= TASK_ZOMBIE)) { + if (unlikely(p->exit_state >= EXIT_ZOMBIE)) { /* * The task resumed and then died. Let the next iteration - * catch it in TASK_ZOMBIE. Note that exit_code might + * catch it in EXIT_ZOMBIE. Note that exit_code might * already be zero here if it resumed and did _exit(0). * The task itself is dead and won't touch exit_code again; * other processors in this function are locked out. @@ -1339,23 +1343,28 @@ repeat: if (retval != 0) /* He released the lock. */ goto end; break; - case TASK_ZOMBIE: - /* - * Eligible but we cannot release it yet: - */ - if (ret == 2) - goto check_continued; - if (!likely(options & WEXITED)) - continue; - retval = wait_task_zombie( - p, (options & WNOWAIT), - infop, stat_addr, ru); - if (retval != 0) /* He released the lock. 
*/ - goto end; - break; - case TASK_DEAD: - continue; default: + // case EXIT_DEAD: + if (p->exit_state == EXIT_DEAD) + continue; + // case EXIT_ZOMBIE: + if (p->exit_state == EXIT_ZOMBIE) { + /* + * Eligible but we cannot release + * it yet: + */ + if (ret == 2) + goto check_continued; + if (!likely(options & WEXITED)) + continue; + retval = wait_task_zombie( + p, (options & WNOWAIT), + infop, stat_addr, ru); + /* He released the lock. */ + if (retval != 0) + goto end; + break; + } check_continued: if (!unlikely(options & WCONTINUED)) continue; diff --git a/kernel/fork.c b/kernel/fork.c index bd33d8a507d7..3020dccc548f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,7 +86,7 @@ EXPORT_SYMBOL(free_task); void __put_task_struct(struct task_struct *tsk) { - WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE))); + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); @@ -1062,6 +1062,7 @@ static task_t *copy_process(unsigned long clone_flags, /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; + p->exit_state = 0; /* Perform scheduler related setup */ sched_fork(p); diff --git a/kernel/power/process.c b/kernel/power/process.c index bda013de59a5..78d92dc6a1ed 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -23,8 +23,8 @@ static inline int freezeable(struct task_struct * p) { if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->state == TASK_ZOMBIE) || - (p->state == TASK_DEAD) || + (p->exit_state == EXIT_ZOMBIE) || + (p->exit_state == EXIT_DEAD) || (p->state == TASK_STOPPED) || (p->state == TASK_TRACED)) return 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 09ba057222c3..c3ac348e2368 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -163,7 +163,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) write_lock_irq(&tasklist_lock); __ptrace_unlink(child); /* .. 
and wake it up. */ - if (child->state != TASK_ZOMBIE) + if (child->exit_state != EXIT_ZOMBIE) wake_up_process(child); write_unlock_irq(&tasklist_lock); diff --git a/kernel/sched.c b/kernel/sched.c index 5dc237eb32a5..8e5e2af64509 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1329,10 +1329,10 @@ static void finish_task_switch(task_t *prev) /* * A task struct has one reference for the use as "current". - * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls - * schedule one last time. The schedule call will never return, + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, * and the scheduled task must drop that reference. - * The test for TASK_ZOMBIE must occur while the runqueue locks are + * The test for EXIT_ZOMBIE must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. @@ -2489,7 +2489,7 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { if (unlikely(in_atomic())) { printk(KERN_ERR "scheduling while atomic: " "%s/0x%08x/%d\n", @@ -2531,6 +2531,8 @@ need_resched: spin_lock_irq(&rq->lock); + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; /* * if entering off of a kernel preemption go straight * to picking the next task. @@ -3920,7 +3922,7 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk) struct runqueue *rq = cpu_rq(dead_cpu); /* Must be exiting, otherwise would be on tasklist. 
*/ - BUG_ON(tsk->state != TASK_ZOMBIE && tsk->state != TASK_DEAD); + BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. */ BUG_ON(tsk->flags & PF_DEAD); diff --git a/kernel/signal.c b/kernel/signal.c index 9f6bd72bdd82..f67390806d73 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -914,7 +914,7 @@ __group_complete_signal(int sig, struct task_struct *p) * Don't bother zombies and stopped tasks (but * SIGKILL will punch through stopped state) */ - mask = TASK_DEAD | TASK_ZOMBIE | TASK_TRACED; + mask = EXIT_DEAD | EXIT_ZOMBIE | TASK_TRACED; if (sig != SIGKILL) mask |= TASK_STOPPED; @@ -1070,7 +1070,7 @@ void zap_other_threads(struct task_struct *p) /* * Don't bother with already dead threads */ - if (t->state & (TASK_ZOMBIE|TASK_DEAD)) + if (t->exit_state & (EXIT_ZOMBIE|EXIT_DEAD)) continue; /* diff --git a/kernel/timer.c b/kernel/timer.c index 0a390a67c0f2..e3c9b5fcd52f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -805,7 +805,7 @@ static inline void do_process_times(struct task_struct *p, psecs = (p->utime += user); psecs += (p->stime += system); - if (!unlikely(p->state & (TASK_DEAD|TASK_ZOMBIE)) && + if (p->signal && !unlikely(p->state & (EXIT_DEAD|EXIT_ZOMBIE)) && psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) { /* Send SIGXCPU every second.. */ if (!(psecs % HZ)) -- cgit v1.2.3