author    Greg Kroah-Hartman <greg@kroah.com>    2004-10-18 18:03:51 -0700
committer Greg Kroah-Hartman <greg@kroah.com>    2004-10-18 18:03:51 -0700
commit    bffe01870598b7a0a77073e25ee94e026bc98e6b (patch)
tree      f5f5b65fec1239b18a4c2634807ff5d7e5729de9 /kernel
parent    23aebb6f8755121394ef088d84a7fa483b444aa9 (diff)
parent    a4946826c30c56a5830326552a395c5b6afc13ef (diff)
merge
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile         |    1
-rw-r--r--  kernel/acct.c           |   17
-rw-r--r--  kernel/capability.c     |    1
-rw-r--r--  kernel/cpu.c            |   21
-rw-r--r--  kernel/exec_domain.c    |    1
-rw-r--r--  kernel/exit.c           |  205
-rw-r--r--  kernel/fork.c           |   29
-rw-r--r--  kernel/irq/Makefile     |    4
-rw-r--r--  kernel/irq/autoprobe.c  |  188
-rw-r--r--  kernel/irq/handle.c     |  204
-rw-r--r--  kernel/irq/internals.h  |   18
-rw-r--r--  kernel/irq/manage.c     |  347
-rw-r--r--  kernel/irq/proc.c       |  156
-rw-r--r--  kernel/irq/spurious.c   |   96
-rw-r--r--  kernel/itimer.c         |    1
-rw-r--r--  kernel/kprobes.c        |    3
-rw-r--r--  kernel/panic.c          |    1
-rw-r--r--  kernel/posix-timers.c   |    6
-rw-r--r--  kernel/power/console.c  |    3
-rw-r--r--  kernel/power/disk.c     |   18
-rw-r--r--  kernel/power/process.c  |    4
-rw-r--r--  kernel/power/swsusp.c   |   29
-rw-r--r--  kernel/printk.c         |    1
-rw-r--r--  kernel/profile.c        |    4
-rw-r--r--  kernel/ptrace.c         |    7
-rw-r--r--  kernel/sched.c          |  578
-rw-r--r--  kernel/signal.c         |    9
-rw-r--r--  kernel/softirq.c        |   12
-rw-r--r--  kernel/sys.c            |   32
-rw-r--r--  kernel/sysctl.c         |    1
-rw-r--r--  kernel/time.c           |    1
-rw-r--r--  kernel/timer.c          |   30
32 files changed, 1512 insertions, 516 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 93155e52bb54..c4337751cee7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index daf23c4efab4..15ab0324139d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -53,6 +53,7 @@
#include <linux/vfs.h>
#include <linux/jiffies.h>
#include <linux/times.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
@@ -384,6 +385,8 @@ static void do_acct_process(long exitcode, struct file *file)
unsigned long vsize;
unsigned long flim;
u64 elapsed;
+ u64 run_time;
+ struct timespec uptime;
/*
* First check to see if there is enough free_space to continue
@@ -401,7 +404,13 @@ static void do_acct_process(long exitcode, struct file *file)
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
- elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time);
+ /* calculate run_time in nsec*/
+ do_posix_clock_monotonic_gettime(&uptime);
+ run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
+ run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
+ + current->start_time.tv_nsec;
+ /* convert nsec -> AHZ */
+ elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION==3
ac.ac_etime = encode_float(elapsed);
#else
@@ -480,11 +489,11 @@ static void do_acct_process(long exitcode, struct file *file)
/*
* Accounting records are not subject to resource limits.
*/
- flim = current->rlim[RLIMIT_FSIZE].rlim_cur;
- current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
file->f_op->write(file, (char *)&ac,
sizeof(acct_t), &file->f_pos);
- current->rlim[RLIMIT_FSIZE].rlim_cur = flim;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
}
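The acct.c hunk above replaces the jiffies-based elapsed-time calculation with a monotonic-clock delta in nanoseconds, converted to accounting ticks. A minimal sketch of the same arithmetic (not from the patch), assuming AHZ is 100 and using plain struct timespec values instead of the kernel helpers:

#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ULL
#define AHZ 100			/* assumed accounting clock rate */

/* Elapsed run time in AHZ ticks, given two monotonic-clock timestamps. */
static uint64_t elapsed_ahz(struct timespec start, struct timespec now)
{
	uint64_t run_time = (uint64_t)now.tv_sec * NSEC_PER_SEC + now.tv_nsec;

	run_time -= (uint64_t)start.tv_sec * NSEC_PER_SEC + start.tv_nsec;
	return run_time / (NSEC_PER_SEC / AHZ);	/* nsec -> AHZ ticks, roughly what nsec_to_AHZ() does */
}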
diff --git a/kernel/capability.c b/kernel/capability.c
index 7e864e2ccf6a..7800a5066c0f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/security.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 083521327e64..719ba42ff5ee 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -61,13 +61,13 @@ static inline void check_for_tasks(int cpu)
* cpu' with certain environment variables set. */
static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action)
{
- char *argv[3], *envp[5], cpu_str[12], action_str[32];
+ char *argv[3], *envp[6], cpu_str[12], action_str[32], devpath_str[40];
int i;
sprintf(cpu_str, "CPU=%d", cpu);
sprintf(action_str, "ACTION=%s", action);
- /* FIXME: Add DEVPATH. --RR */
-
+ sprintf(devpath_str, "DEVPATH=devices/system/cpu/cpu%d", cpu);
+
i = 0;
argv[i++] = hotplug_path;
argv[i++] = "cpu";
@@ -79,6 +79,7 @@ static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action)
envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[i++] = cpu_str;
envp[i++] = action_str;
+ envp[i++] = devpath_str;
envp[i] = NULL;
return call_usermodehelper(argv[0], argv, envp, 0);
@@ -123,6 +124,15 @@ int cpu_down(unsigned int cpu)
goto out;
}
+ err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+ (void *)(long)cpu);
+ if (err == NOTIFY_BAD) {
+ printk("%s: attempt to take down CPU %u failed\n",
+ __FUNCTION__, cpu);
+ err = -EINVAL;
+ goto out;
+ }
+
/* Ensure that we are not runnable on dying cpu */
old_allowed = current->cpus_allowed;
tmp = CPU_MASK_ALL;
@@ -131,6 +141,11 @@ int cpu_down(unsigned int cpu)
p = __stop_machine_run(take_cpu_down, NULL, cpu);
if (IS_ERR(p)) {
+ /* CPU didn't die: tell everyone. Can't complain. */
+ if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+ (void *)(long)cpu) == NOTIFY_BAD)
+ BUG();
+
err = PTR_ERR(p);
goto out_allowed;
}
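The cpu.c change wires CPU_DOWN_PREPARE and CPU_DOWN_FAILED notifications around the actual takedown, so subscribers can veto or roll back a CPU unplug. A minimal sketch of such a subscriber (not from the patch), assuming the register_cpu_notifier() interface of that era; my_cpu_callback() and my_driver_can_lose_cpu() are hypothetical names:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_DOWN_PREPARE:
		/* Returning NOTIFY_BAD here makes cpu_down() fail with -EINVAL. */
		if (!my_driver_can_lose_cpu(cpu))	/* hypothetical check */
			return NOTIFY_BAD;
		break;
	case CPU_DOWN_FAILED:
		/* The CPU survived after all; undo any CPU_DOWN_PREPARE work. */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};

/* register_cpu_notifier(&my_cpu_nb); typically done from module init */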
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 5fba35304459..ad3e5d54e119 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/sched.h>
+#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 6860b509dd11..55d853392524 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,6 +24,7 @@
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -75,7 +76,7 @@ repeat:
*/
zap_leader = 0;
leader = p->group_leader;
- if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) {
+ if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
BUG_ON(leader->exit_signal == -1);
do_notify_parent(leader, leader->exit_signal);
/*
@@ -157,7 +158,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
if (p == ignored_task
- || p->state >= TASK_ZOMBIE
+ || p->exit_state >= EXIT_ZOMBIE
|| p->real_parent->pid == 1)
continue;
if (process_group(p->real_parent) != pgrp
@@ -237,7 +238,8 @@ void reparent_to_init(void)
/* rt_priority? */
/* signals? */
security_task_reparent_to_init(current);
- memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
+ memcpy(current->signal->rlim, init_task.signal->rlim,
+ sizeof(current->signal->rlim));
atomic_inc(&(INIT_USER->__count));
switch_uid(INIT_USER);
@@ -517,7 +519,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re
* Make sure we're not reparenting to ourselves and that
* the parent is not a zombie.
*/
- BUG_ON(p == reaper || reaper->state >= TASK_ZOMBIE);
+ BUG_ON(p == reaper || reaper->state >= EXIT_ZOMBIE || reaper->exit_state >= EXIT_ZOMBIE);
p->real_parent = reaper;
if (p->parent == p->real_parent)
BUG();
@@ -552,7 +554,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
*/
- if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+ if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
else if (p->state == TASK_TRACED) {
@@ -600,7 +602,7 @@ static inline void forget_original_parent(struct task_struct * father,
reaper = child_reaper;
break;
}
- } while (reaper->state >= TASK_ZOMBIE);
+ } while (reaper->exit_state >= EXIT_ZOMBIE);
/*
* There are only two places where our children can be:
@@ -626,7 +628,7 @@ static inline void forget_original_parent(struct task_struct * father,
} else {
/* reparent ptraced task to its real parent */
__ptrace_unlink (p);
- if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+ if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
}
@@ -637,7 +639,7 @@ static inline void forget_original_parent(struct task_struct * father,
* zombie forever since we prevented it from self-reap itself
* while it was being traced by us, to be able to see it in wait4.
*/
- if (unlikely(ptrace && p->state == TASK_ZOMBIE && p->exit_signal == -1))
+ if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
list_add(&p->ptrace_list, to_release);
}
list_for_each_safe(_p, _n, &father->ptrace_children) {
@@ -750,10 +752,10 @@ static void exit_notify(struct task_struct *tsk)
do_notify_parent(tsk, SIGCHLD);
}
- state = TASK_ZOMBIE;
+ state = EXIT_ZOMBIE;
if (tsk->exit_signal == -1 && tsk->ptrace == 0)
- state = TASK_DEAD;
- tsk->state = state;
+ state = EXIT_DEAD;
+ tsk->exit_state = state;
/*
* Clear these here so that update_process_times() won't try to deliver
@@ -761,7 +763,6 @@ static void exit_notify(struct task_struct *tsk)
*/
tsk->it_virt_value = 0;
tsk->it_prof_value = 0;
- tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY;
write_unlock_irq(&tasklist_lock);
@@ -772,7 +773,7 @@ static void exit_notify(struct task_struct *tsk)
}
/* If the process is dead, release it - nobody will wait for it */
- if (state == TASK_DEAD)
+ if (state == EXIT_DEAD)
release_task(tsk);
/* PF_DEAD causes final put_task_struct after we schedule. */
@@ -829,6 +830,8 @@ asmlinkage NORET_TYPE void do_exit(long code)
mpol_free(tsk->mempolicy);
tsk->mempolicy = NULL;
#endif
+
+ BUG_ON(!(current->flags & PF_DEAD));
schedule();
BUG();
/* Avoid "noreturn function does return". */
@@ -972,7 +975,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
}
/*
- * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold
+ * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
@@ -991,7 +994,7 @@ static int wait_task_zombie(task_t *p, int noreap,
int exit_code = p->exit_code;
int why, status;
- if (unlikely(p->state != TASK_ZOMBIE))
+ if (unlikely(p->exit_state != EXIT_ZOMBIE))
return 0;
if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
return 0;
@@ -1012,9 +1015,9 @@ static int wait_task_zombie(task_t *p, int noreap,
* Try to move the task's state to DEAD
* only one thread is allowed to do this:
*/
- state = xchg(&p->state, TASK_DEAD);
- if (state != TASK_ZOMBIE) {
- BUG_ON(state != TASK_DEAD);
+ state = xchg(&p->exit_state, EXIT_DEAD);
+ if (state != EXIT_ZOMBIE) {
+ BUG_ON(state != EXIT_DEAD);
return 0;
}
if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) {
@@ -1059,7 +1062,7 @@ static int wait_task_zombie(task_t *p, int noreap,
/*
* Now we are sure this task is interesting, and no other
- * thread can reap it because we set its state to TASK_DEAD.
+ * thread can reap it because we set its state to EXIT_DEAD.
*/
read_unlock(&tasklist_lock);
@@ -1091,7 +1094,8 @@ static int wait_task_zombie(task_t *p, int noreap,
if (!retval && infop)
retval = put_user(p->uid, &infop->si_uid);
if (retval) {
- p->state = TASK_ZOMBIE;
+ // TODO: is this safe?
+ p->exit_state = EXIT_ZOMBIE;
return retval;
}
retval = p->pid;
@@ -1100,7 +1104,8 @@ static int wait_task_zombie(task_t *p, int noreap,
/* Double-check with lock held. */
if (p->real_parent != p->parent) {
__ptrace_unlink(p);
- p->state = TASK_ZOMBIE;
+ // TODO: is this safe?
+ p->exit_state = EXIT_ZOMBIE;
/*
* If this is not a detached task, notify the parent.
* If it's still not detached after that, don't release
@@ -1171,13 +1176,13 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
/*
* This uses xchg to be atomic with the thread resuming and setting
* it. It must also be done with the write lock held to prevent a
- * race with the TASK_ZOMBIE case.
+ * race with the EXIT_ZOMBIE case.
*/
exit_code = xchg(&p->exit_code, 0);
- if (unlikely(p->state >= TASK_ZOMBIE)) {
+ if (unlikely(p->exit_state >= EXIT_ZOMBIE)) {
/*
* The task resumed and then died. Let the next iteration
- * catch it in TASK_ZOMBIE. Note that exit_code might
+ * catch it in EXIT_ZOMBIE. Note that exit_code might
* already be zero here if it resumed and did _exit(0).
* The task itself is dead and won't touch exit_code again;
* other processors in this function are locked out.
@@ -1228,6 +1233,74 @@ bail_ref:
return retval;
}
+/*
+ * Handle do_wait work for one task in a live, non-stopped state.
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_continued(task_t *p, int noreap,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int retval;
+ pid_t pid;
+ uid_t uid;
+
+ if (unlikely(!p->signal))
+ return 0;
+
+ if (p->signal->stop_state >= 0)
+ return 0;
+
+ spin_lock_irq(&p->sighand->siglock);
+ if (p->signal->stop_state >= 0) { /* Re-check with the lock held. */
+ spin_unlock_irq(&p->sighand->siglock);
+ return 0;
+ }
+ if (!noreap)
+ p->signal->stop_state = 0;
+ spin_unlock_irq(&p->sighand->siglock);
+
+ pid = p->pid;
+ uid = p->uid;
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+
+ if (!infop) {
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ put_task_struct(p);
+ if (!retval && stat_addr)
+ retval = put_user(0xffff, stat_addr);
+ if (!retval)
+ retval = p->pid;
+ } else {
+ retval = wait_noreap_copyout(p, pid, uid,
+ CLD_CONTINUED, SIGCONT,
+ infop, ru);
+ BUG_ON(retval == 0);
+ }
+
+ return retval;
+}
+
+
+static inline int my_ptrace_child(struct task_struct *p)
+{
+ if (!(p->ptrace & PT_PTRACED))
+ return 0;
+ if (!(p->ptrace & PT_ATTACHED))
+ return 1;
+ /*
+ * This child was PTRACE_ATTACH'd. We should be seeing it only if
+ * we are the attacher. If we are the real parent, this is a race
+ * inside ptrace_attach. It is waiting for the tasklist_lock,
+ * which we have to switch the parent links, but has already set
+ * the flags in p->ptrace.
+ */
+ return (p->parent != p->real_parent);
+}
+
static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1256,12 +1329,12 @@ repeat:
switch (p->state) {
case TASK_TRACED:
- if (!(p->ptrace & PT_PTRACED))
+ if (!my_ptrace_child(p))
continue;
/*FALLTHROUGH*/
case TASK_STOPPED:
if (!(options & WUNTRACED) &&
- !(p->ptrace & PT_PTRACED))
+ !my_ptrace_child(p))
continue;
retval = wait_task_stopped(p, ret == 2,
(options & WNOWAIT),
@@ -1270,47 +1343,36 @@ repeat:
if (retval != 0) /* He released the lock. */
goto end;
break;
- case TASK_ZOMBIE:
- /*
- * Eligible but we cannot release it yet:
- */
- if (ret == 2)
- goto check_continued;
- if (!likely(options & WEXITED))
- continue;
- retval = wait_task_zombie(
- p, (options & WNOWAIT),
- infop, stat_addr, ru);
- if (retval != 0) /* He released the lock. */
- goto end;
- break;
- case TASK_DEAD:
- continue;
default:
+ // case EXIT_DEAD:
+ if (p->exit_state == EXIT_DEAD)
+ continue;
+ // case EXIT_ZOMBIE:
+ if (p->exit_state == EXIT_ZOMBIE) {
+ /*
+ * Eligible but we cannot release
+ * it yet:
+ */
+ if (ret == 2)
+ goto check_continued;
+ if (!likely(options & WEXITED))
+ continue;
+ retval = wait_task_zombie(
+ p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ /* He released the lock. */
+ if (retval != 0)
+ goto end;
+ break;
+ }
check_continued:
if (!unlikely(options & WCONTINUED))
continue;
- if (unlikely(!p->signal))
- continue;
- spin_lock_irq(&p->sighand->siglock);
- if (p->signal->stop_state < 0) {
- pid_t pid;
- uid_t uid;
-
- if (!(options & WNOWAIT))
- p->signal->stop_state = 0;
- spin_unlock_irq(&p->sighand->siglock);
- pid = p->pid;
- uid = p->uid;
- get_task_struct(p);
- read_unlock(&tasklist_lock);
- retval = wait_noreap_copyout(p, pid,
- uid, CLD_CONTINUED,
- SIGCONT, infop, ru);
- BUG_ON(retval == 0);
+ retval = wait_task_continued(
+ p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ if (retval != 0) /* He released the lock. */
goto end;
- }
- spin_unlock_irq(&p->sighand->siglock);
break;
}
}
@@ -1376,6 +1438,8 @@ asmlinkage long sys_waitid(int which, pid_t pid,
struct siginfo __user *infop, int options,
struct rusage __user *ru)
{
+ long ret;
+
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
return -EINVAL;
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
@@ -1398,15 +1462,26 @@ asmlinkage long sys_waitid(int which, pid_t pid,
return -EINVAL;
}
- return do_wait(pid, options, infop, NULL, ru);
+ ret = do_wait(pid, options, infop, NULL, ru);
+
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
}
asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
int options, struct rusage __user *ru)
{
- if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
+ long ret;
+
+ if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
- return do_wait(pid, options | WEXITED, NULL, stat_addr, ru);
+ ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru);
+
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
}
#ifdef __ARCH_WANT_SYS_WAITPID
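Among other things, the exit.c changes above route WCONTINUED reporting through the new wait_task_continued() helper and accept the flag in sys_wait4. A small user-space sketch (not from the patch) of a parent observing a stop and a SIGCONT resume through the standard waitpid() wrapper and POSIX status macros:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0) {			/* child: stop itself, then idle */
		raise(SIGSTOP);
		pause();
		return 0;
	}

	waitpid(pid, &status, WUNTRACED);	/* reports the stop */
	if (WIFSTOPPED(status))
		printf("child %d stopped\n", pid);

	kill(pid, SIGCONT);
	waitpid(pid, &status, WCONTINUED);	/* reports the resume */
	if (WIFCONTINUED(status))
		printf("child %d continued\n", pid);

	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);		/* reap the child */
	return 0;
}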
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c7ba481c9a5..3020dccc548f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(free_task);
void __put_task_struct(struct task_struct *tsk)
{
- WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE)));
+ WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
@@ -249,8 +249,8 @@ void __init fork_init(unsigned long mempages)
if(max_threads < 20)
max_threads = 20;
- init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
- init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
static struct task_struct *dup_task_struct(struct task_struct *orig)
@@ -762,8 +762,17 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
- if (f)
+ if (f) {
get_file(f);
+ } else {
+ /*
+ * The fd may be claimed in the fd bitmap but not yet
+ * instantiated in the files array if a sibling thread
+ * is partway through open(). So make sure that this
+ * fd is available to the new process.
+ */
+ FD_CLR(open_files - i, newf->open_fds);
+ }
*new_fds++ = f;
}
spin_unlock(&oldf->file_lock);
@@ -872,6 +881,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+ task_lock(current->group_leader);
+ memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
+ task_unlock(current->group_leader);
+
return 0;
}
@@ -941,7 +954,7 @@ static task_t *copy_process(unsigned long clone_flags,
retval = -EAGAIN;
if (atomic_read(&p->user->processes) >=
- p->rlim[RLIMIT_NPROC].rlim_cur) {
+ p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
p->user != &root_user)
goto bad_fork_free;
@@ -992,7 +1005,7 @@ static task_t *copy_process(unsigned long clone_flags,
p->utime = p->stime = 0;
p->lock_depth = -1; /* -1 = no lock */
- p->start_time = get_jiffies_64();
+ do_posix_clock_monotonic_gettime(&p->start_time);
p->security = NULL;
p->io_context = NULL;
p->io_wait = NULL;
@@ -1049,6 +1062,7 @@ static task_t *copy_process(unsigned long clone_flags,
/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
+ p->exit_state = 0;
/* Perform scheduler related setup */
sched_fork(p);
@@ -1146,7 +1160,8 @@ fork_out:
bad_fork_cleanup_namespace:
exit_namespace(p);
bad_fork_cleanup_mm:
- mmput(p->mm);
+ if (p->mm)
+ mmput(p->mm);
bad_fork_cleanup_signal:
exit_signal(p);
bad_fork_cleanup_sighand:
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
new file mode 100644
index 000000000000..66d3be6e4da8
--- /dev/null
+++ b/kernel/irq/Makefile
@@ -0,0 +1,4 @@
+
+obj-y := autoprobe.o handle.o manage.o spurious.o
+obj-$(CONFIG_PROC_FS) += proc.o
+
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
new file mode 100644
index 000000000000..16818726cd21
--- /dev/null
+++ b/kernel/irq/autoprobe.c
@@ -0,0 +1,188 @@
+/*
+ * linux/kernel/irq/autoprobe.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the interrupt probing code and driver APIs.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+/*
+ * Autodetection depends on the fact that any interrupt that
+ * comes in on to an unassigned handler will get stuck with
+ * "IRQ_WAITING" cleared and the interrupt disabled.
+ */
+static DECLARE_MUTEX(probe_sem);
+
+/**
+ * probe_irq_on - begin an interrupt autodetect
+ *
+ * Commence probing for an interrupt. The interrupts are scanned
+ * and a mask of potential interrupt lines is returned.
+ *
+ */
+unsigned long probe_irq_on(void)
+{
+ unsigned long val, delay;
+ irq_desc_t *desc;
+ unsigned int i;
+
+ down(&probe_sem);
+ /*
+ * something may have generated an irq long ago and we want to
+ * flush such a longstanding irq before considering it as spurious.
+ */
+ for (i = NR_IRQS-1; i > 0; i--) {
+ desc = irq_desc + i;
+
+ spin_lock_irq(&desc->lock);
+ if (!irq_desc[i].action)
+ irq_desc[i].handler->startup(i);
+ spin_unlock_irq(&desc->lock);
+ }
+
+ /* Wait for longstanding interrupts to trigger. */
+ for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
+ /* about 20ms delay */ barrier();
+
+ /*
+ * enable any unassigned irqs
+ * (we must startup again here because if a longstanding irq
+ * happened in the previous stage, it may have masked itself)
+ */
+ for (i = NR_IRQS-1; i > 0; i--) {
+ desc = irq_desc + i;
+
+ spin_lock_irq(&desc->lock);
+ if (!desc->action) {
+ desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
+ if (desc->handler->startup(i))
+ desc->status |= IRQ_PENDING;
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ /*
+ * Wait for spurious interrupts to trigger
+ */
+ for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
+ /* about 100ms delay */ barrier();
+
+ /*
+ * Now filter out any obviously spurious interrupts
+ */
+ val = 0;
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ /* It triggered already - consider it spurious. */
+ if (!(status & IRQ_WAITING)) {
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ } else
+ if (i < 32)
+ val |= 1 << i;
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ return val;
+}
+
+EXPORT_SYMBOL(probe_irq_on);
+
+/**
+ * probe_irq_mask - scan a bitmap of interrupt lines
+ * @val: mask of interrupts to consider
+ *
+ * Scan the interrupt lines and return a bitmap of active
+ * autodetect interrupts. The interrupt probe logic state
+ * is then returned to its previous value.
+ *
+ * Note: we need to scan all the irq's even though we will
+ * only return autodetect irq numbers - just so that we reset
+ * them all to a known state.
+ */
+unsigned int probe_irq_mask(unsigned long val)
+{
+ unsigned int mask;
+ int i;
+
+ mask = 0;
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ if (i < 16 && !(status & IRQ_WAITING))
+ mask |= 1 << i;
+
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+ up(&probe_sem);
+
+ return mask & val;
+}
+
+/**
+ * probe_irq_off - end an interrupt autodetect
+ * @val: mask of potential interrupts (unused)
+ *
+ * Scans the unused interrupt lines and returns the line which
+ * appears to have triggered the interrupt. If no interrupt was
+ * found then zero is returned. If more than one interrupt is
+ * found then minus the first candidate is returned to indicate
+ * there is doubt.
+ *
+ * The interrupt probe logic state is returned to its previous
+ * value.
+ *
+ * BUGS: When used in a module (which arguably shouldn't happen)
+ * nothing prevents two IRQ probe callers from overlapping. The
+ * results of this are non-optimal.
+ */
+int probe_irq_off(unsigned long val)
+{
+ int i, irq_found = 0, nr_irqs = 0;
+
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ if (!(status & IRQ_WAITING)) {
+ if (!nr_irqs)
+ irq_found = i;
+ nr_irqs++;
+ }
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+ up(&probe_sem);
+
+ if (nr_irqs > 1)
+ irq_found = -irq_found;
+ return irq_found;
+}
+
+EXPORT_SYMBOL(probe_irq_off);
+
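The probe_irq_on()/probe_irq_off() pair above implements the classic IRQ autodetection sequence for drivers that cannot otherwise discover their line. A minimal usage sketch (not from the patch); mydrv_probe_irq() and my_board_trigger_interrupt() are hypothetical, the latter standing in for whatever device-specific poke makes the card raise its interrupt:

#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/kernel.h>

static int mydrv_probe_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* arm autodetection on unused lines */
	my_board_trigger_interrupt();	/* hypothetical: make the card fire */
	mdelay(10);			/* give the interrupt time to arrive */
	irq = probe_irq_off(mask);	/* >0 = the line, 0 = none, <0 = ambiguous */

	if (irq <= 0)
		printk(KERN_WARNING "mydrv: IRQ autodetect failed (%d)\n", irq);
	return irq;
}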
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
new file mode 100644
index 000000000000..6f0ebc254e3e
--- /dev/null
+++ b/kernel/irq/handle.c
@@ -0,0 +1,204 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the core interrupt handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/*
+ * Linux has a controller-independent interrupt architecture.
+ * Every controller has a 'controller-template', that is used
+ * by the main code to do the right thing. Each driver-visible
+ * interrupt source is transparently wired to the appropriate
+ * controller. Thus drivers need not be aware of the
+ * interrupt-controller.
+ *
+ * The code is designed to be easily extended with new/different
+ * interrupt controllers, without having to do assembly magic or
+ * having to touch the generic code.
+ *
+ * Controller mappings for all interrupt sources:
+ */
+irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+ [0 ... NR_IRQS-1] = {
+ .handler = &no_irq_type,
+ .lock = SPIN_LOCK_UNLOCKED
+ }
+};
+
+/*
+ * Generic 'no controller' code
+ */
+static void end_none(unsigned int irq) { }
+static void enable_none(unsigned int irq) { }
+static void disable_none(unsigned int irq) { }
+static void shutdown_none(unsigned int irq) { }
+static unsigned int startup_none(unsigned int irq) { return 0; }
+
+static void ack_none(unsigned int irq)
+{
+ /*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themself.
+ */
+ ack_bad_irq(irq);
+}
+
+struct hw_interrupt_type no_irq_type = {
+ .typename = "none",
+ .startup = startup_none,
+ .shutdown = shutdown_none,
+ .enable = enable_none,
+ .disable = disable_none,
+ .ack = ack_none,
+ .end = end_none,
+ .set_affinity = NULL
+};
+
+/*
+ * Special, empty irq handler:
+ */
+irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
+{
+ return IRQ_NONE;
+}
+
+/*
+ * Exit an interrupt context. Process softirqs if needed and possible:
+ */
+void irq_exit(void)
+{
+ preempt_count() -= IRQ_EXIT_OFFSET;
+ if (!in_interrupt() && local_softirq_pending())
+ do_softirq();
+ preempt_enable_no_resched();
+}
+
+/*
+ * Have got an event to handle:
+ */
+asmlinkage int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+ struct irqaction *action)
+{
+ int ret, retval = 0, status = 0;
+
+ if (!(action->flags & SA_INTERRUPT))
+ local_irq_enable();
+
+ do {
+ ret = action->handler(irq, action->dev_id, regs);
+ if (ret == IRQ_HANDLED)
+ status |= action->flags;
+ retval |= ret;
+ action = action->next;
+ } while (action);
+
+ if (status & SA_SAMPLE_RANDOM)
+ add_interrupt_randomness(irq);
+ local_irq_disable();
+
+ return retval;
+}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+asmlinkage unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ struct irqaction * action;
+ unsigned int status;
+
+ kstat_this_cpu.irqs[irq]++;
+ if (desc->status & IRQ_PER_CPU) {
+ irqreturn_t action_ret;
+
+ /*
+ * No locking required for CPU-local interrupts:
+ */
+ desc->handler->ack(irq);
+ action_ret = handle_IRQ_event(irq, regs, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+ desc->handler->end(irq);
+ return 1;
+ }
+
+ spin_lock(&desc->lock);
+ desc->handler->ack(irq);
+ /*
+ * REPLAY is when Linux resends an IRQ that was dropped earlier
+ * WAITING is used by probe to mark irqs that are being tested
+ */
+ status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
+ status |= IRQ_PENDING; /* we _want_ to handle it */
+
+ /*
+ * If the IRQ is disabled for whatever reason, we cannot
+ * use the action we have.
+ */
+ action = NULL;
+ if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
+ action = desc->action;
+ status &= ~IRQ_PENDING; /* we commit to handling */
+ status |= IRQ_INPROGRESS; /* we are handling it */
+ }
+ desc->status = status;
+
+ /*
+ * If there is no IRQ handler or it was disabled, exit early.
+ * Since we set PENDING, if another processor is handling
+ * a different instance of this same irq, the other processor
+ * will take care of it.
+ */
+ if (unlikely(!action))
+ goto out;
+
+ /*
+ * Edge triggered interrupts need to remember
+ * pending events.
+ * This applies to any hw interrupts that allow a second
+ * instance of the same irq to arrive while we are in do_IRQ
+ * or in the handler. But the code here only handles the _second_
+ * instance of the irq, not the third or fourth. So it is mostly
+ * useful for irq hardware that does not mask cleanly in an
+ * SMP environment.
+ */
+ for (;;) {
+ irqreturn_t action_ret;
+
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, regs, action);
+
+ spin_lock(&desc->lock);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+ if (likely(!(desc->status & IRQ_PENDING)))
+ break;
+ desc->status &= ~IRQ_PENDING;
+ }
+ desc->status &= ~IRQ_INPROGRESS;
+
+out:
+ /*
+ * The ->end() handler has to deal with interrupts which got
+ * disabled while the handler was running.
+ */
+ desc->handler->end(irq);
+ spin_unlock(&desc->lock);
+
+ return 1;
+}
+
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
new file mode 100644
index 000000000000..46feba630266
--- /dev/null
+++ b/kernel/irq/internals.h
@@ -0,0 +1,18 @@
+/*
+ * IRQ subsystem internal functions and variables:
+ */
+
+extern int noirqdebug;
+
+#ifdef CONFIG_PROC_FS
+extern void register_irq_proc(unsigned int irq);
+extern void register_handler_proc(unsigned int irq, struct irqaction *action);
+extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
+#else
+static inline void register_irq_proc(unsigned int irq) { }
+static inline void register_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+static inline void unregister_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+#endif
+
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
new file mode 100644
index 000000000000..a164fe243338
--- /dev/null
+++ b/kernel/irq/manage.c
@@ -0,0 +1,347 @@
+/*
+ * linux/kernel/irq/manage.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains driver APIs to the irq subsystem.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_SMP
+
+/**
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ *
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+ while (desc->status & IRQ_INPROGRESS)
+ cpu_relax();
+}
+
+EXPORT_SYMBOL(synchronize_irq);
+
+#endif
+
+/**
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ *
+ * This function may be called from IRQ context.
+ */
+void disable_irq_nosync(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (!desc->depth++) {
+ desc->status |= IRQ_DISABLED;
+ desc->handler->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+EXPORT_SYMBOL(disable_irq_nosync);
+
+/**
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void disable_irq(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+
+ disable_irq_nosync(irq);
+ if (desc->action)
+ synchronize_irq(irq);
+}
+
+EXPORT_SYMBOL(disable_irq);
+
+/**
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
+ *
+ * Undoes the effect of one call to disable_irq(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ *
+ * This function may be called from IRQ context.
+ */
+void enable_irq(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ switch (desc->depth) {
+ case 0:
+ WARN_ON(1);
+ break;
+ case 1: {
+ unsigned int status = desc->status & ~IRQ_DISABLED;
+
+ desc->status = status;
+ if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+ desc->status = status | IRQ_REPLAY;
+ hw_resend_irq(desc->handler,irq);
+ }
+ desc->handler->enable(irq);
+ /* fall-through */
+ }
+ default:
+ desc->depth--;
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+EXPORT_SYMBOL(enable_irq);
+
+/*
+ * Internal function that tells the architecture code whether a
+ * particular irq has been exclusively allocated or is available
+ * for driver use.
+ */
+int can_request_irq(unsigned int irq, unsigned long irqflags)
+{
+ struct irqaction *action;
+
+ if (irq >= NR_IRQS)
+ return 0;
+
+ action = irq_desc[irq].action;
+ if (action)
+ if (irqflags & action->flags & SA_SHIRQ)
+ action = NULL;
+
+ return !action;
+}
+
+/*
+ * Internal function to register an irqaction - typically used to
+ * allocate special interrupts that are part of the architecture.
+ */
+int setup_irq(unsigned int irq, struct irqaction * new)
+{
+ struct irq_desc *desc = irq_desc + irq;
+ struct irqaction *old, **p;
+ unsigned long flags;
+ int shared = 0;
+
+ if (desc->handler == &no_irq_type)
+ return -ENOSYS;
+ /*
+ * Some drivers like serial.c use request_irq() heavily,
+ * so we have to be careful not to interfere with a
+ * running system.
+ */
+ if (new->flags & SA_SAMPLE_RANDOM) {
+ /*
+ * This function might sleep, we want to call it first,
+ * outside of the atomic block.
+ * Yes, this might clear the entropy pool if the wrong
+ * driver is attempted to be loaded, without actually
+ * installing a new handler, but is this really a problem,
+ * only the sysadmin is able to do this.
+ */
+ rand_initialize_irq(irq);
+ }
+
+ /*
+ * The following block of code has to be executed atomically
+ */
+ spin_lock_irqsave(&desc->lock,flags);
+ p = &desc->action;
+ if ((old = *p) != NULL) {
+ /* Can't share interrupts unless both agree to */
+ if (!(old->flags & new->flags & SA_SHIRQ)) {
+ spin_unlock_irqrestore(&desc->lock,flags);
+ return -EBUSY;
+ }
+
+ /* add new interrupt at end of irq queue */
+ do {
+ p = &old->next;
+ old = *p;
+ } while (old);
+ shared = 1;
+ }
+
+ *p = new;
+
+ if (!shared) {
+ desc->depth = 0;
+ desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
+ IRQ_WAITING | IRQ_INPROGRESS);
+ if (desc->handler->startup)
+ desc->handler->startup(irq);
+ else
+ desc->handler->enable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock,flags);
+
+ new->irq = irq;
+ register_irq_proc(irq);
+ new->dir = NULL;
+ register_handler_proc(irq, new);
+
+ return 0;
+}
+
+/**
+ * free_irq - free an interrupt
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove an interrupt handler. The handler is removed and if the
+ * interrupt line is no longer in use by any driver it is disabled.
+ * On a shared IRQ the caller must ensure the interrupt is disabled
+ * on the card it drives before calling this function. The function
+ * does not return until any executing interrupts for this IRQ
+ * have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc;
+ struct irqaction **p;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS)
+ return;
+
+ desc = irq_desc + irq;
+ spin_lock_irqsave(&desc->lock,flags);
+ p = &desc->action;
+ for (;;) {
+ struct irqaction * action = *p;
+
+ if (action) {
+ struct irqaction **pp = p;
+
+ p = &action->next;
+ if (action->dev_id != dev_id)
+ continue;
+
+ /* Found it - now remove it from the list of entries */
+ *pp = action->next;
+ if (!desc->action) {
+ desc->status |= IRQ_DISABLED;
+ if (desc->handler->shutdown)
+ desc->handler->shutdown(irq);
+ else
+ desc->handler->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock,flags);
+ unregister_handler_proc(irq, action);
+
+ /* Make sure it's not being used on another CPU */
+ synchronize_irq(irq);
+ kfree(action);
+ return;
+ }
+ printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
+ spin_unlock_irqrestore(&desc->lock,flags);
+ return;
+ }
+}
+
+EXPORT_SYMBOL(free_irq);
+
+/**
+ * request_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. From the point this
+ * call is made your handler function may be invoked. Since
+ * your handler function must clear any interrupt the board
+ * raises, you must take care both to initialise your hardware
+ * and to set up the interrupt handler in the right order.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id
+ * as this is required when freeing the interrupt.
+ *
+ * Flags:
+ *
+ * SA_SHIRQ Interrupt is shared
+ * SA_INTERRUPT Disable local interrupts while processing
+ * SA_SAMPLE_RANDOM The interrupt can be used for entropy
+ *
+ */
+int request_irq(unsigned int irq,
+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
+ unsigned long irqflags, const char * devname, void *dev_id)
+{
+ struct irqaction * action;
+ int retval;
+
+ /*
+ * Sanity-check: shared interrupts must pass in a real dev-ID,
+ * otherwise we'll have trouble later trying to figure out
+ * which interrupt is which (messes up the interrupt freeing
+ * logic etc).
+ */
+ if ((irqflags & SA_SHIRQ) && !dev_id)
+ return -EINVAL;
+ if (irq >= NR_IRQS)
+ return -EINVAL;
+ if (!handler)
+ return -EINVAL;
+
+ action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags;
+ cpus_clear(action->mask);
+ action->name = devname;
+ action->next = NULL;
+ action->dev_id = dev_id;
+
+ retval = setup_irq(irq, action);
+ if (retval)
+ kfree(action);
+
+ return retval;
+}
+
+EXPORT_SYMBOL(request_irq);
+
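The manage.c file above carries the driver-facing request_irq()/free_irq() API. A minimal sketch of a driver using it as documented (not from the patch), with the era's handler signature and SA_SHIRQ flag; mydrv_interrupt(), mydrv_irq_pending() and mydrv_irq_cookie are hypothetical names:

#include <linux/interrupt.h>

static int mydrv_irq_cookie;	/* any unique non-NULL pointer works as dev_id */

static irqreturn_t mydrv_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/*
	 * On a shared line every registered handler runs, so check the
	 * device's own status and bail out if the interrupt wasn't ours.
	 * mydrv_irq_pending() is hypothetical.
	 */
	if (!mydrv_irq_pending())
		return IRQ_NONE;
	/* acknowledge and handle the hardware event here */
	return IRQ_HANDLED;
}

static int mydrv_setup_irq(unsigned int irq)
{
	/* SA_SHIRQ: share the line; dev_id must be non-NULL and unique */
	return request_irq(irq, mydrv_interrupt, SA_SHIRQ, "mydrv",
			   &mydrv_irq_cookie);
}

/* teardown: free_irq(irq, &mydrv_irq_cookie); waits for running handlers */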
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
new file mode 100644
index 000000000000..2ddbe8404c9c
--- /dev/null
+++ b/kernel/irq/proc.c
@@ -0,0 +1,156 @@
+/*
+ * linux/kernel/irq/proc.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the /proc/irq/ handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+
+static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
+
+#ifdef CONFIG_SMP
+
+/*
+ * The /proc/irq/<irq>/smp_affinity values:
+ */
+static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
+
+cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
+
+static int irq_affinity_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
+
+ if (count - len < 2)
+ return -EINVAL;
+ len += sprintf(page + len, "\n");
+ return len;
+}
+
+int no_irq_affinity;
+static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
+ unsigned long count, void *data)
+{
+ unsigned int irq = (int)(long)data, full_count = count, err;
+ cpumask_t new_value, tmp;
+
+ if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
+ return -EIO;
+
+ err = cpumask_parse(buffer, count, new_value);
+ if (err)
+ return err;
+
+ /*
+ * Do not allow disabling IRQs completely - it's a too easy
+ * way to make the system unusable accidentally :-) At least
+ * one online CPU still has to be targeted.
+ */
+ cpus_and(tmp, new_value, cpu_online_map);
+ if (cpus_empty(tmp))
+ return -EINVAL;
+
+ irq_affinity[irq] = new_value;
+ irq_desc[irq].handler->set_affinity(irq,
+ cpumask_of_cpu(first_cpu(new_value)));
+
+ return full_count;
+}
+
+#endif
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+ struct irq_desc *desc = irq_desc + irq;
+ struct irqaction *action;
+
+ for (action = desc->action ; action; action = action->next)
+ if ((action != new_action) && action->name &&
+ !strcmp(new_action->name, action->name))
+ return 0;
+ return 1;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ char name [MAX_NAMELEN];
+
+ if (!irq_dir[irq] || action->dir || !action->name ||
+ !name_unique(irq, action))
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+ /* create /proc/irq/1234/handler/ */
+ action->dir = proc_mkdir(name, irq_dir[irq]);
+}
+
+#undef MAX_NAMELEN
+
+#define MAX_NAMELEN 10
+
+void register_irq_proc(unsigned int irq)
+{
+ char name [MAX_NAMELEN];
+
+ if (!root_irq_dir ||
+ (irq_desc[irq].handler == &no_irq_type) ||
+ irq_dir[irq])
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ sprintf(name, "%d", irq);
+
+ /* create /proc/irq/1234 */
+ irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+
+#ifdef CONFIG_SMP
+ {
+ struct proc_dir_entry *entry;
+
+ /* create /proc/irq/<irq>/smp_affinity */
+ entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
+
+ if (entry) {
+ entry->nlink = 1;
+ entry->data = (void *)(long)irq;
+ entry->read_proc = irq_affinity_read_proc;
+ entry->write_proc = irq_affinity_write_proc;
+ }
+ smp_affinity_entry[irq] = entry;
+ }
+#endif
+}
+
+#undef MAX_NAMELEN
+
+void unregister_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ if (action->dir)
+ remove_proc_entry(action->dir->name, irq_dir[irq]);
+}
+
+void init_irq_proc(void)
+{
+ int i;
+
+ /* create /proc/irq */
+ root_irq_dir = proc_mkdir("irq", NULL);
+ if (!root_irq_dir)
+ return;
+
+ /*
+ * Create entries for all existing IRQs.
+ */
+ for (i = 0; i < NR_IRQS; i++)
+ register_irq_proc(i);
+}
+
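The proc.c file above exposes /proc/irq/<n>/smp_affinity, and its write handler rejects any mask with no online CPU. A small user-space sketch of pinning an interrupt (not from the patch); IRQ 19 and CPU 0 are arbitrary illustration values:

#include <stdio.h>

int main(void)
{
	/* Hex bitmask of allowed CPUs: 1 == CPU 0 only. */
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "1\n");		/* cpumask_parse() expects a hex mask */
	if (fclose(f) != 0) {		/* an empty/offline mask is rejected */
		perror("fclose");
		return 1;
	}
	return 0;
}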
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
new file mode 100644
index 000000000000..f6297c306905
--- /dev/null
+++ b/kernel/irq/spurious.c
@@ -0,0 +1,96 @@
+/*
+ * linux/kernel/irq/spurious.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains spurious interrupt handling.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+
+/*
+ * If 99,900 of the previous 100,000 interrupts have not been handled
+ * then assume that the IRQ is stuck in some manner. Drop a diagnostic
+ * and try to turn the IRQ off.
+ *
+ * (The other 100-of-100,000 interrupts may have been a correctly
+ * functioning device sharing an IRQ with the failing one)
+ *
+ * Called under desc->lock
+ */
+
+static void
+__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ struct irqaction *action;
+
+ if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+ printk(KERN_ERR "irq event %d: bogus return value %x\n",
+ irq, action_ret);
+ } else {
+ printk(KERN_ERR "irq %d: nobody cared!\n", irq);
+ }
+ dump_stack();
+ printk(KERN_ERR "handlers:\n");
+ action = desc->action;
+ while (action) {
+ printk(KERN_ERR "[<%p>]", action->handler);
+ print_symbol(" (%s)",
+ (unsigned long)action->handler);
+ printk("\n");
+ action = action->next;
+ }
+}
+
+void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ static int count = 100;
+
+ if (count > 0) {
+ count--;
+ __report_bad_irq(irq, desc, action_ret);
+ }
+}
+
+void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ if (action_ret != IRQ_HANDLED) {
+ desc->irqs_unhandled++;
+ if (action_ret != IRQ_NONE)
+ report_bad_irq(irq, desc, action_ret);
+ }
+
+ desc->irq_count++;
+ if (desc->irq_count < 100000)
+ return;
+
+ desc->irq_count = 0;
+ if (desc->irqs_unhandled > 99900) {
+ /*
+ * The interrupt is stuck
+ */
+ __report_bad_irq(irq, desc, action_ret);
+ /*
+ * Now kill the IRQ
+ */
+ printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ desc->status |= IRQ_DISABLED;
+ desc->handler->disable(irq);
+ }
+ desc->irqs_unhandled = 0;
+}
+
+int noirqdebug;
+
+int __init noirqdebug_setup(char *str)
+{
+ noirqdebug = 1;
+ printk(KERN_INFO "IRQ lockup detection disabled\n");
+ return 1;
+}
+
+__setup("noirqdebug", noirqdebug_setup);
+
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6918cb7460a8..95fbf1c6becf 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
+#include <linux/syscalls.h>
#include <linux/time.h>
#include <asm/uaccess.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 01436a31c690..ca4e28b4c6a7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -25,6 +25,8 @@
* hlists and exceptions notifier as suggested by Andi Kleen.
* 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
* interface to access function arguments.
+ * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
+ * exceptions notifier to be first on the priority list.
*/
#include <linux/kprobes.h>
#include <linux/spinlock.h>
@@ -108,6 +110,7 @@ void unregister_kprobe(struct kprobe *p)
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
+ .priority = 0x7fffffff /* we need to be notified first */
};
int register_jprobe(struct jprobe *jp)
diff --git a/kernel/panic.c b/kernel/panic.c
index b3abe97f88a6..fce7f4030d0a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -16,7 +16,6 @@
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/sysrq.h>
-#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/nmi.h>
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ef5c42101748..c2dc4f89da41 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -43,6 +43,7 @@
#include <linux/compiler.h>
#include <linux/idr.h>
#include <linux/posix-timers.h>
+#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
@@ -219,11 +220,6 @@ static __init int init_posix_timers(void)
.clock_set = do_posix_clock_monotonic_settime
};
-#ifdef CONFIG_TIME_INTERPOLATION
- /* Clocks are more accurate with time interpolators */
- clock_realtime.res = clock_monotonic.res = time_interpolator_resolution();
-#endif
-
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 00b390d7a5ad..7ff375e7c95f 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -11,7 +11,9 @@
static int new_loglevel = 10;
static int orig_loglevel;
+#ifdef SUSPEND_CONSOLE
static int orig_fgconsole, orig_kmsg;
+#endif
int pm_prepare_console(void)
{
@@ -50,6 +52,7 @@ void pm_restore_console(void)
acquire_console_sem();
set_console(orig_fgconsole);
release_console_sem();
+ kmsg_redirect = orig_kmsg;
#endif
return;
}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 312aa169c566..4cc81e03085c 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -85,10 +85,20 @@ static int in_suspend __nosavedata = 0;
static void free_some_memory(void)
{
- printk("Freeing memory: ");
- while (shrink_all_memory(10000))
- printk(".");
- printk("|\n");
+ unsigned int i = 0;
+ unsigned int tmp;
+ unsigned long pages = 0;
+ char *p = "-\\|/";
+
+ printk("Freeing memory... ");
+ while ((tmp = shrink_all_memory(10000))) {
+ pages += tmp;
+ printk("\b%c", p[i]);
+ i++;
+ if (i > 3)
+ i = 0;
+ }
+ printk("\bdone (%li pages freed)\n", pages);
}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index bda013de59a5..78d92dc6a1ed 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -23,8 +23,8 @@ static inline int freezeable(struct task_struct * p)
{
if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
- (p->state == TASK_ZOMBIE) ||
- (p->state == TASK_DEAD) ||
+ (p->exit_state == EXIT_ZOMBIE) ||
+ (p->exit_state == EXIT_DEAD) ||
(p->state == TASK_STOPPED) ||
(p->state == TASK_TRACED))
return 0;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f65d295478c4..375299c86c45 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -294,15 +294,19 @@ static int data_write(void)
{
int error = 0;
int i;
+ unsigned int mod = nr_copy_pages / 100;
- printk( "Writing data to swap (%d pages): ", nr_copy_pages );
+ if (!mod)
+ mod = 1;
+
+ printk( "Writing data to swap (%d pages)... ", nr_copy_pages );
for (i = 0; i < nr_copy_pages && !error; i++) {
- if (!(i%100))
- printk( "." );
+ if (!(i%mod))
+ printk( "\b\b\b\b%3d%%", i / mod );
error = write_page((pagedir_nosave+i)->address,
&((pagedir_nosave+i)->swap_address));
}
- printk(" %d Pages done.\n",i);
+ printk("\b\b\b\bdone\n");
return error;
}
@@ -856,7 +860,9 @@ int swsusp_suspend(void)
local_irq_disable();
save_processor_state();
error = swsusp_arch_suspend();
+ /* Restore control flow magically appears here */
restore_processor_state();
+ restore_highmem();
local_irq_enable();
return error;
}
@@ -876,8 +882,13 @@ int swsusp_resume(void)
{
int error;
local_irq_disable();
+ /* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
error = swsusp_arch_resume();
+ /* Code below is only ever reached in case of failure. Otherwise
+ * execution continues at place where swsusp_arch_suspend was called
+ */
+ BUG_ON(!error);
restore_processor_state();
restore_highmem();
local_irq_enable();
@@ -1134,14 +1145,18 @@ static int __init data_read(void)
struct pbe * p;
int error;
int i;
+ int mod = nr_copy_pages / 100;
+
+ if (!mod)
+ mod = 1;
if ((error = swsusp_pagedir_relocate()))
return error;
- printk( "Reading image data (%d pages): ", nr_copy_pages );
+ printk( "Reading image data (%d pages): ", nr_copy_pages );
for(i = 0, p = pagedir_nosave; i < nr_copy_pages && !error; i++, p++) {
- if (!(i%100))
- printk( "." );
+ if (!(i%mod))
+ printk( "\b\b\b\b%3d%%", i / mod );
error = bio_read_page(swp_offset(p->swap_address),
(void *)p->address);
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 18dba205c171..c02ec626f384 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -30,6 +30,7 @@
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index cab14764cea0..1c4375fad923 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -22,14 +22,14 @@ static int __init profile_setup(char * str)
int par;
if (!strncmp(str, "schedule", 8)) {
- prof_on = 2;
+ prof_on = SCHED_PROFILING;
printk(KERN_INFO "kernel schedule profiling enabled\n");
if (str[7] == ',')
str += 8;
}
if (get_option(&str,&par)) {
prof_shift = par;
- prof_on = 1;
+ prof_on = CPU_PROFILING;
printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
prof_shift);
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b14b4a467729..c3ac348e2368 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -82,7 +82,8 @@ int ptrace_check_attach(struct task_struct *child, int kill)
*/
read_lock(&tasklist_lock);
if ((child->ptrace & PT_PTRACED) && child->parent == current &&
- child->signal != NULL) {
+ (!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
+ && child->signal != NULL) {
ret = 0;
spin_lock_irq(&child->sighand->siglock);
if (child->state == TASK_STOPPED) {
@@ -131,7 +132,7 @@ int ptrace_attach(struct task_struct *task)
goto bad;
/* Go */
- task->ptrace |= PT_PTRACED;
+ task->ptrace |= PT_PTRACED | PT_ATTACHED;
if (capable(CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
task_unlock(task);
@@ -162,7 +163,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
write_lock_irq(&tasklist_lock);
__ptrace_unlink(child);
/* .. and wake it up. */
- if (child->state != TASK_ZOMBIE)
+ if (child->exit_state != EXIT_ZOMBIE)
wake_up_process(child);
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index 9b395e8d616d..8e5e2af64509 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -42,6 +42,7 @@
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
+#include <linux/syscalls.h>
#include <linux/times.h>
#include <asm/tlb.h>
@@ -180,17 +181,8 @@ static unsigned int task_timeslice(task_t *p)
else
return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
}
-#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
-
-enum idle_type
-{
- IDLE,
- NOT_IDLE,
- NEWLY_IDLE,
- MAX_IDLE_TYPES
-};
-
-struct sched_domain;
+#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
+ < (long long) (sd)->cache_hot_time)
/*
* These are the runqueue data structures:
@@ -289,140 +281,6 @@ struct runqueue {
static DEFINE_PER_CPU(struct runqueue, runqueues);
-/*
- * sched-domains (multiprocessor balancing) declarations:
- */
-#ifdef CONFIG_SMP
-#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
-
-#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */
-#define SD_BALANCE_EXEC 2 /* Balance on exec */
-#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */
-#define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */
-
-struct sched_group {
- struct sched_group *next; /* Must be a circular list */
- cpumask_t cpumask;
-
- /*
- * CPU power of this group, SCHED_LOAD_SCALE being max power for a
- * single CPU. This should be read only (except for setup). Although
- * it will need to be written to at cpu hot(un)plug time, perhaps the
- * cpucontrol semaphore will provide enough exclusion?
- */
- unsigned long cpu_power;
-};
-
-struct sched_domain {
- /* These fields must be setup */
- struct sched_domain *parent; /* top domain must be null terminated */
- struct sched_group *groups; /* the balancing groups of the domain */
- cpumask_t span; /* span of all CPUs in this domain */
- unsigned long min_interval; /* Minimum balance interval ms */
- unsigned long max_interval; /* Maximum balance interval ms */
- unsigned int busy_factor; /* less balancing by factor if busy */
- unsigned int imbalance_pct; /* No balance until over watermark */
- unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
- unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
- unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */
- int flags; /* See SD_* */
-
- /* Runtime fields. */
- unsigned long last_balance; /* init to jiffies. units in jiffies */
- unsigned int balance_interval; /* initialise to 1. units in ms. */
- unsigned int nr_balance_failed; /* initialise to 0 */
-
-#ifdef CONFIG_SCHEDSTATS
- /* load_balance() stats */
- unsigned long lb_cnt[MAX_IDLE_TYPES];
- unsigned long lb_failed[MAX_IDLE_TYPES];
- unsigned long lb_imbalance[MAX_IDLE_TYPES];
- unsigned long lb_nobusyg[MAX_IDLE_TYPES];
- unsigned long lb_nobusyq[MAX_IDLE_TYPES];
-
- /* sched_balance_exec() stats */
- unsigned long sbe_attempts;
- unsigned long sbe_pushed;
-
- /* try_to_wake_up() stats */
- unsigned long ttwu_wake_affine;
- unsigned long ttwu_wake_balance;
-#endif
-};
-
-#ifndef ARCH_HAS_SCHED_TUNE
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_WAKE_IDLE
-/* Common values for SMT siblings */
-#define SD_SIBLING_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 1, \
- .max_interval = 2, \
- .busy_factor = 8, \
- .imbalance_pct = 110, \
- .cache_hot_time = 0, \
- .cache_nice_tries = 0, \
- .per_cpu_gain = 25, \
- .flags = SD_BALANCE_NEWIDLE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_IDLE \
- | SD_SHARE_CPUPOWER, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-#endif
-
-/* Common values for CPUs */
-#define SD_CPU_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 1, \
- .max_interval = 4, \
- .busy_factor = 64, \
- .imbalance_pct = 125, \
- .cache_hot_time = (5*1000000/2), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_BALANCE_NEWIDLE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-
-/* Arch can override this macro in processor.h */
-#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
-#define SD_NODE_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 8, \
- .max_interval = 32, \
- .busy_factor = 32, \
- .imbalance_pct = 125, \
- .cache_hot_time = (10*1000000), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-#endif
-#endif /* ARCH_HAS_SCHED_TUNE */
-#endif
-
-
#define for_each_domain(cpu, domain) \
for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
@@ -501,7 +359,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
- for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++)
+ for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++)
seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
rq->pt_lost[itype]);
seq_printf(seq, "\n");
@@ -513,7 +371,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
- for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) {
+ for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
+ itype++) {
seq_printf(seq, " %lu %lu %lu %lu %lu",
sd->lb_cnt[itype],
sd->lb_failed[itype],
@@ -1087,8 +946,7 @@ static int wake_idle(int cpu, task_t *p)
if (!(sd->flags & SD_WAKE_IDLE))
return cpu;
- cpus_and(tmp, sd->span, cpu_online_map);
- cpus_and(tmp, tmp, p->cpus_allowed);
+ cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
@@ -1123,14 +981,14 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
int cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
- runqueue_t *rq;
+ runqueue_t *rq, *old_rq;
#ifdef CONFIG_SMP
unsigned long load, this_load;
struct sched_domain *sd;
int new_cpu;
#endif
- rq = task_rq_lock(p, &flags);
+ old_rq = rq = task_rq_lock(p, &flags);
schedstat_inc(rq, ttwu_cnt);
old_state = p->state;
if (!(old_state & state))
@@ -1225,7 +1083,7 @@ out_set_cpu:
out_activate:
#endif /* CONFIG_SMP */
if (old_state == TASK_UNINTERRUPTIBLE) {
- rq->nr_uninterruptible--;
+ old_rq->nr_uninterruptible--;
/*
* Tasks on involuntary sleep don't earn
* sleep_avg beyond just interactive state.
@@ -1471,10 +1329,10 @@ static void finish_task_switch(task_t *prev)
/*
* A task struct has one reference for the use as "current".
- * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
- * schedule one last time. The schedule call will never return,
+ * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
+ * calls schedule one last time. The schedule call will never return,
* and the scheduled task must drop that reference.
- * The test for TASK_ZOMBIE must occur while the runqueue locks are
+ * The test for EXIT_ZOMBIE must occur while the runqueue locks are
* still held, otherwise prev could be scheduled on another cpu, die
* there before we look at prev->state, and then the reference would
* be dropped twice.
@@ -1640,8 +1498,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
min_cpu = UINT_MAX;
min_load = ULONG_MAX;
- cpus_and(mask, sd->span, cpu_online_map);
- cpus_and(mask, mask, p->cpus_allowed);
+ cpus_and(mask, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, mask) {
load = target_load(i);
@@ -1893,7 +1750,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
max_load = this_load = total_load = total_pwr = 0;
do {
- cpumask_t tmp;
unsigned long load;
int local_group;
int i, nr_cpus = 0;
@@ -1902,11 +1758,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
/* Tally up the load of all CPUs in the group */
avg_load = 0;
- cpus_and(tmp, group->cpumask, cpu_online_map);
- if (unlikely(cpus_empty(tmp)))
- goto nextgroup;
- for_each_cpu_mask(i, tmp) {
+ for_each_cpu_mask(i, group->cpumask) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i);
@@ -2011,7 +1864,7 @@ nextgroup:
out_balanced:
if (busiest && (idle == NEWLY_IDLE ||
- (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
+ (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
*imbalance = 1;
return busiest;
}
@@ -2025,13 +1878,11 @@ out_balanced:
*/
static runqueue_t *find_busiest_queue(struct sched_group *group)
{
- cpumask_t tmp;
unsigned long load, max_load = 0;
runqueue_t *busiest = NULL;
int i;
- cpus_and(tmp, group->cpumask, cpu_online_map);
- for_each_cpu_mask(i, tmp) {
+ for_each_cpu_mask(i, group->cpumask) {
load = source_load(i);
if (load > max_load) {
@@ -2232,18 +2083,13 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
group = sd->groups;
do {
- cpumask_t tmp;
runqueue_t *rq;
int push_cpu = 0;
if (group == busy_group)
goto next_group;
- cpus_and(tmp, group->cpumask, cpu_online_map);
- if (!cpus_weight(tmp))
- goto next_group;
-
- for_each_cpu_mask(i, tmp) {
+ for_each_cpu_mask(i, group->cpumask) {
if (!idle_cpu(i))
goto next_group;
push_cpu = i;
@@ -2260,7 +2106,7 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
if (unlikely(busiest == rq))
goto next_group;
double_lock_balance(busiest, rq);
- if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) {
+ if (move_tasks(rq, push_cpu, busiest, 1, sd, SCHED_IDLE)) {
schedstat_inc(busiest, alb_lost);
schedstat_inc(rq, alb_gained);
} else {
@@ -2304,9 +2150,13 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
this_rq->cpu_load = (old_load + this_load) / 2;
for_each_domain(this_cpu, sd) {
- unsigned long interval = sd->balance_interval;
+ unsigned long interval;
- if (idle != IDLE)
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ interval = sd->balance_interval;
+ if (idle != SCHED_IDLE)
interval *= sd->busy_factor;
/* scale ms to jiffies */
@@ -2408,7 +2258,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
cpustat->idle += sys_ticks;
if (wake_priority_sleeper(rq))
goto out;
- rebalance_tick(cpu, rq, IDLE);
+ rebalance_tick(cpu, rq, SCHED_IDLE);
return;
}
if (TASK_NICE(p) > 0)
@@ -2512,7 +2362,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
*/
spin_unlock(&this_rq->lock);
- cpus_and(sibling_map, sd->span, cpu_online_map);
+ sibling_map = sd->span;
for_each_cpu_mask(i, sibling_map)
spin_lock(&cpu_rq(i)->lock);
@@ -2557,7 +2407,7 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
* wake_sleeping_dependent():
*/
spin_unlock(&this_rq->lock);
- cpus_and(sibling_map, sd->span, cpu_online_map);
+ sibling_map = sd->span;
for_each_cpu_mask(i, sibling_map)
spin_lock(&cpu_rq(i)->lock);
cpu_clear(this_cpu, sibling_map);
@@ -2639,12 +2489,15 @@ asmlinkage void __sched schedule(void)
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
- if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
+ if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {
if (unlikely(in_atomic())) {
- printk(KERN_ERR "bad: scheduling while atomic!\n");
+ printk(KERN_ERR "scheduling while atomic: "
+ "%s/0x%08x/%d\n",
+ current->comm, preempt_count(), current->pid);
dump_stack();
}
}
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
need_resched:
preempt_disable();
@@ -2678,6 +2531,8 @@ need_resched:
spin_lock_irq(&rq->lock);
+ if (unlikely(current->flags & PF_DEAD))
+ current->state = EXIT_DEAD;
/*
* if entering off of a kernel preemption go straight
* to picking the next task.
@@ -2764,7 +2619,7 @@ switch_tasks:
if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
prev->interactive_credit--;
}
- prev->timestamp = now;
+ prev->timestamp = prev->last_ran = now;
sched_info_switch(prev, next);
if (likely(prev != next)) {
@@ -3221,7 +3076,6 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
policy != SCHED_NORMAL)
goto out_unlock;
}
- profile_hit(SCHED_PROFILING, __builtin_return_address(0));
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4068,7 +3922,7 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
struct runqueue *rq = cpu_rq(dead_cpu);
/* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(tsk->state != TASK_ZOMBIE && tsk->state != TASK_DEAD);
+ BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
/* Cannot have done final schedule yet: would have vanished. */
BUG_ON(tsk->flags & PF_DEAD);
@@ -4209,16 +4063,17 @@ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
EXPORT_SYMBOL(kernel_flag);
#ifdef CONFIG_SMP
-/* Attach the domain 'sd' to 'cpu' as its base domain */
-static void cpu_attach_domain(struct sched_domain *sd, int cpu)
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
{
migration_req_t req;
unsigned long flags;
runqueue_t *rq = cpu_rq(cpu);
int local = 1;
- lock_cpu_hotplug();
-
spin_lock_irqsave(&rq->lock, flags);
if (cpu == smp_processor_id() || !cpu_online(cpu)) {
@@ -4237,128 +4092,10 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
wake_up_process(rq->migration_thread);
wait_for_completion(&req.done);
}
-
- unlock_cpu_hotplug();
}
-/*
- * To enable disjoint top-level NUMA domains, define SD_NODES_PER_DOMAIN
- * in arch code. That defines the number of nearby nodes in a node's top
- * level scheduling domain.
- */
-#if defined(CONFIG_NUMA) && defined(SD_NODES_PER_DOMAIN)
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int __init find_next_best_node(int node, unsigned long *used_nodes)
-{
- int i, n, val, min_val, best_node = 0;
-
- min_val = INT_MAX;
-
- for (i = 0; i < numnodes; i++) {
- /* Start at @node */
- n = (node + i) % numnodes;
-
- /* Skip already used nodes */
- if (test_bit(n, used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, i);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- set_bit(best_node, used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-cpumask_t __init sched_domain_node_span(int node)
-{
- int i;
- cpumask_t span;
- DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
- cpus_clear(span);
- bitmap_zero(used_nodes, MAX_NUMNODES);
-
- for (i = 0; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, used_nodes);
- cpumask_t nodemask;
-
- nodemask = node_to_cpumask(next_node);
- cpus_or(span, span, nodemask);
- }
-
- return span;
-}
-#else /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */
-cpumask_t __init sched_domain_node_span(int node)
-{
- return cpu_possible_map;
-}
-#endif /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */
-
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-__init static int cpu_to_cpu_group(int cpu)
-{
- return cpu;
-}
-#endif
-
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-__init static int cpu_to_phys_group(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- return first_cpu(cpu_sibling_map[cpu]);
-#else
- return cpu;
-#endif
-}
-
-#ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-__init static int cpu_to_node_group(int cpu)
-{
- return cpu_to_node(cpu);
-}
-#endif
-
-/* Groups for isolated scheduling domains */
-static struct sched_group sched_group_isolated[NR_CPUS];
-
/* cpus with isolated domains */
-cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE;
-
-__init static int cpu_to_isolated_group(int cpu)
-{
- return cpu;
-}
+cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
@@ -4385,7 +4122,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-__init static void init_sched_build_groups(struct sched_group groups[],
+void __devinit init_sched_build_groups(struct sched_group groups[],
cpumask_t span, int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
@@ -4419,56 +4156,99 @@ __init static void init_sched_build_groups(struct sched_group groups[],
last->next = first;
}
-__init static void arch_init_sched_domains(void)
+
+#ifdef ARCH_HAS_SCHED_DOMAIN
+extern void __devinit arch_init_sched_domains(void);
+extern void __devinit arch_destroy_sched_domains(void);
+#else
+#ifdef CONFIG_SCHED_SMT
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static struct sched_group sched_group_cpus[NR_CPUS];
+static int __devinit cpu_to_cpu_group(int cpu)
+{
+ return cpu;
+}
+#endif
+
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static struct sched_group sched_group_phys[NR_CPUS];
+static int __devinit cpu_to_phys_group(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ return first_cpu(cpu_sibling_map[cpu]);
+#else
+ return cpu;
+#endif
+}
+
+#ifdef CONFIG_NUMA
+
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static int __devinit cpu_to_node_group(int cpu)
+{
+ return cpu_to_node(cpu);
+}
+#endif
+
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+/*
+ * The domains setup code relies on siblings not spanning
+ * multiple nodes. Make sure the architecture has a proper
+ * siblings map:
+ */
+static void check_sibling_maps(void)
+{
+ int i, j;
+
+ for_each_online_cpu(i) {
+ for_each_cpu_mask(j, cpu_sibling_map[i]) {
+ if (cpu_to_node(i) != cpu_to_node(j)) {
+ printk(KERN_INFO "warning: CPU %d siblings map "
+ "to different node - isolating "
+ "them.\n", i);
+ cpu_sibling_map[i] = cpumask_of_cpu(i);
+ break;
+ }
+ }
+ }
+}
+#endif
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ */
+static void __devinit arch_init_sched_domains(void)
{
int i;
cpumask_t cpu_default_map;
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+ check_sibling_maps();
+#endif
/*
* Setup mask for cpus without special case scheduling requirements.
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
*/
cpus_complement(cpu_default_map, cpu_isolated_map);
- cpus_and(cpu_default_map, cpu_default_map, cpu_possible_map);
+ cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
- /* Set up domains */
- for_each_cpu(i) {
+ /*
+ * Set up domains. Isolated domains just stay on the dummy domain.
+ */
+ for_each_cpu_mask(i, cpu_default_map) {
int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
cpus_and(nodemask, nodemask, cpu_default_map);
- /*
- * Set up isolated domains.
- * Unlike those of other cpus, the domains and groups are
- * single level, and span a single cpu.
- */
- if (cpu_isset(i, cpu_isolated_map)) {
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i);
-#else
- sd = &per_cpu(phys_domains, i);
-#endif
- group = cpu_to_isolated_group(i);
- *sd = SD_CPU_INIT;
- cpu_set(i, sd->span);
- sd->balance_interval = INT_MAX; /* Don't balance */
- sd->flags = 0; /* Avoid WAKE_ */
- sd->groups = &sched_group_isolated[group];
- printk(KERN_INFO "Setting up cpu %d isolated.\n", i);
- /* Single level, so continue with next cpu */
- continue;
- }
-
#ifdef CONFIG_NUMA
sd = &per_cpu(node_domains, i);
group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- /* FIXME: should be multilevel, in arch code */
- sd->span = sched_domain_node_span(i);
- cpus_and(sd->span, sd->span, cpu_default_map);
+ sd->span = cpu_default_map;
sd->groups = &sched_group_nodes[group];
#endif
@@ -4476,11 +4256,7 @@ __init static void arch_init_sched_domains(void)
sd = &per_cpu(phys_domains, i);
group = cpu_to_phys_group(i);
*sd = SD_CPU_INIT;
-#ifdef CONFIG_NUMA
sd->span = nodemask;
-#else
- sd->span = cpu_possible_map;
-#endif
sd->parent = p;
sd->groups = &sched_group_phys[group];
@@ -4498,7 +4274,7 @@ __init static void arch_init_sched_domains(void)
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_cpu(i) {
+ for_each_online_cpu(i) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
if (i != first_cpu(this_sibling_map))
@@ -4509,16 +4285,6 @@ __init static void arch_init_sched_domains(void)
}
#endif
- /* Set up isolated groups */
- for_each_cpu_mask(i, cpu_isolated_map) {
- cpumask_t mask;
- cpus_clear(mask);
- cpu_set(i, mask);
- init_sched_build_groups(sched_group_isolated, mask,
- &cpu_to_isolated_group);
- }
-
-#ifdef CONFIG_NUMA
/* Set up physical groups */
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
@@ -4530,10 +4296,6 @@ __init static void arch_init_sched_domains(void)
init_sched_build_groups(sched_group_phys, nodemask,
&cpu_to_phys_group);
}
-#else
- init_sched_build_groups(sched_group_phys, cpu_possible_map,
- &cpu_to_phys_group);
-#endif
#ifdef CONFIG_NUMA
/* Set up node groups */
@@ -4566,7 +4328,7 @@ __init static void arch_init_sched_domains(void)
}
/* Attach the domains */
- for_each_cpu(i) {
+ for_each_online_cpu(i) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
@@ -4577,21 +4339,29 @@ __init static void arch_init_sched_domains(void)
}
}
-#undef SCHED_DOMAIN_DEBUG
+#ifdef CONFIG_HOTPLUG_CPU
+static void __devinit arch_destroy_sched_domains(void)
+{
+ /* Do nothing: everything is statically allocated. */
+}
+#endif
+
+#endif /* ARCH_HAS_SCHED_DOMAIN */
+
+#define SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
-void sched_domain_debug(void)
+static void sched_domain_debug(void)
{
int i;
- for_each_cpu(i) {
+ for_each_online_cpu(i) {
runqueue_t *rq = cpu_rq(i);
struct sched_domain *sd;
int level = 0;
sd = rq->sd;
- printk(KERN_DEBUG "CPU%d: %s\n",
- i, (cpu_online(i) ? " online" : "offline"));
+ printk(KERN_DEBUG "CPU%d:\n", i);
do {
int j;
@@ -4605,7 +4375,17 @@ void sched_domain_debug(void)
printk(KERN_DEBUG);
for (j = 0; j < level + 1; j++)
printk(" ");
- printk("domain %d: span %s\n", level, str);
+ printk("domain %d: ", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not balance");
+ if (sd->parent)
+ printk(" ERROR !SD_LOAD_BALANCE domain has parent");
+ printk("\n");
+ break;
+ }
+
+ printk("span %s\n", str);
if (!cpu_isset(i, sd->span))
printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
@@ -4657,10 +4437,64 @@ void sched_domain_debug(void)
#define sched_domain_debug() {}
#endif
+#ifdef CONFIG_SMP
+/*
+ * Initial dummy domain for early boot and for hotplug cpu. Being static,
+ * it is initialized to zero, so all balancing flags are cleared which is
+ * what we want.
+ */
+static struct sched_domain sched_domain_dummy;
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Force a reinitialization of the sched domains hierarchy. The domains
+ * and groups cannot be updated in place without racing with the balancing
+ * code, so we temporarily attach all running cpus to a "dummy" domain
+ * which will prevent rebalancing while the sched domains are recalculated.
+ */
+static int update_sched_domains(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int i;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_PREPARE:
+ for_each_online_cpu(i)
+ cpu_attach_domain(&sched_domain_dummy, i);
+ arch_destroy_sched_domains();
+ return NOTIFY_OK;
+
+ case CPU_UP_CANCELED:
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ /*
+ * Fall through and re-initialise the domains.
+ */
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* The hotplug lock is already held by cpu_up/cpu_down */
+ arch_init_sched_domains();
+
+ sched_domain_debug();
+
+ return NOTIFY_OK;
+}
+#endif
+
void __init sched_init_smp(void)
{
+ lock_cpu_hotplug();
arch_init_sched_domains();
sched_domain_debug();
+ unlock_cpu_hotplug();
+ /* XXX: Theoretical race here - CPU may be hotplugged now */
+ hotcpu_notifier(update_sched_domains, 0);
}
#else
void __init sched_init_smp(void)
@@ -4682,24 +4516,6 @@ void __init sched_init(void)
runqueue_t *rq;
int i, j, k;
-#ifdef CONFIG_SMP
- /* Set up an initial dummy domain for early boot */
- static struct sched_domain sched_domain_init;
- static struct sched_group sched_group_init;
-
- memset(&sched_domain_init, 0, sizeof(struct sched_domain));
- sched_domain_init.span = CPU_MASK_ALL;
- sched_domain_init.groups = &sched_group_init;
- sched_domain_init.last_balance = jiffies;
- sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
- sched_domain_init.busy_factor = 1;
-
- memset(&sched_group_init, 0, sizeof(struct sched_group));
- sched_group_init.cpumask = CPU_MASK_ALL;
- sched_group_init.next = &sched_group_init;
- sched_group_init.cpu_power = SCHED_LOAD_SCALE;
-#endif
-
for (i = 0; i < NR_CPUS; i++) {
prio_array_t *array;
@@ -4710,7 +4526,7 @@ void __init sched_init(void)
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
- rq->sd = &sched_domain_init;
+ rq->sd = &sched_domain_dummy;
rq->cpu_load = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index df2cd4216838..f67390806d73 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -20,6 +20,7 @@
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/security.h>
+#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <asm/param.h>
#include <asm/uaccess.h>
@@ -269,7 +270,7 @@ static struct sigqueue *__sigqueue_alloc(void)
struct sigqueue *q = NULL;
if (atomic_read(&current->user->sigpending) <
- current->rlim[RLIMIT_SIGPENDING].rlim_cur)
+ current->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
if (q) {
INIT_LIST_HEAD(&q->list);
@@ -764,7 +765,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
pass on the info struct. */
if (atomic_read(&t->user->sigpending) <
- t->rlim[RLIMIT_SIGPENDING].rlim_cur)
+ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
if (q) {
@@ -913,7 +914,7 @@ __group_complete_signal(int sig, struct task_struct *p)
* Don't bother zombies and stopped tasks (but
* SIGKILL will punch through stopped state)
*/
- mask = TASK_DEAD | TASK_ZOMBIE | TASK_TRACED;
+ mask = EXIT_DEAD | EXIT_ZOMBIE | TASK_TRACED;
if (sig != SIGKILL)
mask |= TASK_STOPPED;
@@ -1069,7 +1070,7 @@ void zap_other_threads(struct task_struct *p)
/*
* Don't bother with already dead threads
*/
- if (t->state & (TASK_ZOMBIE|TASK_DEAD))
+ if (t->exit_state & (EXIT_ZOMBIE|EXIT_DEAD))
continue;
/*
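The signal.c hunks above move the RLIMIT_SIGPENDING check (and the zombie test in __group_complete_signal) over to the per-process signal structure and the new exit_state field. Seen from userspace, the per-user pending limit being checked there bounds queued signals such as those sent with sigqueue(3); a small self-targeted example follows, plain POSIX and not part of the patch:

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        union sigval val = { .sival_int = 42 };
        sigset_t set;

        /* Block SIGRTMIN so the queued signal stays pending. */
        sigemptyset(&set);
        sigaddset(&set, SIGRTMIN);
        sigprocmask(SIG_BLOCK, &set, NULL);

        /* Each successful sigqueue() consumes one pending-signal slot;
         * EAGAIN is what the caller sees once the limit is exhausted. */
        if (sigqueue(getpid(), SIGRTMIN, val) != 0) {
            perror("sigqueue");
            return 1;
        }
        printf("queued one realtime signal to self\n");
        return 0;
    }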
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4a3da9be9f26..0e1e1356e35a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -137,11 +137,17 @@ EXPORT_SYMBOL(do_softirq);
void local_bh_enable(void)
{
- __local_bh_enable();
WARN_ON(irqs_disabled());
- if (unlikely(!in_interrupt() &&
- local_softirq_pending()))
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ preempt_count() -= SOFTIRQ_OFFSET - 1;
+
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
invoke_softirq();
+
+ dec_preempt_count();
preempt_check_resched();
}
EXPORT_SYMBOL(local_bh_enable);
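To make the arithmetic in the new local_bh_enable() explicit: subtracting SOFTIRQ_OFFSET - 1 drops the softirq-disable count but deliberately leaves one unit in the preempt count, so pending softirqs run with preemption still disabled, and the final dec_preempt_count() releases that last unit. A toy walk-through with an assumed SOFTIRQ_OFFSET of 0x100 (the value here is illustrative, not quoted from the headers):

    #include <stdio.h>

    #define SOFTIRQ_OFFSET 0x100u   /* assumed layout, for illustration only */

    int main(void)
    {
        unsigned int count = 0;

        count += SOFTIRQ_OFFSET;            /* local_bh_disable()            */
        count -= SOFTIRQ_OFFSET - 1;        /* start of local_bh_enable()    */
        printf("while softirqs run: count = %#x (preemption off)\n", count);

        count -= 1;                         /* dec_preempt_count()           */
        printf("after local_bh_enable: count = %#x\n", count);
        return 0;
    }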
diff --git a/kernel/sys.c b/kernel/sys.c
index 571bba8c8989..a95e3900dc1e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -24,6 +24,11 @@
#include <linux/dcookies.h>
#include <linux/suspend.h>
+/* Don't include this - it breaks ia64's cond_syscall() implementation */
+#if 0
+#include <linux/syscalls.h>
+#endif
+
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>
@@ -649,7 +654,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
return -EAGAIN;
if (atomic_read(&new_user->processes) >=
- current->rlim[RLIMIT_NPROC].rlim_cur &&
+ current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
new_user != &root_user) {
free_uid(new_user);
return -EAGAIN;
@@ -1496,9 +1501,13 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
{
if (resource >= RLIM_NLIMITS)
return -EINVAL;
- else
- return copy_to_user(rlim, current->rlim + resource, sizeof(*rlim))
- ? -EFAULT : 0;
+ else {
+ struct rlimit value;
+ task_lock(current->group_leader);
+ value = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
+ return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
+ }
}
#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1513,7 +1522,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
if (resource >= RLIM_NLIMITS)
return -EINVAL;
- memcpy(&x, current->rlim + resource, sizeof(*rlim));
+ task_lock(current->group_leader);
+ x = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
if(x.rlim_cur > 0x7FFFFFFF)
x.rlim_cur = 0x7FFFFFFF;
if(x.rlim_max > 0x7FFFFFFF)
@@ -1534,21 +1545,20 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
return -EFAULT;
if (new_rlim.rlim_cur > new_rlim.rlim_max)
return -EINVAL;
- old_rlim = current->rlim + resource;
- if (((new_rlim.rlim_cur > old_rlim->rlim_max) ||
- (new_rlim.rlim_max > old_rlim->rlim_max)) &&
+ old_rlim = current->signal->rlim + resource;
+ if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (resource == RLIMIT_NOFILE) {
- if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
+ if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
return -EPERM;
- }
retval = security_task_setrlimit(resource, &new_rlim);
if (retval)
return retval;
+ task_lock(current->group_leader);
*old_rlim = new_rlim;
+ task_unlock(current->group_leader);
return 0;
}
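The sys.c hunks above move the rlimit array behind current->signal and serialize readers and writers with task_lock() on the group leader; the user-visible contract stays as before: a process may raise its soft limit up to its hard limit without privilege, while raising the hard limit still requires CAP_SYS_RESOURCE. A plain POSIX illustration of that contract (not part of the patch):

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rlimit rl;

        if (getrlimit(RLIMIT_NOFILE, &rl) != 0) {
            perror("getrlimit");
            return 1;
        }
        printf("soft=%llu hard=%llu\n",
               (unsigned long long)rl.rlim_cur,
               (unsigned long long)rl.rlim_max);

        /* Raising the soft limit up to the hard limit needs no privilege. */
        rl.rlim_cur = rl.rlim_max;
        if (setrlimit(RLIMIT_NOFILE, &rl) != 0) {
            perror("setrlimit");
            return 1;
        }
        return 0;
    }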
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 99a0af0ed9a8..469cf0c2f26e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -40,6 +40,7 @@
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
diff --git a/kernel/time.c b/kernel/time.c
index 167a72a078ea..0b4797fa3d30 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -31,6 +31,7 @@
#include <linux/timex.h>
#include <linux/errno.h>
#include <linux/smp_lock.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
diff --git a/kernel/timer.c b/kernel/timer.c
index 36e40d8b182d..e3c9b5fcd52f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -31,11 +31,13 @@
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/cpu.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
+#include <asm/io.h>
#ifdef CONFIG_TIME_INTERPOLATION
static void time_interpolator_update(long delta_nsec);
@@ -788,13 +790,12 @@ static void update_wall_time(unsigned long ticks)
do {
ticks--;
update_wall_time_one_tick();
+ if (xtime.tv_nsec >= 1000000000) {
+ xtime.tv_nsec -= 1000000000;
+ xtime.tv_sec++;
+ second_overflow();
+ }
} while (ticks);
-
- if (xtime.tv_nsec >= 1000000000) {
- xtime.tv_nsec -= 1000000000;
- xtime.tv_sec++;
- second_overflow();
- }
}
static inline void do_process_times(struct task_struct *p,
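The update_wall_time() change above moves the nanosecond-overflow test inside the per-tick loop, so that when several lost ticks are replayed in a single call each crossed second boundary runs second_overflow() rather than only the last one. A small simulation of the same loop with illustrative constants (a 100 Hz tick is assumed here, not taken from the patch):

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000L
    #define TICK_NSEC      10000000L   /* assumed 100 Hz tick */

    int main(void)
    {
        long sec = 0, nsec = NSEC_PER_SEC - TICK_NSEC;  /* just below a boundary */
        int ticks = 150;                                /* ticks replayed at once */

        while (ticks--) {
            nsec += TICK_NSEC;              /* update_wall_time_one_tick()   */
            if (nsec >= NSEC_PER_SEC) {     /* per-tick check, as in the fix */
                nsec -= NSEC_PER_SEC;
                sec++;                      /* second_overflow() runs here   */
            }
        }
        printf("advanced %ld s, %ld ns carried over\n", sec, nsec);
        return 0;
    }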
@@ -804,12 +805,13 @@ static inline void do_process_times(struct task_struct *p,
psecs = (p->utime += user);
psecs += (p->stime += system);
- if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_cur) {
+ if (p->signal && !unlikely(p->state & (EXIT_DEAD|EXIT_ZOMBIE)) &&
+ psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) {
/* Send SIGXCPU every second.. */
if (!(psecs % HZ))
send_sig(SIGXCPU, p, 1);
/* and SIGKILL when we go over max.. */
- if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_max)
+ if (psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_max)
send_sig(SIGKILL, p, 1);
}
}
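The do_process_times() hunk above now reads the CPU-time limits from signal->rlim and skips tasks already in EXIT_DEAD/EXIT_ZOMBIE, but the enforced behaviour stays as the comments describe: SIGXCPU once per second past the soft RLIMIT_CPU limit, SIGKILL past the hard limit. A small userspace probe of that behaviour (plain POSIX, not part of the patch):

    #include <signal.h>
    #include <stdio.h>
    #include <sys/resource.h>

    static volatile sig_atomic_t got_xcpu;

    static void on_xcpu(int sig)
    {
        (void)sig;
        got_xcpu = 1;
    }

    int main(void)
    {
        struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };  /* CPU seconds */

        signal(SIGXCPU, on_xcpu);
        if (setrlimit(RLIMIT_CPU, &rl) != 0) {
            perror("setrlimit");
            return 1;
        }
        while (!got_xcpu)
            ;                    /* burn CPU until the soft limit triggers */
        printf("got SIGXCPU after ~1s of CPU time; SIGKILL would follow at 3s\n");
        return 0;
    }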
@@ -1624,13 +1626,13 @@ EXPORT_SYMBOL(msleep);
*/
unsigned long msleep_interruptible(unsigned int msecs)
{
- unsigned long timeout = msecs_to_jiffies(msecs);
+ unsigned long timeout = msecs_to_jiffies(msecs);
- while (timeout && !signal_pending(current)) {
- set_current_state(TASK_INTERRUPTIBLE);
- timeout = schedule_timeout(timeout);
- }
- return jiffies_to_msecs(timeout);
+ while (timeout && !signal_pending(current)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ timeout = schedule_timeout(timeout);
+ }
+ return jiffies_to_msecs(timeout);
}
EXPORT_SYMBOL(msleep_interruptible);