Diffstat (limited to 'kernel')
-rw-r--r--   kernel/Makefile          |    1
-rw-r--r--   kernel/acct.c            |   17
-rw-r--r--   kernel/capability.c      |    1
-rw-r--r--   kernel/cpu.c             |   21
-rw-r--r--   kernel/exec_domain.c     |    1
-rw-r--r--   kernel/exit.c            |  205
-rw-r--r--   kernel/fork.c            |   29
-rw-r--r--   kernel/irq/Makefile      |    4
-rw-r--r--   kernel/irq/autoprobe.c   |  188
-rw-r--r--   kernel/irq/handle.c      |  204
-rw-r--r--   kernel/irq/internals.h   |   18
-rw-r--r--   kernel/irq/manage.c      |  347
-rw-r--r--   kernel/irq/proc.c        |  156
-rw-r--r--   kernel/irq/spurious.c    |   96
-rw-r--r--   kernel/itimer.c          |    1
-rw-r--r--   kernel/kprobes.c         |    3
-rw-r--r--   kernel/panic.c           |    1
-rw-r--r--   kernel/posix-timers.c    |    6
-rw-r--r--   kernel/power/console.c   |    3
-rw-r--r--   kernel/power/disk.c      |   18
-rw-r--r--   kernel/power/process.c   |    4
-rw-r--r--   kernel/power/swsusp.c    |   29
-rw-r--r--   kernel/printk.c          |    1
-rw-r--r--   kernel/profile.c         |    4
-rw-r--r--   kernel/ptrace.c          |    7
-rw-r--r--   kernel/sched.c           |  578
-rw-r--r--   kernel/signal.c          |    9
-rw-r--r--   kernel/softirq.c         |   12
-rw-r--r--   kernel/sys.c             |   32
-rw-r--r--   kernel/sysctl.c          |    1
-rw-r--r--   kernel/time.c            |    1
-rw-r--r--   kernel/timer.c           |   30
32 files changed, 1512 insertions, 516 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 93155e52bb54..c4337751cee7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o +obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ ifneq ($(CONFIG_IA64),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index daf23c4efab4..15ab0324139d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -53,6 +53,7 @@ #include <linux/vfs.h> #include <linux/jiffies.h> #include <linux/times.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ @@ -384,6 +385,8 @@ static void do_acct_process(long exitcode, struct file *file) unsigned long vsize; unsigned long flim; u64 elapsed; + u64 run_time; + struct timespec uptime; /* * First check to see if there is enough free_space to continue @@ -401,7 +404,13 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); - elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time); + /* calculate run_time in nsec*/ + do_posix_clock_monotonic_gettime(&uptime); + run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; + run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC + + current->start_time.tv_nsec; + /* convert nsec -> AHZ */ + elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 ac.ac_etime = encode_float(elapsed); #else @@ -480,11 +489,11 @@ static void do_acct_process(long exitcode, struct file *file) /* * Accounting records are not subject to resource limits. */ - flim = current->rlim[RLIMIT_FSIZE].rlim_cur; - current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; file->f_op->write(file, (char *)&ac, sizeof(acct_t), &file->f_pos); - current->rlim[RLIMIT_FSIZE].rlim_cur = flim; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); } diff --git a/kernel/capability.c b/kernel/capability.c index 7e864e2ccf6a..7800a5066c0f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/security.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 083521327e64..719ba42ff5ee 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -61,13 +61,13 @@ static inline void check_for_tasks(int cpu) * cpu' with certain environment variables set. */ static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action) { - char *argv[3], *envp[5], cpu_str[12], action_str[32]; + char *argv[3], *envp[6], cpu_str[12], action_str[32], devpath_str[40]; int i; sprintf(cpu_str, "CPU=%d", cpu); sprintf(action_str, "ACTION=%s", action); - /* FIXME: Add DEVPATH. 
--RR */ - + sprintf(devpath_str, "DEVPATH=devices/system/cpu/cpu%d", cpu); + i = 0; argv[i++] = hotplug_path; argv[i++] = "cpu"; @@ -79,6 +79,7 @@ static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i++] = cpu_str; envp[i++] = action_str; + envp[i++] = devpath_str; envp[i] = NULL; return call_usermodehelper(argv[0], argv, envp, 0); @@ -123,6 +124,15 @@ int cpu_down(unsigned int cpu) goto out; } + err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + (void *)(long)cpu); + if (err == NOTIFY_BAD) { + printk("%s: attempt to take down CPU %u failed\n", + __FUNCTION__, cpu); + err = -EINVAL; + goto out; + } + /* Ensure that we are not runnable on dying cpu */ old_allowed = current->cpus_allowed; tmp = CPU_MASK_ALL; @@ -131,6 +141,11 @@ int cpu_down(unsigned int cpu) p = __stop_machine_run(take_cpu_down, NULL, cpu); if (IS_ERR(p)) { + /* CPU didn't die: tell everyone. Can't complain. */ + if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + (void *)(long)cpu) == NOTIFY_BAD) + BUG(); + err = PTR_ERR(p); goto out_allowed; } diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 5fba35304459..ad3e5d54e119 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/personality.h> #include <linux/sched.h> +#include <linux/syscalls.h> #include <linux/sysctl.h> #include <linux/types.h> diff --git a/kernel/exit.c b/kernel/exit.c index 6860b509dd11..55d853392524 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -24,6 +24,7 @@ #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/mempolicy.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -75,7 +76,7 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) { + if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { BUG_ON(leader->exit_signal == -1); do_notify_parent(leader, leader->exit_signal); /* @@ -157,7 +158,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task - || p->state >= TASK_ZOMBIE + || p->exit_state >= EXIT_ZOMBIE || p->real_parent->pid == 1) continue; if (process_group(p->real_parent) != pgrp @@ -237,7 +238,8 @@ void reparent_to_init(void) /* rt_priority? */ /* signals? */ security_task_reparent_to_init(current); - memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + memcpy(current->signal->rlim, init_task.signal->rlim, + sizeof(current->signal->rlim)); atomic_inc(&(INIT_USER->__count)); switch_uid(INIT_USER); @@ -517,7 +519,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re * Make sure we're not reparenting to ourselves and that * the parent is not a zombie. */ - BUG_ON(p == reaper || reaper->state >= TASK_ZOMBIE); + BUG_ON(p == reaper || reaper->state >= EXIT_ZOMBIE || reaper->exit_state >= EXIT_ZOMBIE); p->real_parent = reaper; if (p->parent == p->real_parent) BUG(); @@ -552,7 +554,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) /* If we'd notified the old parent about this child's death, * also notify the new parent. 
*/ - if (p->state == TASK_ZOMBIE && p->exit_signal != -1 && + if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); else if (p->state == TASK_TRACED) { @@ -600,7 +602,7 @@ static inline void forget_original_parent(struct task_struct * father, reaper = child_reaper; break; } - } while (reaper->state >= TASK_ZOMBIE); + } while (reaper->exit_state >= EXIT_ZOMBIE); /* * There are only two places where our children can be: @@ -626,7 +628,7 @@ static inline void forget_original_parent(struct task_struct * father, } else { /* reparent ptraced task to its real parent */ __ptrace_unlink (p); - if (p->state == TASK_ZOMBIE && p->exit_signal != -1 && + if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); } @@ -637,7 +639,7 @@ static inline void forget_original_parent(struct task_struct * father, * zombie forever since we prevented it from self-reap itself * while it was being traced by us, to be able to see it in wait4. */ - if (unlikely(ptrace && p->state == TASK_ZOMBIE && p->exit_signal == -1)) + if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) list_add(&p->ptrace_list, to_release); } list_for_each_safe(_p, _n, &father->ptrace_children) { @@ -750,10 +752,10 @@ static void exit_notify(struct task_struct *tsk) do_notify_parent(tsk, SIGCHLD); } - state = TASK_ZOMBIE; + state = EXIT_ZOMBIE; if (tsk->exit_signal == -1 && tsk->ptrace == 0) - state = TASK_DEAD; - tsk->state = state; + state = EXIT_DEAD; + tsk->exit_state = state; /* * Clear these here so that update_process_times() won't try to deliver @@ -761,7 +763,6 @@ static void exit_notify(struct task_struct *tsk) */ tsk->it_virt_value = 0; tsk->it_prof_value = 0; - tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY; write_unlock_irq(&tasklist_lock); @@ -772,7 +773,7 @@ static void exit_notify(struct task_struct *tsk) } /* If the process is dead, release it - nobody will wait for it */ - if (state == TASK_DEAD) + if (state == EXIT_DEAD) release_task(tsk); /* PF_DEAD causes final put_task_struct after we schedule. */ @@ -829,6 +830,8 @@ asmlinkage NORET_TYPE void do_exit(long code) mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif + + BUG_ON(!(current->flags & PF_DEAD)); schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -972,7 +975,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, } /* - * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold + * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. 
@@ -991,7 +994,7 @@ static int wait_task_zombie(task_t *p, int noreap, int exit_code = p->exit_code; int why, status; - if (unlikely(p->state != TASK_ZOMBIE)) + if (unlikely(p->exit_state != EXIT_ZOMBIE)) return 0; if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) return 0; @@ -1012,9 +1015,9 @@ static int wait_task_zombie(task_t *p, int noreap, * Try to move the task's state to DEAD * only one thread is allowed to do this: */ - state = xchg(&p->state, TASK_DEAD); - if (state != TASK_ZOMBIE) { - BUG_ON(state != TASK_DEAD); + state = xchg(&p->exit_state, EXIT_DEAD); + if (state != EXIT_ZOMBIE) { + BUG_ON(state != EXIT_DEAD); return 0; } if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) { @@ -1059,7 +1062,7 @@ static int wait_task_zombie(task_t *p, int noreap, /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to TASK_DEAD. + * thread can reap it because we set its state to EXIT_DEAD. */ read_unlock(&tasklist_lock); @@ -1091,7 +1094,8 @@ static int wait_task_zombie(task_t *p, int noreap, if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (retval) { - p->state = TASK_ZOMBIE; + // TODO: is this safe? + p->exit_state = EXIT_ZOMBIE; return retval; } retval = p->pid; @@ -1100,7 +1104,8 @@ static int wait_task_zombie(task_t *p, int noreap, /* Double-check with lock held. */ if (p->real_parent != p->parent) { __ptrace_unlink(p); - p->state = TASK_ZOMBIE; + // TODO: is this safe? + p->exit_state = EXIT_ZOMBIE; /* * If this is not a detached task, notify the parent. * If it's still not detached after that, don't release @@ -1171,13 +1176,13 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, /* * This uses xchg to be atomic with the thread resuming and setting * it. It must also be done with the write lock held to prevent a - * race with the TASK_ZOMBIE case. + * race with the EXIT_ZOMBIE case. */ exit_code = xchg(&p->exit_code, 0); - if (unlikely(p->state >= TASK_ZOMBIE)) { + if (unlikely(p->exit_state >= EXIT_ZOMBIE)) { /* * The task resumed and then died. Let the next iteration - * catch it in TASK_ZOMBIE. Note that exit_code might + * catch it in EXIT_ZOMBIE. Note that exit_code might * already be zero here if it resumed and did _exit(0). * The task itself is dead and won't touch exit_code again; * other processors in this function are locked out. @@ -1228,6 +1233,74 @@ bail_ref: return retval; } +/* + * Handle do_wait work for one task in a live, non-stopped state. + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_continued(task_t *p, int noreap, + struct siginfo __user *infop, + int __user *stat_addr, struct rusage __user *ru) +{ + int retval; + pid_t pid; + uid_t uid; + + if (unlikely(!p->signal)) + return 0; + + if (p->signal->stop_state >= 0) + return 0; + + spin_lock_irq(&p->sighand->siglock); + if (p->signal->stop_state >= 0) { /* Re-check with the lock held. */ + spin_unlock_irq(&p->sighand->siglock); + return 0; + } + if (!noreap) + p->signal->stop_state = 0; + spin_unlock_irq(&p->sighand->siglock); + + pid = p->pid; + uid = p->uid; + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (!infop) { + retval = ru ? 
getrusage(p, RUSAGE_BOTH, ru) : 0; + put_task_struct(p); + if (!retval && stat_addr) + retval = put_user(0xffff, stat_addr); + if (!retval) + retval = p->pid; + } else { + retval = wait_noreap_copyout(p, pid, uid, + CLD_CONTINUED, SIGCONT, + infop, ru); + BUG_ON(retval == 0); + } + + return retval; +} + + +static inline int my_ptrace_child(struct task_struct *p) +{ + if (!(p->ptrace & PT_PTRACED)) + return 0; + if (!(p->ptrace & PT_ATTACHED)) + return 1; + /* + * This child was PTRACE_ATTACH'd. We should be seeing it only if + * we are the attacher. If we are the real parent, this is a race + * inside ptrace_attach. It is waiting for the tasklist_lock, + * which we have to switch the parent links, but has already set + * the flags in p->ptrace. + */ + return (p->parent != p->real_parent); +} + static long do_wait(pid_t pid, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { @@ -1256,12 +1329,12 @@ repeat: switch (p->state) { case TASK_TRACED: - if (!(p->ptrace & PT_PTRACED)) + if (!my_ptrace_child(p)) continue; /*FALLTHROUGH*/ case TASK_STOPPED: if (!(options & WUNTRACED) && - !(p->ptrace & PT_PTRACED)) + !my_ptrace_child(p)) continue; retval = wait_task_stopped(p, ret == 2, (options & WNOWAIT), @@ -1270,47 +1343,36 @@ repeat: if (retval != 0) /* He released the lock. */ goto end; break; - case TASK_ZOMBIE: - /* - * Eligible but we cannot release it yet: - */ - if (ret == 2) - goto check_continued; - if (!likely(options & WEXITED)) - continue; - retval = wait_task_zombie( - p, (options & WNOWAIT), - infop, stat_addr, ru); - if (retval != 0) /* He released the lock. */ - goto end; - break; - case TASK_DEAD: - continue; default: + // case EXIT_DEAD: + if (p->exit_state == EXIT_DEAD) + continue; + // case EXIT_ZOMBIE: + if (p->exit_state == EXIT_ZOMBIE) { + /* + * Eligible but we cannot release + * it yet: + */ + if (ret == 2) + goto check_continued; + if (!likely(options & WEXITED)) + continue; + retval = wait_task_zombie( + p, (options & WNOWAIT), + infop, stat_addr, ru); + /* He released the lock. */ + if (retval != 0) + goto end; + break; + } check_continued: if (!unlikely(options & WCONTINUED)) continue; - if (unlikely(!p->signal)) - continue; - spin_lock_irq(&p->sighand->siglock); - if (p->signal->stop_state < 0) { - pid_t pid; - uid_t uid; - - if (!(options & WNOWAIT)) - p->signal->stop_state = 0; - spin_unlock_irq(&p->sighand->siglock); - pid = p->pid; - uid = p->uid; - get_task_struct(p); - read_unlock(&tasklist_lock); - retval = wait_noreap_copyout(p, pid, - uid, CLD_CONTINUED, - SIGCONT, infop, ru); - BUG_ON(retval == 0); + retval = wait_task_continued( + p, (options & WNOWAIT), + infop, stat_addr, ru); + if (retval != 0) /* He released the lock. 
*/ goto end; - } - spin_unlock_irq(&p->sighand->siglock); break; } } @@ -1376,6 +1438,8 @@ asmlinkage long sys_waitid(int which, pid_t pid, struct siginfo __user *infop, int options, struct rusage __user *ru) { + long ret; + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) @@ -1398,15 +1462,26 @@ asmlinkage long sys_waitid(int which, pid_t pid, return -EINVAL; } - return do_wait(pid, options, infop, NULL, ru); + ret = do_wait(pid, options, infop, NULL, ru); + + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, int options, struct rusage __user *ru) { - if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) + long ret; + + if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; - return do_wait(pid, options | WEXITED, NULL, stat_addr, ru); + ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); + + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } #ifdef __ARCH_WANT_SYS_WAITPID diff --git a/kernel/fork.c b/kernel/fork.c index 8c7ba481c9a5..3020dccc548f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,7 +86,7 @@ EXPORT_SYMBOL(free_task); void __put_task_struct(struct task_struct *tsk) { - WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE))); + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); @@ -249,8 +249,8 @@ void __init fork_init(unsigned long mempages) if(max_threads < 20) max_threads = 20; - init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; - init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -762,8 +762,17 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; - if (f) + if (f) { get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. + */ + FD_CLR(open_files - i, newf->open_fds); + } *new_fds++ = f; } spin_unlock(&oldf->file_lock); @@ -872,6 +881,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + task_lock(current->group_leader); + memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); + task_unlock(current->group_leader); + return 0; } @@ -941,7 +954,7 @@ static task_t *copy_process(unsigned long clone_flags, retval = -EAGAIN; if (atomic_read(&p->user->processes) >= - p->rlim[RLIMIT_NPROC].rlim_cur) { + p->signal->rlim[RLIMIT_NPROC].rlim_cur) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->user != &root_user) goto bad_fork_free; @@ -992,7 +1005,7 @@ static task_t *copy_process(unsigned long clone_flags, p->utime = p->stime = 0; p->lock_depth = -1; /* -1 = no lock */ - p->start_time = get_jiffies_64(); + do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; p->io_context = NULL; p->io_wait = NULL; @@ -1049,6 +1062,7 @@ static task_t *copy_process(unsigned long clone_flags, /* ok, now we should be set up.. 
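The wait_task_continued() path and the WCONTINUED bit now accepted by sys_wait4() in the exit.c hunks above are what let a parent observe SIGCONT resumption through the ordinary wait interface. A minimal userspace sketch of that usage follows; it is not part of the patch, just an illustration of the semantics (stopped child reported under WUNTRACED, continued child reported with WIFCONTINUED() true):

    #include <stdio.h>
    #include <signal.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    int main(void)
    {
        int status;
        pid_t child = fork();

        if (child < 0)
            return 1;
        if (child == 0) {       /* child: just sit there */
            pause();
            _exit(0);
        }

        kill(child, SIGSTOP);
        waitpid(child, &status, WUNTRACED);
        if (WIFSTOPPED(status))
            printf("child stopped by signal %d\n", WSTOPSIG(status));

        kill(child, SIGCONT);
        waitpid(child, &status, WCONTINUED);    /* the new kernel-side path */
        if (WIFCONTINUED(status))
            printf("child continued\n");

        kill(child, SIGKILL);
        waitpid(child, &status, 0);             /* reap the zombie */
        return 0;
    }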
*/ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; + p->exit_state = 0; /* Perform scheduler related setup */ sched_fork(p); @@ -1146,7 +1160,8 @@ fork_out: bad_fork_cleanup_namespace: exit_namespace(p); bad_fork_cleanup_mm: - mmput(p->mm); + if (p->mm) + mmput(p->mm); bad_fork_cleanup_signal: exit_signal(p); bad_fork_cleanup_sighand: diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile new file mode 100644 index 000000000000..66d3be6e4da8 --- /dev/null +++ b/kernel/irq/Makefile @@ -0,0 +1,4 @@ + +obj-y := autoprobe.o handle.o manage.o spurious.o +obj-$(CONFIG_PROC_FS) += proc.o + diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c new file mode 100644 index 000000000000..16818726cd21 --- /dev/null +++ b/kernel/irq/autoprobe.c @@ -0,0 +1,188 @@ +/* + * linux/kernel/irq/autoprobe.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the interrupt probing code and driver APIs. + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/interrupt.h> + +/* + * Autodetection depends on the fact that any interrupt that + * comes in on to an unassigned handler will get stuck with + * "IRQ_WAITING" cleared and the interrupt disabled. + */ +static DECLARE_MUTEX(probe_sem); + +/** + * probe_irq_on - begin an interrupt autodetect + * + * Commence probing for an interrupt. The interrupts are scanned + * and a mask of potential interrupt lines is returned. + * + */ +unsigned long probe_irq_on(void) +{ + unsigned long val, delay; + irq_desc_t *desc; + unsigned int i; + + down(&probe_sem); + /* + * something may have generated an irq long ago and we want to + * flush such a longstanding irq before considering it as spurious. + */ + for (i = NR_IRQS-1; i > 0; i--) { + desc = irq_desc + i; + + spin_lock_irq(&desc->lock); + if (!irq_desc[i].action) + irq_desc[i].handler->startup(i); + spin_unlock_irq(&desc->lock); + } + + /* Wait for longstanding interrupts to trigger. */ + for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) + /* about 20ms delay */ barrier(); + + /* + * enable any unassigned irqs + * (we must startup again here because if a longstanding irq + * happened in the previous stage, it may have masked itself) + */ + for (i = NR_IRQS-1; i > 0; i--) { + desc = irq_desc + i; + + spin_lock_irq(&desc->lock); + if (!desc->action) { + desc->status |= IRQ_AUTODETECT | IRQ_WAITING; + if (desc->handler->startup(i)) + desc->status |= IRQ_PENDING; + } + spin_unlock_irq(&desc->lock); + } + + /* + * Wait for spurious interrupts to trigger + */ + for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) + /* about 100ms delay */ barrier(); + + /* + * Now filter out any obviously spurious interrupts + */ + val = 0; + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + /* It triggered already - consider it spurious. */ + if (!(status & IRQ_WAITING)) { + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } else + if (i < 32) + val |= 1 << i; + } + spin_unlock_irq(&desc->lock); + } + + return val; +} + +EXPORT_SYMBOL(probe_irq_on); + +/** + * probe_irq_mask - scan a bitmap of interrupt lines + * @val: mask of interrupts to consider + * + * Scan the interrupt lines and return a bitmap of active + * autodetect interrupts. The interrupt probe logic state + * is then returned to its previous value. 
+ * + * Note: we need to scan all the irq's even though we will + * only return autodetect irq numbers - just so that we reset + * them all to a known state. + */ +unsigned int probe_irq_mask(unsigned long val) +{ + unsigned int mask; + int i; + + mask = 0; + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + if (i < 16 && !(status & IRQ_WAITING)) + mask |= 1 << i; + + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } + spin_unlock_irq(&desc->lock); + } + up(&probe_sem); + + return mask & val; +} + +/** + * probe_irq_off - end an interrupt autodetect + * @val: mask of potential interrupts (unused) + * + * Scans the unused interrupt lines and returns the line which + * appears to have triggered the interrupt. If no interrupt was + * found then zero is returned. If more than one interrupt is + * found then minus the first candidate is returned to indicate + * their is doubt. + * + * The interrupt probe logic state is returned to its previous + * value. + * + * BUGS: When used in a module (which arguably shouldn't happen) + * nothing prevents two IRQ probe callers from overlapping. The + * results of this are non-optimal. + */ +int probe_irq_off(unsigned long val) +{ + int i, irq_found = 0, nr_irqs = 0; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + unsigned int status; + + spin_lock_irq(&desc->lock); + status = desc->status; + + if (status & IRQ_AUTODETECT) { + if (!(status & IRQ_WAITING)) { + if (!nr_irqs) + irq_found = i; + nr_irqs++; + } + desc->status = status & ~IRQ_AUTODETECT; + desc->handler->shutdown(i); + } + spin_unlock_irq(&desc->lock); + } + up(&probe_sem); + + if (nr_irqs > 1) + irq_found = -irq_found; + return irq_found; +} + +EXPORT_SYMBOL(probe_irq_off); + diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c new file mode 100644 index 000000000000..6f0ebc254e3e --- /dev/null +++ b/kernel/irq/handle.c @@ -0,0 +1,204 @@ +/* + * linux/kernel/irq/handle.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the core interrupt handling code. + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +#include "internals.h" + +/* + * Linux has a controller-independent interrupt architecture. + * Every controller has a 'controller-template', that is used + * by the main code to do the right thing. Each driver-visible + * interrupt source is transparently wired to the apropriate + * controller. Thus drivers need not be aware of the + * interrupt-controller. + * + * The code is designed to be easily extended with new/different + * interrupt controllers, without having to do assembly magic or + * having to touch the generic code. + * + * Controller mappings for all interrupt sources: + */ +irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { + [0 ... NR_IRQS-1] = { + .handler = &no_irq_type, + .lock = SPIN_LOCK_UNLOCKED + } +}; + +/* + * Generic 'no controller' code + */ +static void end_none(unsigned int irq) { } +static void enable_none(unsigned int irq) { } +static void disable_none(unsigned int irq) { } +static void shutdown_none(unsigned int irq) { } +static unsigned int startup_none(unsigned int irq) { return 0; } + +static void ack_none(unsigned int irq) +{ + /* + * 'what should we do if we get a hw irq event on an illegal vector'. 
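probe_irq_on(), probe_irq_mask() and probe_irq_off() above implement the classic ISA-style autoprobe sequence: enable the unassigned lines, make the card raise its interrupt, then ask which line fired. A hedged sketch of a caller is below; board_trigger_irq() and board_quiesce() are hypothetical hardware hooks, not functions from this patch:

    #include <linux/interrupt.h>    /* probe_irq_on(), probe_irq_off() */
    #include <linux/delay.h>
    #include <linux/errno.h>

    /* hypothetical hardware hooks, assumed to be provided elsewhere */
    extern void board_trigger_irq(void);
    extern void board_quiesce(void);

    static int board_probe_irq(void)
    {
        unsigned long mask;
        int irq;

        mask = probe_irq_on();      /* start autodetection */
        board_trigger_irq();        /* make the card assert its line */
        mdelay(10);
        board_quiesce();            /* stop it from re-triggering */
        irq = probe_irq_off(mask);  /* >0: found, 0: none, <0: ambiguous */

        return irq > 0 ? irq : -ENODEV;
    }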
+ * each architecture has to answer this themself. + */ + ack_bad_irq(irq); +} + +struct hw_interrupt_type no_irq_type = { + .typename = "none", + .startup = startup_none, + .shutdown = shutdown_none, + .enable = enable_none, + .disable = disable_none, + .ack = ack_none, + .end = end_none, + .set_affinity = NULL +}; + +/* + * Special, empty irq handler: + */ +irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) +{ + return IRQ_NONE; +} + +/* + * Exit an interrupt context. Process softirqs if needed and possible: + */ +void irq_exit(void) +{ + preempt_count() -= IRQ_EXIT_OFFSET; + if (!in_interrupt() && local_softirq_pending()) + do_softirq(); + preempt_enable_no_resched(); +} + +/* + * Have got an event to handle: + */ +asmlinkage int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, + struct irqaction *action) +{ + int ret, retval = 0, status = 0; + + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + + do { + ret = action->handler(irq, action->dev_id, regs); + if (ret == IRQ_HANDLED) + status |= action->flags; + retval |= ret; + action = action->next; + } while (action); + + if (status & SA_SAMPLE_RANDOM) + add_interrupt_randomness(irq); + local_irq_disable(); + + return retval; +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +asmlinkage unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) +{ + irq_desc_t *desc = irq_desc + irq; + struct irqaction * action; + unsigned int status; + + kstat_this_cpu.irqs[irq]++; + if (desc->status & IRQ_PER_CPU) { + irqreturn_t action_ret; + + /* + * No locking required for CPU-local interrupts: + */ + desc->handler->ack(irq); + action_ret = handle_IRQ_event(irq, regs, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + desc->handler->end(irq); + return 1; + } + + spin_lock(&desc->lock); + desc->handler->ack(irq); + /* + * REPLAY is when Linux resends an IRQ that was dropped earlier + * WAITING is used by probe to mark irqs that are being tested + */ + status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); + status |= IRQ_PENDING; /* we _want_ to handle it */ + + /* + * If the IRQ is disabled for whatever reason, we cannot + * use the action we have. + */ + action = NULL; + if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { + action = desc->action; + status &= ~IRQ_PENDING; /* we commit to handling */ + status |= IRQ_INPROGRESS; /* we are handling it */ + } + desc->status = status; + + /* + * If there is no IRQ handler or it was disabled, exit early. + * Since we set PENDING, if another processor is handling + * a different instance of this same irq, the other processor + * will take care of it. + */ + if (unlikely(!action)) + goto out; + + /* + * Edge triggered interrupts need to remember + * pending events. + * This applies to any hw interrupts that allow a second + * instance of the same irq to arrive while we are in do_IRQ + * or in the handler. But the code here only handles the _second_ + * instance of the irq, not the third or fourth. So it is mostly + * useful for irq hardware that does not mask cleanly in an + * SMP environment. 
+ */ + for (;;) { + irqreturn_t action_ret; + + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + + spin_lock(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + if (likely(!(desc->status & IRQ_PENDING))) + break; + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; + +out: + /* + * The ->end() handler has to deal with interrupts which got + * disabled while the handler was running. + */ + desc->handler->end(irq); + spin_unlock(&desc->lock); + + return 1; +} + diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h new file mode 100644 index 000000000000..46feba630266 --- /dev/null +++ b/kernel/irq/internals.h @@ -0,0 +1,18 @@ +/* + * IRQ subsystem internal functions and variables: + */ + +extern int noirqdebug; + +#ifdef CONFIG_PROC_FS +extern void register_irq_proc(unsigned int irq); +extern void register_handler_proc(unsigned int irq, struct irqaction *action); +extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); +#else +static inline void register_irq_proc(unsigned int irq) { } +static inline void register_handler_proc(unsigned int irq, + struct irqaction *action) { } +static inline void unregister_handler_proc(unsigned int irq, + struct irqaction *action) { } +#endif + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c new file mode 100644 index 000000000000..a164fe243338 --- /dev/null +++ b/kernel/irq/manage.c @@ -0,0 +1,347 @@ +/* + * linux/kernel/irq/manage.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains driver APIs to the irq subsystem. + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/interrupt.h> + +#include "internals.h" + +#ifdef CONFIG_SMP + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); +} + +EXPORT_SYMBOL(synchronize_irq); + +#endif + +/** + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and Enables are + * nested. + * Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. + * + * This function may be called from IRQ context. + */ +void disable_irq_nosync(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->depth++) { + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +EXPORT_SYMBOL(disable_irq_nosync); + +/** + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are + * nested. + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. 
+ */ +void disable_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + + disable_irq_nosync(irq); + if (desc->action) + synchronize_irq(irq); +} + +EXPORT_SYMBOL(disable_irq); + +/** + * enable_irq - enable handling of an irq + * @irq: Interrupt to enable + * + * Undoes the effect of one call to disable_irq(). If this + * matches the last disable, processing of interrupts on this + * IRQ line is re-enabled. + * + * This function may be called from IRQ context. + */ +void enable_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + switch (desc->depth) { + case 0: + WARN_ON(1); + break; + case 1: { + unsigned int status = desc->status & ~IRQ_DISABLED; + + desc->status = status; + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status = status | IRQ_REPLAY; + hw_resend_irq(desc->handler,irq); + } + desc->handler->enable(irq); + /* fall-through */ + } + default: + desc->depth--; + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +EXPORT_SYMBOL(enable_irq); + +/* + * Internal function that tells the architecture code whether a + * particular irq has been exclusively allocated or is available + * for driver use. + */ +int can_request_irq(unsigned int irq, unsigned long irqflags) +{ + struct irqaction *action; + + if (irq >= NR_IRQS) + return 0; + + action = irq_desc[irq].action; + if (action) + if (irqflags & action->flags & SA_SHIRQ) + action = NULL; + + return !action; +} + +/* + * Internal function to register an irqaction - typically used to + * allocate special interrupts that are part of the architecture. + */ +int setup_irq(unsigned int irq, struct irqaction * new) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *old, **p; + unsigned long flags; + int shared = 0; + + if (desc->handler == &no_irq_type) + return -ENOSYS; + /* + * Some drivers like serial.c use request_irq() heavily, + * so we have to be careful not to interfere with a + * running system. + */ + if (new->flags & SA_SAMPLE_RANDOM) { + /* + * This function might sleep, we want to call it first, + * outside of the atomic block. + * Yes, this might clear the entropy pool if the wrong + * driver is attempted to be loaded, without actually + * installing a new handler, but is this really a problem, + * only the sysadmin is able to do this. + */ + rand_initialize_irq(irq); + } + + /* + * The following block of code has to be executed atomically + */ + spin_lock_irqsave(&desc->lock,flags); + p = &desc->action; + if ((old = *p) != NULL) { + /* Can't share interrupts unless both agree to */ + if (!(old->flags & new->flags & SA_SHIRQ)) { + spin_unlock_irqrestore(&desc->lock,flags); + return -EBUSY; + } + + /* add new interrupt at end of irq queue */ + do { + p = &old->next; + old = *p; + } while (old); + shared = 1; + } + + *p = new; + + if (!shared) { + desc->depth = 0; + desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | + IRQ_WAITING | IRQ_INPROGRESS); + if (desc->handler->startup) + desc->handler->startup(irq); + else + desc->handler->enable(irq); + } + spin_unlock_irqrestore(&desc->lock,flags); + + new->irq = irq; + register_irq_proc(irq); + new->dir = NULL; + register_handler_proc(irq, new); + + return 0; +} + +/** + * free_irq - free an interrupt + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the + * interrupt line is no longer in use by any driver it is disabled. 
+ * On a shared IRQ the caller must ensure the interrupt is disabled + * on the card it drives before calling this function. The function + * does not return until any executing interrupts for this IRQ + * have completed. + * + * This function must not be called from interrupt context. + */ +void free_irq(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc; + struct irqaction **p; + unsigned long flags; + + if (irq >= NR_IRQS) + return; + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock,flags); + p = &desc->action; + for (;;) { + struct irqaction * action = *p; + + if (action) { + struct irqaction **pp = p; + + p = &action->next; + if (action->dev_id != dev_id) + continue; + + /* Found it - now remove it from the list of entries */ + *pp = action->next; + if (!desc->action) { + desc->status |= IRQ_DISABLED; + if (desc->handler->shutdown) + desc->handler->shutdown(irq); + else + desc->handler->disable(irq); + } + spin_unlock_irqrestore(&desc->lock,flags); + unregister_handler_proc(irq, action); + + /* Make sure it's not being used on another CPU */ + synchronize_irq(irq); + kfree(action); + return; + } + printk(KERN_ERR "Trying to free free IRQ%d\n",irq); + spin_unlock_irqrestore(&desc->lock,flags); + return; + } +} + +EXPORT_SYMBOL(free_irq); + +/** + * request_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. From the point this + * call is made your handler function may be invoked. Since + * your handler function must clear any interrupt the board + * raises, you must take care both to initialise your hardware + * and to set up the interrupt handler in the right order. + * + * Dev_id must be globally unique. Normally the address of the + * device data structure is used as the cookie. Since the handler + * receives this value it makes sense to use it. + * + * If your interrupt is shared you must pass a non NULL dev_id + * as this is required when freeing the interrupt. + * + * Flags: + * + * SA_SHIRQ Interrupt is shared + * SA_INTERRUPT Disable local interrupts while processing + * SA_SAMPLE_RANDOM The interrupt can be used for entropy + * + */ +int request_irq(unsigned int irq, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, const char * devname, void *dev_id) +{ + struct irqaction * action; + int retval; + + /* + * Sanity-check: shared interrupts must pass in a real dev-ID, + * otherwise we'll have trouble later trying to figure out + * which interrupt is which (messes up the interrupt freeing + * logic etc). 
+ */ + if ((irqflags & SA_SHIRQ) && !dev_id) + return -EINVAL; + if (irq >= NR_IRQS) + return -EINVAL; + if (!handler) + return -EINVAL; + + action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = irqflags; + cpus_clear(action->mask); + action->name = devname; + action->next = NULL; + action->dev_id = dev_id; + + retval = setup_irq(irq, action); + if (retval) + kfree(action); + + return retval; +} + +EXPORT_SYMBOL(request_irq); + diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c new file mode 100644 index 000000000000..2ddbe8404c9c --- /dev/null +++ b/kernel/irq/proc.c @@ -0,0 +1,156 @@ +/* + * linux/kernel/irq/proc.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the /proc/irq/ handling code. + */ + +#include <linux/irq.h> +#include <linux/proc_fs.h> +#include <linux/interrupt.h> + +static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; + +#ifdef CONFIG_SMP + +/* + * The /proc/irq/<irq>/smp_affinity values: + */ +static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; + +cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; + +static int irq_affinity_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); + + if (count - len < 2) + return -EINVAL; + len += sprintf(page + len, "\n"); + return len; +} + +int no_irq_affinity; +static int irq_affinity_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int irq = (int)(long)data, full_count = count, err; + cpumask_t new_value, tmp; + + if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) + return -EIO; + + err = cpumask_parse(buffer, count, new_value); + if (err) + return err; + + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. 
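For reference, the request_irq()/free_irq() pair documented above is used from a driver roughly as follows. This is a sketch, not code from the patch: struct foo_dev, foo_dev_raised_irq() and FOO_IRQ are made-up stand-ins, and the handler prototype (with struct pt_regs *) is the 2.6-era form this code expects:

    #include <linux/interrupt.h>
    #include <linux/signal.h>       /* SA_SHIRQ */
    #include <linux/errno.h>

    #define FOO_IRQ 9               /* hypothetical interrupt line */

    struct foo_dev { int dummy; };

    /* hypothetical: ask the hardware whether it raised this interrupt */
    extern int foo_dev_raised_irq(struct foo_dev *dev);

    static irqreturn_t foo_isr(int irq, void *dev_id, struct pt_regs *regs)
    {
        struct foo_dev *dev = dev_id;

        if (!foo_dev_raised_irq(dev))
            return IRQ_NONE;        /* shared line, not our device */
        /* ... acknowledge and handle the event ... */
        return IRQ_HANDLED;
    }

    static int foo_attach(struct foo_dev *dev)
    {
        /* SA_SHIRQ requires a non-NULL dev_id so free_irq() can find us */
        return request_irq(FOO_IRQ, foo_isr, SA_SHIRQ, "foo", dev);
    }

    static void foo_detach(struct foo_dev *dev)
    {
        free_irq(FOO_IRQ, dev);     /* waits for any running handler */
    }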
+ */ + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) + return -EINVAL; + + irq_affinity[irq] = new_value; + irq_desc[irq].handler->set_affinity(irq, + cpumask_of_cpu(first_cpu(new_value))); + + return full_count; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_dir[irq] || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_dir[irq]); +} + +#undef MAX_NAMELEN + +#define MAX_NAMELEN 10 + +void register_irq_proc(unsigned int irq) +{ + char name [MAX_NAMELEN]; + + if (!root_irq_dir || + (irq_desc[irq].handler == &no_irq_type) || + irq_dir[irq]) + return; + + memset(name, 0, MAX_NAMELEN); + sprintf(name, "%d", irq); + + /* create /proc/irq/1234 */ + irq_dir[irq] = proc_mkdir(name, root_irq_dir); + +#ifdef CONFIG_SMP + { + struct proc_dir_entry *entry; + + /* create /proc/irq/<irq>/smp_affinity */ + entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); + + if (entry) { + entry->nlink = 1; + entry->data = (void *)(long)irq; + entry->read_proc = irq_affinity_read_proc; + entry->write_proc = irq_affinity_write_proc; + } + smp_affinity_entry[irq] = entry; + } +#endif +} + +#undef MAX_NAMELEN + +void unregister_handler_proc(unsigned int irq, struct irqaction *action) +{ + if (action->dir) + remove_proc_entry(action->dir->name, irq_dir[irq]); +} + +void init_irq_proc(void) +{ + int i; + + /* create /proc/irq */ + root_irq_dir = proc_mkdir("irq", NULL); + if (!root_irq_dir) + return; + + /* + * Create entries for all existing IRQs. + */ + for (i = 0; i < NR_IRQS; i++) + register_irq_proc(i); +} + diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c new file mode 100644 index 000000000000..f6297c306905 --- /dev/null +++ b/kernel/irq/spurious.c @@ -0,0 +1,96 @@ +/* + * linux/kernel/irq/spurious.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains spurious interrupt handling. + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/interrupt.h> + +/* + * If 99,900 of the previous 100,000 interrupts have not been handled + * then assume that the IRQ is stuck in some manner. Drop a diagnostic + * and try to turn the IRQ off. 
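The proc handlers above expose a writable hex cpumask at /proc/irq/<irq>/smp_affinity; in this version the write path steers the line to the first CPU of the mask via handler->set_affinity(). A throwaway userspace sketch of driving it (the IRQ number 19 and the mask value are made up):

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical IRQ; mask "1" = bit 0 = CPU 0 */
        FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

        if (!f) {
            perror("/proc/irq/19/smp_affinity");
            return 1;
        }
        fprintf(f, "1\n");
        return fclose(f) ? 1 : 0;
    }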
+ * + * (The other 100-of-100,000 interrupts may have been a correctly + * functioning device sharing an IRQ with the failing one) + * + * Called under desc->lock + */ + +static void +__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + struct irqaction *action; + + if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { + printk(KERN_ERR "irq event %d: bogus return value %x\n", + irq, action_ret); + } else { + printk(KERN_ERR "irq %d: nobody cared!\n", irq); + } + dump_stack(); + printk(KERN_ERR "handlers:\n"); + action = desc->action; + while (action) { + printk(KERN_ERR "[<%p>]", action->handler); + print_symbol(" (%s)", + (unsigned long)action->handler); + printk("\n"); + action = action->next; + } +} + +void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + static int count = 100; + + if (count > 0) { + count--; + __report_bad_irq(irq, desc, action_ret); + } +} + +void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +{ + if (action_ret != IRQ_HANDLED) { + desc->irqs_unhandled++; + if (action_ret != IRQ_NONE) + report_bad_irq(irq, desc, action_ret); + } + + desc->irq_count++; + if (desc->irq_count < 100000) + return; + + desc->irq_count = 0; + if (desc->irqs_unhandled > 99900) { + /* + * The interrupt is stuck + */ + __report_bad_irq(irq, desc, action_ret); + /* + * Now kill the IRQ + */ + printk(KERN_EMERG "Disabling IRQ #%d\n", irq); + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + desc->irqs_unhandled = 0; +} + +int noirqdebug; + +int __init noirqdebug_setup(char *str) +{ + noirqdebug = 1; + printk(KERN_INFO "IRQ lockup detection disabled\n"); + return 1; +} + +__setup("noirqdebug", noirqdebug_setup); + diff --git a/kernel/itimer.c b/kernel/itimer.c index 6918cb7460a8..95fbf1c6becf 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/smp_lock.h> #include <linux/interrupt.h> +#include <linux/syscalls.h> #include <linux/time.h> #include <asm/uaccess.h> diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 01436a31c690..ca4e28b4c6a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -25,6 +25,8 @@ * hlists and exceptions notifier as suggested by Andi Kleen. * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes * interface to access function arguments. + * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes + * exceptions notifier to be first on the priority list. 
*/ #include <linux/kprobes.h> #include <linux/spinlock.h> @@ -108,6 +110,7 @@ void unregister_kprobe(struct kprobe *p) static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, + .priority = 0x7fffffff /* we need to notified first */ }; int register_jprobe(struct jprobe *jp) diff --git a/kernel/panic.c b/kernel/panic.c index b3abe97f88a6..fce7f4030d0a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -16,7 +16,6 @@ #include <linux/notifier.h> #include <linux/init.h> #include <linux/sysrq.h> -#include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/nmi.h> diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ef5c42101748..c2dc4f89da41 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -43,6 +43,7 @@ #include <linux/compiler.h> #include <linux/idr.h> #include <linux/posix-timers.h> +#include <linux/syscalls.h> #include <linux/wait.h> #include <linux/workqueue.h> @@ -219,11 +220,6 @@ static __init int init_posix_timers(void) .clock_set = do_posix_clock_monotonic_settime }; -#ifdef CONFIG_TIME_INTERPOLATION - /* Clocks are more accurate with time interpolators */ - clock_realtime.res = clock_monotonic.res = time_interpolator_resolution(); -#endif - register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); diff --git a/kernel/power/console.c b/kernel/power/console.c index 00b390d7a5ad..7ff375e7c95f 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -11,7 +11,9 @@ static int new_loglevel = 10; static int orig_loglevel; +#ifdef SUSPEND_CONSOLE static int orig_fgconsole, orig_kmsg; +#endif int pm_prepare_console(void) { @@ -50,6 +52,7 @@ void pm_restore_console(void) acquire_console_sem(); set_console(orig_fgconsole); release_console_sem(); + kmsg_redirect = orig_kmsg; #endif return; } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 312aa169c566..4cc81e03085c 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -85,10 +85,20 @@ static int in_suspend __nosavedata = 0; static void free_some_memory(void) { - printk("Freeing memory: "); - while (shrink_all_memory(10000)) - printk("."); - printk("|\n"); + unsigned int i = 0; + unsigned int tmp; + unsigned long pages = 0; + char *p = "-\\|/"; + + printk("Freeing memory... "); + while ((tmp = shrink_all_memory(10000))) { + pages += tmp; + printk("\b%c", p[i]); + i++; + if (i > 3) + i = 0; + } + printk("\bdone (%li pages freed)\n", pages); } diff --git a/kernel/power/process.c b/kernel/power/process.c index bda013de59a5..78d92dc6a1ed 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -23,8 +23,8 @@ static inline int freezeable(struct task_struct * p) { if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->state == TASK_ZOMBIE) || - (p->state == TASK_DEAD) || + (p->exit_state == EXIT_ZOMBIE) || + (p->exit_state == EXIT_DEAD) || (p->state == TASK_STOPPED) || (p->state == TASK_TRACED)) return 0; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f65d295478c4..375299c86c45 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -294,15 +294,19 @@ static int data_write(void) { int error = 0; int i; + unsigned int mod = nr_copy_pages / 100; - printk( "Writing data to swap (%d pages): ", nr_copy_pages ); + if (!mod) + mod = 1; + + printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); for (i = 0; i < nr_copy_pages && !error; i++) { - if (!(i%100)) - printk( "." 
); + if (!(i%mod)) + printk( "\b\b\b\b%3d%%", i / mod ); error = write_page((pagedir_nosave+i)->address, &((pagedir_nosave+i)->swap_address)); } - printk(" %d Pages done.\n",i); + printk("\b\b\b\bdone\n"); return error; } @@ -856,7 +860,9 @@ int swsusp_suspend(void) local_irq_disable(); save_processor_state(); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ restore_processor_state(); + restore_highmem(); local_irq_enable(); return error; } @@ -876,8 +882,13 @@ int swsusp_resume(void) { int error; local_irq_disable(); + /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = swsusp_arch_resume(); + /* Code below is only ever reached in case of failure. Otherwise + * execution continues at place where swsusp_arch_suspend was called + */ + BUG_ON(!error); restore_processor_state(); restore_highmem(); local_irq_enable(); @@ -1134,14 +1145,18 @@ static int __init data_read(void) struct pbe * p; int error; int i; + int mod = nr_copy_pages / 100; + + if (!mod) + mod = 1; if ((error = swsusp_pagedir_relocate())) return error; - printk( "Reading image data (%d pages): ", nr_copy_pages ); + printk( "Reading image data (%d pages): ", nr_copy_pages ); for(i = 0, p = pagedir_nosave; i < nr_copy_pages && !error; i++, p++) { - if (!(i%100)) - printk( "." ); + if (!(i%mod)) + printk( "\b\b\b\b%3d%%", i / mod ); error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); } diff --git a/kernel/printk.c b/kernel/printk.c index 18dba205c171..c02ec626f384 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -30,6 +30,7 @@ #include <linux/smp.h> #include <linux/security.h> #include <linux/bootmem.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> diff --git a/kernel/profile.c b/kernel/profile.c index cab14764cea0..1c4375fad923 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -22,14 +22,14 @@ static int __init profile_setup(char * str) int par; if (!strncmp(str, "schedule", 8)) { - prof_on = 2; + prof_on = SCHED_PROFILING; printk(KERN_INFO "kernel schedule profiling enabled\n"); if (str[7] == ',') str += 8; } if (get_option(&str,&par)) { prof_shift = par; - prof_on = 1; + prof_on = CPU_PROFILING; printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", prof_shift); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b14b4a467729..c3ac348e2368 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -82,7 +82,8 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ read_lock(&tasklist_lock); if ((child->ptrace & PT_PTRACED) && child->parent == current && - child->signal != NULL) { + (!(child->ptrace & PT_ATTACHED) || child->real_parent != current) + && child->signal != NULL) { ret = 0; spin_lock_irq(&child->sighand->siglock); if (child->state == TASK_STOPPED) { @@ -131,7 +132,7 @@ int ptrace_attach(struct task_struct *task) goto bad; /* Go */ - task->ptrace |= PT_PTRACED; + task->ptrace |= PT_PTRACED | PT_ATTACHED; if (capable(CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; task_unlock(task); @@ -162,7 +163,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) write_lock_irq(&tasklist_lock); __ptrace_unlink(child); /* .. and wake it up. 
*/ - if (child->state != TASK_ZOMBIE) + if (child->exit_state != EXIT_ZOMBIE) wake_up_process(child); write_unlock_irq(&tasklist_lock); diff --git a/kernel/sched.c b/kernel/sched.c index 9b395e8d616d..8e5e2af64509 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -42,6 +42,7 @@ #include <linux/percpu.h> #include <linux/kthread.h> #include <linux/seq_file.h> +#include <linux/syscalls.h> #include <linux/times.h> #include <asm/tlb.h> @@ -180,17 +181,8 @@ static unsigned int task_timeslice(task_t *p) else return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); } -#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) - -enum idle_type -{ - IDLE, - NOT_IDLE, - NEWLY_IDLE, - MAX_IDLE_TYPES -}; - -struct sched_domain; +#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) /* * These are the runqueue data structures: @@ -289,140 +281,6 @@ struct runqueue { static DEFINE_PER_CPU(struct runqueue, runqueues); -/* - * sched-domains (multiprocessor balancing) declarations: - */ -#ifdef CONFIG_SMP -#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ - -#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 2 /* Balance on exec */ -#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */ - -struct sched_group { - struct sched_group *next; /* Must be a circular list */ - cpumask_t cpumask; - - /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. This should be read only (except for setup). Although - * it will need to be written to at cpu hot(un)plug time, perhaps the - * cpucontrol semaphore will provide enough exclusion? - */ - unsigned long cpu_power; -}; - -struct sched_domain { - /* These fields must be setup */ - struct sched_domain *parent; /* top domain must be null terminated */ - struct sched_group *groups; /* the balancing groups of the domain */ - cpumask_t span; /* span of all CPUs in this domain */ - unsigned long min_interval; /* Minimum balance interval ms */ - unsigned long max_interval; /* Maximum balance interval ms */ - unsigned int busy_factor; /* less balancing by factor if busy */ - unsigned int imbalance_pct; /* No balance until over watermark */ - unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ - unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ - int flags; /* See SD_* */ - - /* Runtime fields. */ - unsigned long last_balance; /* init to jiffies. units in jiffies */ - unsigned int balance_interval; /* initialise to 1. units in ms. 
*/ - unsigned int nr_balance_failed; /* initialise to 0 */ - -#ifdef CONFIG_SCHEDSTATS - /* load_balance() stats */ - unsigned long lb_cnt[MAX_IDLE_TYPES]; - unsigned long lb_failed[MAX_IDLE_TYPES]; - unsigned long lb_imbalance[MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[MAX_IDLE_TYPES]; - - /* sched_balance_exec() stats */ - unsigned long sbe_attempts; - unsigned long sbe_pushed; - - /* try_to_wake_up() stats */ - unsigned long ttwu_wake_affine; - unsigned long ttwu_wake_balance; -#endif -}; - -#ifndef ARCH_HAS_SCHED_TUNE -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_WAKE_IDLE -/* Common values for SMT siblings */ -#define SD_SIBLING_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 8, \ - .imbalance_pct = 110, \ - .cache_hot_time = 0, \ - .cache_nice_tries = 0, \ - .per_cpu_gain = 25, \ - .flags = SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ - | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} -#endif - -/* Common values for CPUs */ -#define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_hot_time = (5*1000000/2), \ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - -/* Arch can override this macro in processor.h */ -#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT) -#define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_BALANCE_EXEC \ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} -#endif -#endif /* ARCH_HAS_SCHED_TUNE */ -#endif - - #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) @@ -501,7 +359,7 @@ static int show_schedstat(struct seq_file *seq, void *v) rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) seq_printf(seq, " %lu %lu", rq->pt_gained[itype], rq->pt_lost[itype]); seq_printf(seq, "\n"); @@ -513,7 +371,8 @@ static int show_schedstat(struct seq_file *seq, void *v) cpumask_scnprintf(mask_str, NR_CPUS, sd->span); seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) { + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; + itype++) { seq_printf(seq, " %lu %lu %lu %lu %lu", sd->lb_cnt[itype], sd->lb_failed[itype], @@ -1087,8 +946,7 @@ static int wake_idle(int cpu, task_t *p) if (!(sd->flags & SD_WAKE_IDLE)) return cpu; - cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); + cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) @@ -1123,14 +981,14 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) int cpu, this_cpu, success = 
0; unsigned long flags; long old_state; - runqueue_t *rq; + runqueue_t *rq, *old_rq; #ifdef CONFIG_SMP unsigned long load, this_load; struct sched_domain *sd; int new_cpu; #endif - rq = task_rq_lock(p, &flags); + old_rq = rq = task_rq_lock(p, &flags); schedstat_inc(rq, ttwu_cnt); old_state = p->state; if (!(old_state & state)) @@ -1225,7 +1083,7 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; + old_rq->nr_uninterruptible--; /* * Tasks on involuntary sleep don't earn * sleep_avg beyond just interactive state. @@ -1471,10 +1329,10 @@ static void finish_task_switch(task_t *prev) /* * A task struct has one reference for the use as "current". - * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls - * schedule one last time. The schedule call will never return, + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, * and the scheduled task must drop that reference. - * The test for TASK_ZOMBIE must occur while the runqueue locks are + * The test for EXIT_ZOMBIE must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. @@ -1640,8 +1498,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, min_cpu = UINT_MAX; min_load = ULONG_MAX; - cpus_and(mask, sd->span, cpu_online_map); - cpus_and(mask, mask, p->cpus_allowed); + cpus_and(mask, sd->span, p->cpus_allowed); for_each_cpu_mask(i, mask) { load = target_load(i); @@ -1893,7 +1750,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; do { - cpumask_t tmp; unsigned long load; int local_group; int i, nr_cpus = 0; @@ -1902,11 +1758,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Tally up the load of all CPUs in the group */ avg_load = 0; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i); @@ -2011,7 +1864,7 @@ nextgroup: out_balanced: if (busiest && (idle == NEWLY_IDLE || - (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) { + (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { *imbalance = 1; return busiest; } @@ -2025,13 +1878,11 @@ out_balanced: */ static runqueue_t *find_busiest_queue(struct sched_group *group) { - cpumask_t tmp; unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; - cpus_and(tmp, group->cpumask, cpu_online_map); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { load = source_load(i); if (load > max_load) { @@ -2232,18 +2083,13 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu) group = sd->groups; do { - cpumask_t tmp; runqueue_t *rq; int push_cpu = 0; if (group == busy_group) goto next_group; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (!cpus_weight(tmp)) - goto next_group; - - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { if (!idle_cpu(i)) goto next_group; push_cpu = i; @@ -2260,7 +2106,7 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu) if (unlikely(busiest == rq)) goto next_group; double_lock_balance(busiest, rq); - if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) { + if (move_tasks(rq, push_cpu, busiest, 1, sd, 
SCHED_IDLE)) { schedstat_inc(busiest, alb_lost); schedstat_inc(rq, alb_gained); } else { @@ -2304,9 +2150,13 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, this_rq->cpu_load = (old_load + this_load) / 2; for_each_domain(this_cpu, sd) { - unsigned long interval = sd->balance_interval; + unsigned long interval; - if (idle != IDLE) + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != SCHED_IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ @@ -2408,7 +2258,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - rebalance_tick(cpu, rq, IDLE); + rebalance_tick(cpu, rq, SCHED_IDLE); return; } if (TASK_NICE(p) > 0) @@ -2512,7 +2362,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) */ spin_unlock(&this_rq->lock); - cpus_and(sibling_map, sd->span, cpu_online_map); + sibling_map = sd->span; for_each_cpu_mask(i, sibling_map) spin_lock(&cpu_rq(i)->lock); @@ -2557,7 +2407,7 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) * wake_sleeping_dependent(): */ spin_unlock(&this_rq->lock); - cpus_and(sibling_map, sd->span, cpu_online_map); + sibling_map = sd->span; for_each_cpu_mask(i, sibling_map) spin_lock(&cpu_rq(i)->lock); cpu_clear(this_cpu, sibling_map); @@ -2639,12 +2489,15 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { if (unlikely(in_atomic())) { - printk(KERN_ERR "bad: scheduling while atomic!\n"); + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); dump_stack(); } } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); need_resched: preempt_disable(); @@ -2678,6 +2531,8 @@ need_resched: spin_lock_irq(&rq->lock); + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; /* * if entering off of a kernel preemption go straight * to picking the next task. @@ -2764,7 +2619,7 @@ switch_tasks: if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) prev->interactive_credit--; } - prev->timestamp = now; + prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); if (likely(prev != next)) { @@ -3221,7 +3076,6 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) policy != SCHED_NORMAL) goto out_unlock; } - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); /* * Valid priorities for SCHED_FIFO and SCHED_RR are @@ -4068,7 +3922,7 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk) struct runqueue *rq = cpu_rq(dead_cpu); /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(tsk->state != TASK_ZOMBIE && tsk->state != TASK_DEAD); + BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. */ BUG_ON(tsk->flags & PF_DEAD); @@ -4209,16 +4063,17 @@ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(kernel_flag); #ifdef CONFIG_SMP -/* Attach the domain 'sd' to 'cpu' as its base domain */ -static void cpu_attach_domain(struct sched_domain *sd, int cpu) +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. 
+ */ +void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) { migration_req_t req; unsigned long flags; runqueue_t *rq = cpu_rq(cpu); int local = 1; - lock_cpu_hotplug(); - spin_lock_irqsave(&rq->lock, flags); if (cpu == smp_processor_id() || !cpu_online(cpu)) { @@ -4237,128 +4092,10 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) wake_up_process(rq->migration_thread); wait_for_completion(&req.done); } - - unlock_cpu_hotplug(); } -/* - * To enable disjoint top-level NUMA domains, define SD_NODES_PER_DOMAIN - * in arch code. That defines the number of nearby nodes in a node's top - * level scheduling domain. - */ -#if defined(CONFIG_NUMA) && defined(SD_NODES_PER_DOMAIN) -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int __init find_next_best_node(int node, unsigned long *used_nodes) -{ - int i, n, val, min_val, best_node = 0; - - min_val = INT_MAX; - - for (i = 0; i < numnodes; i++) { - /* Start at @node */ - n = (node + i) % numnodes; - - /* Skip already used nodes */ - if (test_bit(n, used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, i); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - set_bit(best_node, used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. 
- */ -cpumask_t __init sched_domain_node_span(int node) -{ - int i; - cpumask_t span; - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - - cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); - - for (i = 0; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); - cpumask_t nodemask; - - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); - } - - return span; -} -#else /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */ -cpumask_t __init sched_domain_node_span(int node) -{ - return cpu_possible_map; -} -#endif /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */ - -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; -__init static int cpu_to_cpu_group(int cpu) -{ - return cpu; -} -#endif - -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; -__init static int cpu_to_phys_group(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - return first_cpu(cpu_sibling_map[cpu]); -#else - return cpu; -#endif -} - -#ifdef CONFIG_NUMA - -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -__init static int cpu_to_node_group(int cpu) -{ - return cpu_to_node(cpu); -} -#endif - -/* Groups for isolated scheduling domains */ -static struct sched_group sched_group_isolated[NR_CPUS]; - /* cpus with isolated domains */ -cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE; - -__init static int cpu_to_isolated_group(int cpu) -{ - return cpu; -} +cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4385,7 +4122,7 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -__init static void init_sched_build_groups(struct sched_group groups[], +void __devinit init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -4419,56 +4156,99 @@ __init static void init_sched_build_groups(struct sched_group groups[], last->next = first; } -__init static void arch_init_sched_domains(void) + +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __devinit arch_init_sched_domains(void); +extern void __devinit arch_destroy_sched_domains(void); +#else +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int __devinit cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int __devinit cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA + +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static int __devinit cpu_to_node_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) +/* + * The domains setup code relies on siblings not spanning + * multiple nodes. 
Make sure the architecture has a proper + * siblings map: + */ +static void check_sibling_maps(void) +{ + int i, j; + + for_each_online_cpu(i) { + for_each_cpu_mask(j, cpu_sibling_map[i]) { + if (cpu_to_node(i) != cpu_to_node(j)) { + printk(KERN_INFO "warning: CPU %d siblings map " + "to different node - isolating " + "them.\n", i); + cpu_sibling_map[i] = cpumask_of_cpu(i); + break; + } + } + } +} +#endif + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void __devinit arch_init_sched_domains(void) { int i; cpumask_t cpu_default_map; +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_possible_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); - /* Set up domains */ - for_each_cpu(i) { + /* + * Set up domains. Isolated domains just stay on the dummy domain. + */ + for_each_cpu_mask(i, cpu_default_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(nodemask, nodemask, cpu_default_map); - /* - * Set up isolated domains. - * Unlike those of other cpus, the domains and groups are - * single level, and span a single cpu. - */ - if (cpu_isset(i, cpu_isolated_map)) { -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); -#else - sd = &per_cpu(phys_domains, i); -#endif - group = cpu_to_isolated_group(i); - *sd = SD_CPU_INIT; - cpu_set(i, sd->span); - sd->balance_interval = INT_MAX; /* Don't balance */ - sd->flags = 0; /* Avoid WAKE_ */ - sd->groups = &sched_group_isolated[group]; - printk(KERN_INFO "Setting up cpu %d isolated.\n", i); - /* Single level, so continue with next cpu */ - continue; - } - #ifdef CONFIG_NUMA sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - /* FIXME: should be multilevel, in arch code */ - sd->span = sched_domain_node_span(i); - cpus_and(sd->span, sd->span, cpu_default_map); + sd->span = cpu_default_map; sd->groups = &sched_group_nodes[group]; #endif @@ -4476,11 +4256,7 @@ __init static void arch_init_sched_domains(void) sd = &per_cpu(phys_domains, i); group = cpu_to_phys_group(i); *sd = SD_CPU_INIT; -#ifdef CONFIG_NUMA sd->span = nodemask; -#else - sd->span = cpu_possible_map; -#endif sd->parent = p; sd->groups = &sched_group_phys[group]; @@ -4498,7 +4274,7 @@ __init static void arch_init_sched_domains(void) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu(i) { + for_each_online_cpu(i) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); if (i != first_cpu(this_sibling_map)) @@ -4509,16 +4285,6 @@ __init static void arch_init_sched_domains(void) } #endif - /* Set up isolated groups */ - for_each_cpu_mask(i, cpu_isolated_map) { - cpumask_t mask; - cpus_clear(mask); - cpu_set(i, mask); - init_sched_build_groups(sched_group_isolated, mask, - &cpu_to_isolated_group); - } - -#ifdef CONFIG_NUMA /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -4530,10 +4296,6 @@ __init static void arch_init_sched_domains(void) init_sched_build_groups(sched_group_phys, nodemask, &cpu_to_phys_group); } -#else - init_sched_build_groups(sched_group_phys, cpu_possible_map, - 
&cpu_to_phys_group); -#endif #ifdef CONFIG_NUMA /* Set up node groups */ @@ -4566,7 +4328,7 @@ __init static void arch_init_sched_domains(void) } /* Attach the domains */ - for_each_cpu(i) { + for_each_online_cpu(i) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -4577,21 +4339,29 @@ __init static void arch_init_sched_domains(void) } } -#undef SCHED_DOMAIN_DEBUG +#ifdef CONFIG_HOTPLUG_CPU +static void __devinit arch_destroy_sched_domains(void) +{ + /* Do nothing: everything is statically allocated. */ +} +#endif + +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +#define SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG -void sched_domain_debug(void) +static void sched_domain_debug(void) { int i; - for_each_cpu(i) { + for_each_online_cpu(i) { runqueue_t *rq = cpu_rq(i); struct sched_domain *sd; int level = 0; sd = rq->sd; - printk(KERN_DEBUG "CPU%d: %s\n", - i, (cpu_online(i) ? " online" : "offline")); + printk(KERN_DEBUG "CPU%d:\n", i); do { int j; @@ -4605,7 +4375,17 @@ void sched_domain_debug(void) printk(KERN_DEBUG); for (j = 0; j < level + 1; j++) printk(" "); - printk("domain %d: span %s\n", level, str); + printk("domain %d: ", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not balance"); + if (sd->parent) + printk(" ERROR !SD_LOAD_BALANCE domain has parent"); + printk("\n"); + break; + } + + printk("span %s\n", str); if (!cpu_isset(i, sd->span)) printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); @@ -4657,10 +4437,64 @@ void sched_domain_debug(void) #define sched_domain_debug() {} #endif +#ifdef CONFIG_SMP +/* + * Initial dummy domain for early boot and for hotplug cpu. Being static, + * it is initialized to zero, so all balancing flags are cleared which is + * what we want. + */ +static struct sched_domain sched_domain_dummy; +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Force a reinitialization of the sched domains hierarchy. The domains + * and groups cannot be updated in place without racing with the balancing + * code, so we temporarily attach all running cpus to a "dummy" domain + * which will prevent rebalancing while the sched domains are recalculated. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int i; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + for_each_online_cpu(i) + cpu_attach_domain(&sched_domain_dummy, i); + arch_destroy_sched_domains(); + return NOTIFY_OK; + + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + case CPU_DEAD: + /* + * Fall through and re-initialise the domains. 
+ */ + break; + default: + return NOTIFY_DONE; + } + + /* The hotplug lock is already held by cpu_up/cpu_down */ + arch_init_sched_domains(); + + sched_domain_debug(); + + return NOTIFY_OK; +} +#endif + void __init sched_init_smp(void) { + lock_cpu_hotplug(); arch_init_sched_domains(); sched_domain_debug(); + unlock_cpu_hotplug(); + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); } #else void __init sched_init_smp(void) @@ -4682,24 +4516,6 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; -#ifdef CONFIG_SMP - /* Set up an initial dummy domain for early boot */ - static struct sched_domain sched_domain_init; - static struct sched_group sched_group_init; - - memset(&sched_domain_init, 0, sizeof(struct sched_domain)); - sched_domain_init.span = CPU_MASK_ALL; - sched_domain_init.groups = &sched_group_init; - sched_domain_init.last_balance = jiffies; - sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ - sched_domain_init.busy_factor = 1; - - memset(&sched_group_init, 0, sizeof(struct sched_group)); - sched_group_init.cpumask = CPU_MASK_ALL; - sched_group_init.next = &sched_group_init; - sched_group_init.cpu_power = SCHED_LOAD_SCALE; -#endif - for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; @@ -4710,7 +4526,7 @@ void __init sched_init(void) rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP - rq->sd = &sched_domain_init; + rq->sd = &sched_domain_dummy; rq->cpu_load = 0; rq->active_balance = 0; rq->push_cpu = 0; diff --git a/kernel/signal.c b/kernel/signal.c index df2cd4216838..f67390806d73 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -20,6 +20,7 @@ #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/security.h> +#include <linux/syscalls.h> #include <linux/ptrace.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -269,7 +270,7 @@ static struct sigqueue *__sigqueue_alloc(void) struct sigqueue *q = NULL; if (atomic_read(¤t->user->sigpending) < - current->rlim[RLIMIT_SIGPENDING].rlim_cur) + current->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); if (q) { INIT_LIST_HEAD(&q->list); @@ -764,7 +765,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. 
*/ if (atomic_read(&t->user->sigpending) < - t->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); if (q) { @@ -913,7 +914,7 @@ __group_complete_signal(int sig, struct task_struct *p) * Don't bother zombies and stopped tasks (but * SIGKILL will punch through stopped state) */ - mask = TASK_DEAD | TASK_ZOMBIE | TASK_TRACED; + mask = EXIT_DEAD | EXIT_ZOMBIE | TASK_TRACED; if (sig != SIGKILL) mask |= TASK_STOPPED; @@ -1069,7 +1070,7 @@ void zap_other_threads(struct task_struct *p) /* * Don't bother with already dead threads */ - if (t->state & (TASK_ZOMBIE|TASK_DEAD)) + if (t->exit_state & (EXIT_ZOMBIE|EXIT_DEAD)) continue; /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 4a3da9be9f26..0e1e1356e35a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -137,11 +137,17 @@ EXPORT_SYMBOL(do_softirq); void local_bh_enable(void) { - __local_bh_enable(); WARN_ON(irqs_disabled()); - if (unlikely(!in_interrupt() && - local_softirq_pending())) + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + preempt_count() -= SOFTIRQ_OFFSET - 1; + + if (unlikely(!in_interrupt() && local_softirq_pending())) invoke_softirq(); + + dec_preempt_count(); preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable); diff --git a/kernel/sys.c b/kernel/sys.c index 571bba8c8989..a95e3900dc1e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -24,6 +24,11 @@ #include <linux/dcookies.h> #include <linux/suspend.h> +/* Don't include this - it breaks ia64's cond_syscall() implementation */ +#if 0 +#include <linux/syscalls.h> +#endif + #include <asm/uaccess.h> #include <asm/io.h> #include <asm/unistd.h> @@ -649,7 +654,7 @@ static int set_user(uid_t new_ruid, int dumpclear) return -EAGAIN; if (atomic_read(&new_user->processes) >= - current->rlim[RLIMIT_NPROC].rlim_cur && + current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != &root_user) { free_uid(new_user); return -EAGAIN; @@ -1496,9 +1501,13 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) { if (resource >= RLIM_NLIMITS) return -EINVAL; - else - return copy_to_user(rlim, current->rlim + resource, sizeof(*rlim)) - ? -EFAULT : 0; + else { + struct rlimit value; + task_lock(current->group_leader); + value = current->signal->rlim[resource]; + task_unlock(current->group_leader); + return copy_to_user(rlim, &value, sizeof(*rlim)) ? 
-EFAULT : 0; + } } #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT @@ -1513,7 +1522,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r if (resource >= RLIM_NLIMITS) return -EINVAL; - memcpy(&x, current->rlim + resource, sizeof(*rlim)); + task_lock(current->group_leader); + x = current->signal->rlim[resource]; + task_unlock(current->group_leader); if(x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; if(x.rlim_max > 0x7FFFFFFF) @@ -1534,21 +1545,20 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) return -EFAULT; if (new_rlim.rlim_cur > new_rlim.rlim_max) return -EINVAL; - old_rlim = current->rlim + resource; - if (((new_rlim.rlim_cur > old_rlim->rlim_max) || - (new_rlim.rlim_max > old_rlim->rlim_max)) && + old_rlim = current->signal->rlim + resource; + if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE) { - if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN) + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) return -EPERM; - } retval = security_task_setrlimit(resource, &new_rlim); if (retval) return retval; + task_lock(current->group_leader); *old_rlim = new_rlim; + task_unlock(current->group_leader); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99a0af0ed9a8..469cf0c2f26e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -40,6 +40,7 @@ #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> #include <asm/processor.h> diff --git a/kernel/time.c b/kernel/time.c index 167a72a078ea..0b4797fa3d30 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -31,6 +31,7 @@ #include <linux/timex.h> #include <linux/errno.h> #include <linux/smp_lock.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> #include <asm/unistd.h> diff --git a/kernel/timer.c b/kernel/timer.c index 36e40d8b182d..e3c9b5fcd52f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -31,11 +31,13 @@ #include <linux/time.h> #include <linux/jiffies.h> #include <linux/cpu.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> +#include <asm/io.h> #ifdef CONFIG_TIME_INTERPOLATION static void time_interpolator_update(long delta_nsec); @@ -788,13 +790,12 @@ static void update_wall_time(unsigned long ticks) do { ticks--; update_wall_time_one_tick(); + if (xtime.tv_nsec >= 1000000000) { + xtime.tv_nsec -= 1000000000; + xtime.tv_sec++; + second_overflow(); + } } while (ticks); - - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; - xtime.tv_sec++; - second_overflow(); - } } static inline void do_process_times(struct task_struct *p, @@ -804,12 +805,13 @@ static inline void do_process_times(struct task_struct *p, psecs = (p->utime += user); psecs += (p->stime += system); - if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_cur) { + if (p->signal && !unlikely(p->state & (EXIT_DEAD|EXIT_ZOMBIE)) && + psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) { /* Send SIGXCPU every second.. */ if (!(psecs % HZ)) send_sig(SIGXCPU, p, 1); /* and SIGKILL when we go over max.. 
*/ - if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_max) + if (psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_max) send_sig(SIGKILL, p, 1); } } @@ -1624,13 +1626,13 @@ EXPORT_SYMBOL(msleep); */ unsigned long msleep_interruptible(unsigned int msecs) { - unsigned long timeout = msecs_to_jiffies(msecs); + unsigned long timeout = msecs_to_jiffies(msecs); - while (timeout && !signal_pending(current)) { - set_current_state(TASK_INTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } - return jiffies_to_msecs(timeout); + while (timeout && !signal_pending(current)) { + set_current_state(TASK_INTERRUPTIBLE); + timeout = schedule_timeout(timeout); + } + return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); |
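The swap-writing and image-reading hunks above replace the dot-per-hundred-pages output with an in-place percentage drawn with backspaces, scaling by nr_copy_pages / 100 and falling back to 1 for images smaller than a hundred pages. A minimal user-space sketch of the same idiom, with a made-up page count and the per-page I/O elided:

#include <stdio.h>

int main(void)
{
	int nr_pages = 12345;		/* placeholder for nr_copy_pages */
	int mod = nr_pages / 100;
	int i;

	if (!mod)
		mod = 1;		/* images smaller than 100 pages */

	printf("Writing data (%d pages):     ", nr_pages);
	for (i = 0; i < nr_pages; i++) {
		if (!(i % mod)) {
			/* back over the previous 4 characters, redraw "%3d%%" */
			printf("\b\b\b\b%3d%%", i / mod);
			fflush(stdout);
		}
		/* ... write page i here ... */
	}
	printf("\b\b\b\bdone\n");
	return 0;
}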
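The reworked task_hot() above compares a signed 64-bit delta between "now" and the new p->last_ran stamp against cache_hot_time. The casts matter because per-CPU clocks are not perfectly synchronized, so a task's stamp can lie ahead of the local clock; an unsigned subtraction would then wrap to a huge value and make a just-run task look cache-cold. A stand-alone illustration with made-up numbers:

#include <stdio.h>

int main(void)
{
	/* made-up sched_clock()-style nanosecond timestamps */
	unsigned long long now = 1000;
	unsigned long long last_ran = 1200;	/* stamped on a CPU running slightly ahead */
	unsigned long long cache_hot_time = 2500000;

	/* unsigned delta wraps: the task wrongly looks cache-cold */
	printf("unsigned: hot=%d\n", (now - last_ran) < cache_hot_time);

	/* signed delta, as in the new task_hot(): negative means "just ran" */
	printf("signed:   hot=%d\n",
	       (long long)(now - last_ran) < (long long)cache_hot_time);
	return 0;
}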
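update_sched_domains() above parks every online CPU on the static dummy domain while the hierarchy is rebuilt around a hotplug event, driven by a callback registered with hotcpu_notifier(). A sketch of that notifier pattern for the 2.6 API of this period, assuming CONFIG_HOTPLUG_CPU; the callback name and the per-event work are placeholders, not part of the patch:

#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nfb,
			   unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_DOWN_PREPARE:
		/* detach shared state the balancing code could race with */
		return NOTIFY_OK;
	case CPU_UP_CANCELED:
	case CPU_DOWN_FAILED:
	case CPU_ONLINE:
	case CPU_DEAD:
		/* rebuild state for the new set of online cpus */
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init my_notifier_init(void)
{
	hotcpu_notifier(my_cpu_callback, 0);
	return 0;
}
__initcall(my_notifier_init);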
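The local_bh_enable() hunk above subtracts SOFTIRQ_OFFSET - 1 so that pending softirqs are processed with the softirq part of the count cleared (in_interrupt() is false) but preemption still held, and only then drops the final count. The arithmetic reduced to a stand-alone sketch, assuming the 2.6-era offsets (SOFTIRQ_OFFSET = 0x100, one preemption disable = 1):

#include <stdio.h>

#define PREEMPT_OFFSET	1
#define SOFTIRQ_OFFSET	0x100	/* assumed 2.6-era value */

int main(void)
{
	int count = SOFTIRQ_OFFSET;	/* state inside local_bh_disable() */

	/* step 1: clear the softirq bits but keep one preemption disable */
	count -= SOFTIRQ_OFFSET - PREEMPT_OFFSET;
	printf("running softirqs: count=0x%x (not preemptible)\n", count);

	/* step 2: after softirq processing, drop the last reference */
	count -= PREEMPT_OFFSET;
	printf("done: count=0x%x (preemptible again)\n", count);
	return 0;
}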
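The signal.c, sys.c and timer.c hunks above consistently read resource limits from current->signal->rlim, with updates serialized by task_lock() on the group leader, which makes limits per-process rather than per-task. A user-space check of that visible behaviour (build with -lpthread; the value 256 is arbitrary):

#include <pthread.h>
#include <stdio.h>
#include <sys/resource.h>

static void *reader(void *arg)
{
	struct rlimit rl;

	getrlimit(RLIMIT_NOFILE, &rl);
	printf("sibling thread sees RLIMIT_NOFILE cur=%lu\n",
	       (unsigned long)rl.rlim_cur);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct rlimit rl;

	getrlimit(RLIMIT_NOFILE, &rl);
	rl.rlim_cur = 256;		/* lowering the soft limit needs no privilege */
	setrlimit(RLIMIT_NOFILE, &rl);

	pthread_create(&t, NULL, reader, NULL);	/* should print 256 */
	pthread_join(t, NULL);
	return 0;
}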
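The update_wall_time() hunk above moves the nanosecond rollover, and with it the second_overflow() call, inside the per-tick loop, so that replaying a batch of lost ticks still accounts every crossed second boundary. A reduced model of that loop with an assumed 1 ms tick and an arbitrary starting offset:

#include <stdio.h>

#define NSEC_PER_TICK 1000000L	/* assumed 1 ms tick (HZ=1000) */

struct ts { long sec; long nsec; };

static int overflows;

static void second_overflow(void)	/* stands in for the NTP bookkeeping */
{
	overflows++;
}

/* per-tick rollover, as in the patched update_wall_time() */
static void update_wall_time(struct ts *xtime, unsigned long ticks)
{
	do {
		ticks--;
		xtime->nsec += NSEC_PER_TICK;
		if (xtime->nsec >= 1000000000L) {
			xtime->nsec -= 1000000000L;
			xtime->sec++;
			second_overflow();
		}
	} while (ticks);
}

int main(void)
{
	struct ts xtime = { .sec = 0, .nsec = 999000000L };

	update_wall_time(&xtime, 2500);	/* replay 2.5 s worth of lost ticks */
	printf("sec=%ld nsec=%ld overflows=%d\n",
	       xtime.sec, xtime.nsec, overflows);
	/* With the rollover outside the loop, nsec would end above 1e9 and
	 * second_overflow() would have run at most once. */
	return 0;
}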
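The do_process_times() hunk above enforces RLIMIT_CPU from the shared signal->rlim: past the soft limit, SIGXCPU is raised once per second of accumulated CPU time, and hitting the hard limit brings SIGKILL. A user-space demonstration of that contract; the 1 s soft / 3 s hard limits are arbitrary:

#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

static void on_xcpu(int sig)
{
	/* write() is async-signal-safe; printf() is not */
	static const char msg[] = "SIGXCPU: over the soft CPU limit\n";

	(void)sig;
	write(STDOUT_FILENO, msg, sizeof(msg) - 1);
}

int main(void)
{
	struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };	/* CPU seconds */

	signal(SIGXCPU, on_xcpu);
	setrlimit(RLIMIT_CPU, &rl);

	for (;;)
		;	/* spin: SIGXCPU after ~1 s, SIGKILL at the 3 s hard limit */
}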
