diff options
| author | Tony Luck <tony.luck@intel.com> | 2005-01-17 20:28:07 -0800 |
|---|---|---|
| committer | Tony Luck <tony.luck@intel.com> | 2005-01-17 20:28:07 -0800 |
| commit | 9790ee6f6a30db6b9fd4f27e437b87fa8e511cdd (patch) | |
| tree | 5283a41fba9fe6a85c0ed186ad0469dd0cc04323 /kernel | |
| parent | bcb39c06eefc908121323c23ead2fdd246585ad3 (diff) | |
| parent | fdefff6242ce95e350570a7f65e1ff49bc3a66b6 (diff) | |
Merge ia64 test tree back into release tree.
Diffstat (limited to 'kernel')
42 files changed, 1529 insertions, 1032 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 4a3bd224a836..32e39accbb86 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -528,3 +528,36 @@ void acct_process(long exitcode) do_acct_process(exitcode, file); fput(file); } + + +/* + * acct_update_integrals + * - update mm integral fields in task_struct + */ +void acct_update_integrals(void) +{ + struct task_struct *tsk = current; + + if (likely(tsk->mm)) { + long delta = tsk->stime - tsk->acct_stimexpd; + + if (delta == 0) + return; + tsk->acct_stimexpd = tsk->stime; + tsk->acct_rss_mem1 += delta * tsk->mm->rss; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + } +} + +/* + * acct_clear_integrals + * - clear the mm integral fields in task_struct + */ +void acct_clear_integrals(struct task_struct *tsk) +{ + if (tsk) { + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; + } +} diff --git a/kernel/audit.c b/kernel/audit.c index d813b7aa4b4c..e21f947bacf2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -98,8 +98,8 @@ static struct sock *audit_sock; * The second list is a list of pre-allocated audit buffers (if more * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). 
*/ -static spinlock_t audit_txlist_lock = SPIN_LOCK_UNLOCKED; -static spinlock_t audit_freelist_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(audit_txlist_lock); +static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count = 0; static LIST_HEAD(audit_txlist); static LIST_HEAD(audit_freelist); @@ -169,7 +169,7 @@ static inline int audit_rate_check(void) { static unsigned long last_check = 0; static int messages = 0; - static spinlock_t lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; unsigned long elapsed; @@ -199,7 +199,7 @@ static inline int audit_rate_check(void) void audit_log_lost(const char *message) { static unsigned long last_msg = 0; - static spinlock_t lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; int print; @@ -419,7 +419,7 @@ static int audit_receive_skb(struct sk_buff *skb) if (rlen > skb->len) rlen = skb->len; if ((err = audit_receive_msg(skb, nlh))) { - netlink_ack(skb, nlh, -err); + netlink_ack(skb, nlh, err); } else if (nlh->nlmsg_flags & NLM_F_ACK) netlink_ack(skb, nlh, 0); skb_pull(skb, rlen); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0962944e8357..aa617bbe49b0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -591,7 +591,7 @@ static void audit_log_exit(struct audit_context *context) if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); if (context->return_valid) - audit_log_format(ab, " exit=%u", context->return_code); + audit_log_format(ab, " exit=%d", context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " pid=%d loginuid=%d uid=%d gid=%d" diff --git a/kernel/capability.c b/kernel/capability.c index 7800a5066c0f..b828d545a97b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -23,7 +23,7 @@ EXPORT_SYMBOL(cap_bset); * This global lock protects task->cap_* for all tasks including current. 
* Locking rule: acquire this prior to tasklist_lock. */ -spinlock_t task_capability_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(task_capability_lock); /* * For sys_getproccap() and sys_setproccap(), any of the three @@ -85,34 +85,60 @@ out: * cap_set_pg - set capabilities for all processes in a given process * group. We call this holding task_capability_lock and tasklist_lock. */ -static inline void cap_set_pg(int pgrp, kernel_cap_t *effective, +static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { task_t *g, *target; + int ret = -EPERM; + int found = 0; do_each_task_pid(pgrp, PIDTYPE_PGID, g) { target = g; - while_each_thread(g, target) - security_capset_set(target, effective, inheritable, permitted); + while_each_thread(g, target) { + if (!security_capset_check(target, effective, + inheritable, + permitted)) { + security_capset_set(target, effective, + inheritable, + permitted); + ret = 0; + } + found = 1; + } } while_each_task_pid(pgrp, PIDTYPE_PGID, g); + + if (!found) + ret = 0; + return ret; } /* * cap_set_all - set capabilities for all processes other than init * and self. We call this holding task_capability_lock and tasklist_lock. 
*/ -static inline void cap_set_all(kernel_cap_t *effective, +static inline int cap_set_all(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { task_t *g, *target; + int ret = -EPERM; + int found = 0; do_each_thread(g, target) { if (target == current || target->pid == 1) continue; + found = 1; + if (security_capset_check(target, effective, inheritable, + permitted)) + continue; + ret = 0; security_capset_set(target, effective, inheritable, permitted); } while_each_thread(g, target); + + if (!found) + ret = 0; + return ret; } /* @@ -147,7 +173,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(pid, &header->pid)) return -EFAULT; - if (pid && !capable(CAP_SETPCAP)) + if (pid && pid != current->pid && !capable(CAP_SETPCAP)) return -EPERM; if (copy_from_user(&effective, &data->effective, sizeof(effective)) || @@ -167,36 +193,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) } else target = current; - ret = -EPERM; - - if (security_capset_check(target, &effective, &inheritable, &permitted)) - goto out; - - if (!cap_issubset(inheritable, cap_combine(target->cap_inheritable, - current->cap_permitted))) - goto out; - - /* verify restrictions on target's new Permitted set */ - if (!cap_issubset(permitted, cap_combine(target->cap_permitted, - current->cap_permitted))) - goto out; - - /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ - if (!cap_issubset(effective, permitted)) - goto out; - ret = 0; /* having verified that the proposed changes are legal, we now put them into effect. 
*/ if (pid < 0) { if (pid == -1) /* all procs other than current and init */ - cap_set_all(&effective, &inheritable, &permitted); + ret = cap_set_all(&effective, &inheritable, &permitted); else /* all procs in process group */ - cap_set_pg(-pid, &effective, &inheritable, &permitted); + ret = cap_set_pg(-pid, &effective, &inheritable, + &permitted); } else { - security_capset_set(target, &effective, &inheritable, &permitted); + ret = security_capset_check(target, &effective, &inheritable, + &permitted); + if (!ret) + security_capset_set(target, &effective, &inheritable, + &permitted); } out: diff --git a/kernel/compat.c b/kernel/compat.c index 672310635347..f14fbde52bb3 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -20,6 +20,7 @@ #include <linux/futex.h> /* for FUTEX_WAIT */ #include <linux/syscalls.h> #include <linux/unistd.h> +#include <linux/security.h> #include <asm/uaccess.h> @@ -162,15 +163,15 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) struct compat_tms tmp; struct task_struct *tsk = current; struct task_struct *t; - unsigned long utime, stime, cutime, cstime; + cputime_t utime, stime, cutime, cstime; read_lock(&tasklist_lock); utime = tsk->signal->utime; stime = tsk->signal->stime; t = tsk; do { - utime += t->utime; - stime += t->stime; + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); t = next_thread(t); } while (t != tsk); @@ -189,10 +190,10 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) spin_unlock_irq(&tsk->sighand->siglock); read_unlock(&tasklist_lock); - tmp.tms_utime = compat_jiffies_to_clock_t(utime); - tmp.tms_stime = compat_jiffies_to_clock_t(stime); - tmp.tms_cutime = compat_jiffies_to_clock_t(cutime); - tmp.tms_cstime = compat_jiffies_to_clock_t(cstime); + tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); + tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); + tmp.tms_cutime = 
compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); + tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } @@ -680,3 +681,128 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, return 0; } + +void +sigset_from_compat (sigset_t *set, compat_sigset_t *compat) +{ + switch (_NSIG_WORDS) { +#if defined (__COMPAT_ENDIAN_SWAP__) + case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); + case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); + case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); + case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); +#else + case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); + case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); + case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); + case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); +#endif + } +} + +asmlinkage long +compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, + struct compat_siginfo __user *uinfo, + struct compat_timespec __user *uts, compat_size_t sigsetsize) +{ + compat_sigset_t s32; + sigset_t s; + int sig; + struct timespec t; + siginfo_t info; + long ret, timeout = 0; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&s, &s32); + sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); + signotset(&s); + + if (uts) { + if (get_compat_timespec (&t, uts)) + return -EFAULT; + if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 + || t.tv_sec < 0) + return -EINVAL; + } + + spin_lock_irq(&current->sighand->siglock); + sig = dequeue_signal(current, &s, &info); + if (!sig) { + timeout = MAX_SCHEDULE_TIMEOUT; + if (uts) + timeout = timespec_to_jiffies(&t) + +(t.tv_sec || t.tv_nsec); + if (timeout) { + current->real_blocked = 
current->blocked; + sigandsets(&current->blocked, &current->blocked, &s); + + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + timeout = schedule_timeout(timeout); + + spin_lock_irq(&current->sighand->siglock); + sig = dequeue_signal(current, &s, &info); + current->blocked = current->real_blocked; + siginitset(&current->real_blocked, 0); + recalc_sigpending(); + } + } + spin_unlock_irq(&current->sighand->siglock); + + if (sig) { + ret = sig; + if (uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; + } + }else { + ret = timeout?-EINTR:-EAGAIN; + } + return ret; + +} + +#ifdef __ARCH_WANT_COMPAT_SYS_TIME + +/* compat_time_t is a 32 bit "long" and needs to get converted. */ + +asmlinkage long compat_sys_time(compat_time_t __user * tloc) +{ + compat_time_t i; + struct timeval tv; + + do_gettimeofday(&tv); + i = tv.tv_sec; + + if (tloc) { + if (put_user(i,tloc)) + i = -EFAULT; + } + return i; +} + +asmlinkage long compat_sys_stime(compat_time_t __user *tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ diff --git a/kernel/cpu.c b/kernel/cpu.c index ebaba873ebad..628f4ccda127 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -48,7 +48,9 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { - if (task_cpu(p) == cpu && (p->utime != 0 || p->stime != 0)) + if (task_cpu(p) == cpu && + (!cputime_eq(p->utime, cputime_zero) || + !cputime_eq(p->stime, cputime_zero))) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ (state = %ld, flags = %lx) \n", p->comm, p->pid, cpu, p->state, p->flags); @@ -132,7 +134,8 @@ int cpu_down(unsigned int cpu) __cpu_die(cpu); /* Move it here so it can run. 
*/ - kthread_bind(p, smp_processor_id()); + kthread_bind(p, get_cpu()); + put_cpu(); /* CPU is completely dead: tell everyone. Too late to complain. */ if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) diff --git a/kernel/dma.c b/kernel/dma.c index 940d02c50879..aef0a45b7893 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -38,7 +38,7 @@ */ -spinlock_t dma_spin_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(dma_spin_lock); /* * If our port doesn't define this it has no PC like DMA diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index ad3e5d54e119..867d6dbeb574 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -22,7 +22,7 @@ static void default_handler(int, struct pt_regs *); static struct exec_domain *exec_domains = &default_exec_domain; -static rwlock_t exec_domains_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(exec_domains_lock); static u_long ident_map[32] = { diff --git a/kernel/exit.c b/kernel/exit.c index 64bc9502cd1c..3171228f25c3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -159,7 +159,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task - || p->exit_state >= EXIT_ZOMBIE + || p->exit_state || p->real_parent->pid == 1) continue; if (process_group(p->real_parent) != pgrp @@ -332,7 +332,9 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); + down(&tty_sem); current->signal->tty = NULL; + up(&tty_sem); /* Block and flush all signals */ sigfillset(&blocked); @@ -470,7 +472,7 @@ EXPORT_SYMBOL_GPL(exit_fs); * Turn us into a lazy TLB process if we * aren't already.. 
*/ -static inline void __exit_mm(struct task_struct * tsk) +void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; @@ -506,18 +508,13 @@ static inline void __exit_mm(struct task_struct * tsk) mmput(mm); } -void exit_mm(struct task_struct *tsk) -{ - __exit_mm(tsk); -} - static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) { /* * Make sure we're not reparenting to ourselves and that * the parent is not a zombie. */ - BUG_ON(p == reaper || reaper->state >= EXIT_ZOMBIE || reaper->exit_state >= EXIT_ZOMBIE); + BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); p->real_parent = reaper; if (p->parent == p->real_parent) BUG(); @@ -560,7 +557,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) * a normal stop since it's no longer being * traced. */ - p->state = TASK_STOPPED; + ptrace_untrace(p); } } @@ -599,7 +596,7 @@ static inline void forget_original_parent(struct task_struct * father, reaper = child_reaper; break; } - } while (reaper->exit_state >= EXIT_ZOMBIE); + } while (reaper->exit_state); /* * There are only two places where our children can be: @@ -656,7 +653,7 @@ static void exit_notify(struct task_struct *tsk) struct task_struct *t; struct list_head ptrace_dead, *_p, *_n; - if (signal_pending(tsk) && !tsk->signal->group_exit + if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) && !thread_group_empty(tsk)) { /* * This occurs when there was a race between our exit @@ -750,7 +747,9 @@ static void exit_notify(struct task_struct *tsk) } state = EXIT_ZOMBIE; - if (tsk->exit_signal == -1 && tsk->ptrace == 0) + if (tsk->exit_signal == -1 && + (likely(tsk->ptrace == 0) || + unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) state = EXIT_DEAD; tsk->exit_state = state; @@ -758,8 +757,8 @@ static void exit_notify(struct task_struct *tsk) * Clear these here so that update_process_times() won't try to deliver * itimer, profile or rlimit signals to this task while it 
is in late exit. */ - tsk->it_virt_value = 0; - tsk->it_prof_value = 0; + tsk->it_virt_value = cputime_zero; + tsk->it_prof_value = cputime_zero; write_unlock_irq(&tasklist_lock); @@ -793,6 +792,12 @@ fastcall NORET_TYPE void do_exit(long code) panic("Attempted to kill init!"); if (tsk->io_context) exit_io_context(); + + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { + current->ptrace_message = code; + ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); + } + tsk->flags |= PF_EXITING; del_timer_sync(&tsk->real_timer); @@ -801,15 +806,12 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); - } - + acct_update_integrals(); + update_mem_hiwater(); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) acct_process(code); - __exit_mm(tsk); + exit_mm(tsk); exit_sem(tsk); __exit_files(tsk); @@ -877,18 +879,18 @@ do_group_exit(int exit_code) { BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - if (current->signal->group_exit) + if (current->signal->flags & SIGNAL_GROUP_EXIT) exit_code = current->signal->group_exit_code; else if (!thread_group_empty(current)) { struct signal_struct *const sig = current->signal; struct sighand_struct *const sighand = current->sighand; read_lock(&tasklist_lock); spin_lock_irq(&sighand->siglock); - if (sig->group_exit) + if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else { - sig->group_exit = 1; + sig->flags = SIGNAL_GROUP_EXIT; sig->group_exit_code = exit_code; zap_other_threads(current); } @@ -1046,10 +1048,16 @@ static int wait_task_zombie(task_t *p, int noreap, * here reaping other children at the same time. 
*/ spin_lock_irq(&p->parent->sighand->siglock); - p->parent->signal->cutime += - p->utime + p->signal->utime + p->signal->cutime; - p->parent->signal->cstime += - p->stime + p->signal->stime + p->signal->cstime; + p->parent->signal->cutime = + cputime_add(p->parent->signal->cutime, + cputime_add(p->utime, + cputime_add(p->signal->utime, + p->signal->cutime))); + p->parent->signal->cstime = + cputime_add(p->parent->signal->cstime, + cputime_add(p->stime, + cputime_add(p->signal->stime, + p->signal->cstime))); p->parent->signal->cmin_flt += p->min_flt + p->signal->min_flt + p->signal->cmin_flt; p->parent->signal->cmaj_flt += @@ -1068,7 +1076,7 @@ static int wait_task_zombie(task_t *p, int noreap, read_unlock(&tasklist_lock); retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; - status = p->signal->group_exit + status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; if (!retval && stat_addr) retval = put_user(status, stat_addr); @@ -1180,7 +1188,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, * race with the EXIT_ZOMBIE case. */ exit_code = xchg(&p->exit_code, 0); - if (unlikely(p->exit_state >= EXIT_ZOMBIE)) { + if (unlikely(p->exit_state)) { /* * The task resumed and then died. Let the next iteration * catch it in EXIT_ZOMBIE. Note that exit_code might @@ -1258,16 +1266,17 @@ static int wait_task_continued(task_t *p, int noreap, if (unlikely(!p->signal)) return 0; - if (p->signal->stop_state >= 0) + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); - if (p->signal->stop_state >= 0) { /* Re-check with the lock held. */ + /* Re-check with the lock held. 
*/ + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!noreap) - p->signal->stop_state = 0; + p->signal->flags &= ~SIGNAL_STOP_CONTINUED; spin_unlock_irq(&p->sighand->siglock); pid = p->pid; @@ -1316,7 +1325,7 @@ static long do_wait(pid_t pid, int options, struct siginfo __user *infop, struct task_struct *tsk; int flag, retval; - add_wait_queue(&current->wait_chldexit,&wait); + add_wait_queue(&current->signal->wait_chldexit,&wait); repeat: /* * We will set this flag if we see any child that might later @@ -1430,7 +1439,7 @@ check_continued: retval = -ECHILD; end: current->state = TASK_RUNNING; - remove_wait_queue(&current->wait_chldexit,&wait); + remove_wait_queue(&current->signal->wait_chldexit,&wait); if (infop) { if (retval > 0) retval = 0; diff --git a/kernel/fork.c b/kernel/fork.c index 84252e055db4..be1ff8ddbb9c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -39,6 +39,7 @@ #include <linux/audit.h> #include <linux/profile.h> #include <linux/rmap.h> +#include <linux/acct.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -47,17 +48,17 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> -/* The idle threads do not count.. - * Protected by write_lock_irq(&tasklist_lock) +/* + * Protected counters by write_lock_irq(&tasklist_lock) */ -int nr_threads; - -int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ +int nr_threads; /* The idle threads do not count.. 
*/ + +int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ + __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ EXPORT_SYMBOL(tasklist_lock); @@ -218,6 +219,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) /* insert tmp into the share list, just after mpnt */ spin_lock(&file->f_mapping->i_mmap_lock); + tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(file->f_mapping); vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(file->f_mapping); @@ -279,7 +281,7 @@ static inline void mm_free_pgd(struct mm_struct * mm) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ -spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) @@ -469,6 +471,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (retval) goto free_pt; + mm->hiwater_rss = mm->rss; + mm->hiwater_vm = mm->total_vm; + good_mm: tsk->mm = mm; tsk->active_mm = mm; @@ -729,11 +734,11 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts return -ENOMEM; atomic_set(&sig->count, 1); atomic_set(&sig->live, 1); - sig->group_exit = 0; + init_waitqueue_head(&sig->wait_chldexit); + sig->flags = 0; sig->group_exit_code = 0; sig->group_exit_task = NULL; sig->group_stop_count = 0; - sig->stop_state = 0; sig->curr_target = NULL; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -744,7 +749,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = 0; - sig->utime = sig->stime = sig->cutime = sig->cstime = 0; + sig->utime = sig->stime = sig->cutime = sig->cstime = 
cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; @@ -857,7 +862,6 @@ static task_t *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); - init_waitqueue_head(&p->wait_chldexit); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); spin_lock_init(&p->proc_lock); @@ -865,12 +869,23 @@ static task_t *copy_process(unsigned long clone_flags, clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); - p->it_real_value = p->it_virt_value = p->it_prof_value = 0; - p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0; + p->it_real_value = 0; + p->it_real_incr = 0; + p->it_virt_value = cputime_zero; + p->it_virt_incr = cputime_zero; + p->it_prof_value = cputime_zero; + p->it_prof_incr = cputime_zero; init_timer(&p->real_timer); p->real_timer.data = (unsigned long) p; - p->utime = p->stime = 0; + p->utime = cputime_zero; + p->stime = cputime_zero; + p->rchar = 0; /* I/O counter: bytes read */ + p->wchar = 0; /* I/O counter: bytes written */ + p->syscr = 0; /* I/O counter: read syscalls */ + p->syscw = 0; /* I/O counter: write syscalls */ + acct_clear_integrals(p); + p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->security = NULL; @@ -985,7 +1000,7 @@ static task_t *copy_process(unsigned long clone_flags, * do not create this new thread - the whole thread * group is supposed to exit anyway. 
*/ - if (current->signal->group_exit) { + if (current->signal->flags & SIGNAL_GROUP_EXIT) { spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -EAGAIN; @@ -1020,6 +1035,7 @@ static task_t *copy_process(unsigned long clone_flags, } nr_threads++; + total_forks++; write_unlock_irq(&tasklist_lock); retval = 0; @@ -1152,7 +1168,6 @@ long do_fork(unsigned long clone_flags, wake_up_new_task(p, clone_flags); else p->state = TASK_STOPPED; - ++total_forks; if (unlikely (trace)) { current->ptrace_message = pid; diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 09f556507f57..9078649382cf 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c @@ -14,7 +14,7 @@ */ static struct list_head ime_list = LIST_HEAD_INIT(ime_list); -static spinlock_t ime_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(ime_lock); static int kmalloc_failed; struct inter_module_entry { diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 16818726cd21..98d62d8efeaf 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -137,6 +137,7 @@ unsigned int probe_irq_mask(unsigned long val) return mask & val; } +EXPORT_SYMBOL(probe_irq_mask); /** * probe_irq_off - end an interrupt autodetect diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index ebc25823b73d..2fb0e46e11f3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -73,17 +73,6 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) } /* - * Exit an interrupt context. 
Process softirqs if needed and possible: - */ -void irq_exit(void) -{ - preempt_count() -= IRQ_EXIT_OFFSET; - if (!in_interrupt() && local_softirq_pending()) - do_softirq(); - preempt_enable_no_resched(); -} - -/* * Have got an event to handle: */ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, diff --git a/kernel/itimer.c b/kernel/itimer.c index 95fbf1c6becf..e1743c563206 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -16,11 +16,10 @@ int do_getitimer(int which, struct itimerval *value) { - register unsigned long val, interval; + register unsigned long val; switch (which) { case ITIMER_REAL: - interval = current->it_real_incr; val = 0; /* * FIXME! This needs to be atomic, in case the kernel timer happens! @@ -32,20 +31,20 @@ int do_getitimer(int which, struct itimerval *value) if ((long) val <= 0) val = 1; } + jiffies_to_timeval(val, &value->it_value); + jiffies_to_timeval(current->it_real_incr, &value->it_interval); break; case ITIMER_VIRTUAL: - val = current->it_virt_value; - interval = current->it_virt_incr; + cputime_to_timeval(current->it_virt_value, &value->it_value); + cputime_to_timeval(current->it_virt_incr, &value->it_interval); break; case ITIMER_PROF: - val = current->it_prof_value; - interval = current->it_prof_incr; + cputime_to_timeval(current->it_prof_value, &value->it_value); + cputime_to_timeval(current->it_prof_incr, &value->it_interval); break; default: return(-EINVAL); } - jiffies_to_timeval(val, &value->it_value); - jiffies_to_timeval(interval, &value->it_interval); return 0; } @@ -81,37 +80,43 @@ void it_real_fn(unsigned long __data) int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { - register unsigned long i, j; + unsigned long expire; + cputime_t cputime; int k; - i = timeval_to_jiffies(&value->it_interval); - j = timeval_to_jiffies(&value->it_value); if (ovalue && (k = do_getitimer(which, ovalue)) < 0) return k; switch (which) { case ITIMER_REAL: del_timer_sync(&current->real_timer); - 
current->it_real_value = j; - current->it_real_incr = i; - if (!j) + expire = timeval_to_jiffies(&value->it_value); + current->it_real_value = expire; + current->it_real_incr = + timeval_to_jiffies(&value->it_interval); + if (!expire) break; - if (j > (unsigned long) LONG_MAX) - j = LONG_MAX; - i = j + jiffies; - current->real_timer.expires = i; + if (expire > (unsigned long) LONG_MAX) + expire = LONG_MAX; + current->real_timer.expires = jiffies + expire; add_timer(&current->real_timer); break; case ITIMER_VIRTUAL: - if (j) - j++; - current->it_virt_value = j; - current->it_virt_incr = i; + cputime = timeval_to_cputime(&value->it_value); + if (cputime_gt(cputime, cputime_zero)) + cputime = cputime_add(cputime, + jiffies_to_cputime(1)); + current->it_virt_value = cputime; + cputime = timeval_to_cputime(&value->it_interval); + current->it_virt_incr = cputime; break; case ITIMER_PROF: - if (j) - j++; - current->it_prof_value = j; - current->it_prof_incr = i; + cputime = timeval_to_cputime(&value->it_value); + if (cputime_gt(cputime, cputime_zero)) + cputime = cputime_add(cputime, + jiffies_to_cputime(1)); + current->it_prof_value = cputime; + cputime = timeval_to_cputime(&value->it_interval); + current->it_prof_incr = cputime; break; default: return -EINVAL; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8db13f565ed9..315751c2b09b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -20,6 +20,8 @@ #include <linux/proc_fs.h> #include <linux/mm.h> +#include <asm/sections.h> + #ifdef CONFIG_KALLSYMS_ALL #define all_var 1 #else @@ -28,7 +30,7 @@ /* These will be re-linked against their real values during the second link stage */ extern unsigned long kallsyms_addresses[] __attribute__((weak)); -extern unsigned long kallsyms_num_syms __attribute__((weak)); +extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); extern u8 kallsyms_names[] __attribute__((weak)); extern u8 kallsyms_token_table[] __attribute__((weak)); @@ -36,9 +38,6 @@ extern 
u16 kallsyms_token_index[] __attribute__((weak)); extern unsigned long kallsyms_markers[] __attribute__((weak)); -/* Defined by the linker script. */ -extern char _stext[], _etext[], _sinittext[], _einittext[], _end[]; - static inline int is_kernel_inittext(unsigned long addr) { if (addr >= (unsigned long)_sinittext diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d3d1321b0e5c..cc6f72585f1e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -43,7 +43,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; unsigned int kprobe_cpu = NR_CPUS; -static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(kprobe_lock); /* Locks kprobe: irqs must be disabled */ void lock_kprobes(void) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 31f1a60df733..1f064a63f8cf 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -30,7 +30,8 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) KERNEL_ATTR_RO(hotplug_seqnum); #endif -static decl_subsys(kernel, NULL, NULL); +decl_subsys(kernel, NULL, NULL); +EXPORT_SYMBOL_GPL(kernel_subsys); static struct attribute * kernel_attrs[] = { #ifdef CONFIG_HOTPLUG diff --git a/kernel/kthread.c b/kernel/kthread.c index 5689ebb1a250..e377e2244103 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -14,6 +14,12 @@ #include <linux/module.h> #include <asm/semaphore.h> +/* + * We dont want to execute off keventd since it might + * hold a semaphore our callers hold too: + */ +static struct workqueue_struct *helper_wq; + struct kthread_create_info { /* Information passed to kthread() from keventd. */ @@ -126,12 +132,13 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), init_completion(&create.started); init_completion(&create.done); - /* If we're being called to start the first workqueue, we - * can't use keventd. 
*/ - if (!keventd_up()) + /* + * The workqueue needs to start up first: + */ + if (!helper_wq) work.func(work.data); else { - schedule_work(&work); + queue_work(helper_wq, &work); wait_for_completion(&create.done); } if (!IS_ERR(create.result)) { @@ -183,3 +190,13 @@ int kthread_stop(struct task_struct *k) return ret; } EXPORT_SYMBOL(kthread_stop); + +static __init int helper_init(void) +{ + helper_wq = create_singlethread_workqueue("kthread"); + BUG_ON(!helper_wq); + + return 0; +} +core_initcall(helper_init); + diff --git a/kernel/module.c b/kernel/module.c index 0798443ce002..ce427b675b98 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -53,7 +53,7 @@ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) /* Protects module list */ -static spinlock_t modlist_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(modlist_lock); /* List of modules, protected by module_mutex AND modlist_lock */ static DECLARE_MUTEX(module_mutex); @@ -379,7 +379,7 @@ static void module_unload_init(struct module *mod) for (i = 0; i < NR_CPUS; i++) local_set(&mod->ref[i].count, 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[smp_processor_id()].count, 1); + local_set(&mod->ref[_smp_processor_id()].count, 1); /* Backwards compatibility macros put refcount during init. 
*/ mod->waiter = current; } @@ -651,7 +651,8 @@ void symbol_put_addr(void *addr) } EXPORT_SYMBOL_GPL(symbol_put_addr); -static ssize_t show_refcnt(struct module *mod, char *buffer) +static ssize_t show_refcnt(struct module_attribute *mattr, + struct module *mod, char *buffer) { /* sysfs holds a reference */ return sprintf(buffer, "%u\n", module_refcount(mod)-1); @@ -681,13 +682,6 @@ static inline int use_module(struct module *a, struct module *b) static inline void module_unload_init(struct module *mod) { } - -asmlinkage long -sys_delete_module(const char __user *name_user, unsigned int flags) -{ - return -ENOSYS; -} - #endif /* CONFIG_MODULE_UNLOAD */ #ifdef CONFIG_OBSOLETE_MODPARM @@ -936,79 +930,71 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, * J. Corbet <corbet@lwn.net> */ #ifdef CONFIG_KALLSYMS -static void module_sect_attrs_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct module_sections, kobj)); -} - -static ssize_t module_sect_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t module_sect_show(struct module_attribute *mattr, + struct module *mod, char *buf) { struct module_sect_attr *sattr = - container_of(attr, struct module_sect_attr, attr); + container_of(mattr, struct module_sect_attr, mattr); return sprintf(buf, "0x%lx\n", sattr->address); } -static struct sysfs_ops module_sect_ops = { - .show = module_sect_show, -}; - -static struct kobj_type module_sect_ktype = { - .sysfs_ops = &module_sect_ops, - .release = module_sect_attrs_release, -}; - static void add_sect_attrs(struct module *mod, unsigned int nsect, char *secstrings, Elf_Shdr *sechdrs) { - unsigned int nloaded = 0, i; + unsigned int nloaded = 0, i, size[2]; + struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; - - if (!mod->mkobj) - return; + struct attribute **gattr; /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) if (sechdrs[i].sh_flags & SHF_ALLOC) nloaded++; - mod->sect_attrs = 
kmalloc(sizeof(struct module_sections) + - nloaded*sizeof(mod->sect_attrs->attrs[0]), GFP_KERNEL); - if (! mod->sect_attrs) + size[0] = ALIGN(sizeof(*sect_attrs) + + nloaded * sizeof(sect_attrs->attrs[0]), + sizeof(sect_attrs->grp.attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); + if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) return; - /* sections entry setup */ - memset(mod->sect_attrs, 0, sizeof(struct module_sections)); - if (kobject_set_name(&mod->sect_attrs->kobj, "sections")) - goto out; - mod->sect_attrs->kobj.parent = &mod->mkobj->kobj; - mod->sect_attrs->kobj.ktype = &module_sect_ktype; - if (kobject_register(&mod->sect_attrs->kobj)) - goto out; + /* Setup section attributes. */ + sect_attrs->grp.name = "sections"; + sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; - /* And the section attributes. */ - sattr = &mod->sect_attrs->attrs[0]; + sattr = &sect_attrs->attrs[0]; + gattr = &sect_attrs->grp.attrs[0]; for (i = 0; i < nsect; i++) { if (! 
(sechdrs[i].sh_flags & SHF_ALLOC)) continue; sattr->address = sechdrs[i].sh_addr; strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, - MODULE_SECT_NAME_LEN); - sattr->attr.name = sattr->name; - sattr->attr.owner = mod; - sattr->attr.mode = S_IRUGO; - (void) sysfs_create_file(&mod->sect_attrs->kobj, &sattr->attr); - sattr++; + MODULE_SECT_NAME_LEN); + sattr->mattr.show = module_sect_show; + sattr->mattr.store = NULL; + sattr->mattr.attr.name = sattr->name; + sattr->mattr.attr.owner = mod; + sattr->mattr.attr.mode = S_IRUGO; + *(gattr++) = &(sattr++)->mattr.attr; } + *gattr = NULL; + + if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp)) + goto out; + + mod->sect_attrs = sect_attrs; return; out: - kfree(mod->sect_attrs); - mod->sect_attrs = NULL; + kfree(sect_attrs); } static void remove_sect_attrs(struct module *mod) { if (mod->sect_attrs) { - kobject_unregister(&mod->sect_attrs->kobj); + sysfs_remove_group(&mod->mkobj.kobj, + &mod->sect_attrs->grp); + /* We are positive that no one is using any sect attrs + * at this point. Deallocate immediately. 
*/ + kfree(mod->sect_attrs); mod->sect_attrs = NULL; } } @@ -1029,11 +1015,11 @@ static inline void remove_sect_attrs(struct module *mod) #ifdef CONFIG_MODULE_UNLOAD static inline int module_add_refcnt_attr(struct module *mod) { - return sysfs_create_file(&mod->mkobj->kobj, &refcnt.attr); + return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr); } static void module_remove_refcnt_attr(struct module *mod) { - return sysfs_remove_file(&mod->mkobj->kobj, &refcnt.attr); + return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr); } #else static inline int module_add_refcnt_attr(struct module *mod) @@ -1052,17 +1038,13 @@ static int mod_sysfs_setup(struct module *mod, { int err; - mod->mkobj = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); - if (!mod->mkobj) - return -ENOMEM; - - memset(&mod->mkobj->kobj, 0, sizeof(mod->mkobj->kobj)); - err = kobject_set_name(&mod->mkobj->kobj, "%s", mod->name); + memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); + err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); if (err) goto out; - kobj_set_kset_s(mod->mkobj, module_subsys); - mod->mkobj->mod = mod; - err = kobject_register(&mod->mkobj->kobj); + kobj_set_kset_s(&mod->mkobj, module_subsys); + mod->mkobj.mod = mod; + err = kobject_register(&mod->mkobj.kobj); if (err) goto out; @@ -1077,11 +1059,8 @@ static int mod_sysfs_setup(struct module *mod, return 0; out_unreg: - /* Calls module_kobj_release */ - kobject_unregister(&mod->mkobj->kobj); - return err; + kobject_unregister(&mod->mkobj.kobj); out: - kfree(mod->mkobj); return err; } @@ -1090,8 +1069,7 @@ static void mod_kobject_remove(struct module *mod) module_remove_refcnt_attr(mod); module_param_sysfs_remove(mod); - /* Calls module_kobj_release */ - kobject_unregister(&mod->mkobj->kobj); + kobject_unregister(&mod->mkobj.kobj); } /* Free a module, remove from lists, etc (must hold module mutex). 
*/ @@ -1713,6 +1691,9 @@ static struct module *load_module(void __user *umod, / sizeof(struct kernel_param), NULL); } + if (err < 0) + goto arch_cleanup; + err = mod_sysfs_setup(mod, (struct kernel_param *) sechdrs[setupindex].sh_addr, @@ -2089,11 +2070,9 @@ void module_add_driver(struct module *mod, struct device_driver *drv) { if (!mod || !drv) return; - if (!mod->mkobj) - return; /* Don't check return code; this call is idempotent */ - sysfs_create_link(&drv->kobj, &mod->mkobj->kobj, "module"); + sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); } EXPORT_SYMBOL(module_add_driver); diff --git a/kernel/params.c b/kernel/params.c index 45dd451e17c1..ec3dbf68e253 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -77,10 +77,16 @@ static int parse_one(char *param, static char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; - int in_quote = 0; + int in_quote = 0, quoted = 0; + char *next; /* Chew any extra spaces */ while (*args == ' ') args++; + if (*args == '"') { + args++; + in_quote = 1; + quoted = 1; + } for (i = 0; args[i]; i++) { if (args[i] == ' ' && !in_quote) @@ -106,13 +112,16 @@ static char *next_arg(char *args, char **param, char **val) if (args[i-1] == '"') args[i-1] = '\0'; } + if (quoted && args[i-1] == '"') + args[i-1] = '\0'; } if (args[i]) { args[i] = '\0'; - return args + i + 1; + next = args + i + 1; } else - return args + i; + next = args + i; + return next; } /* Args looks like "foo=bar,bar2 baz=fuz wiz". 
*/ @@ -357,26 +366,23 @@ extern struct kernel_param __start___param[], __stop___param[]; struct param_attribute { - struct attribute attr; + struct module_attribute mattr; struct kernel_param *param; }; -struct param_kobject +struct module_param_attrs { - struct kobject kobj; - - unsigned int num_attributes; - struct param_attribute attr[0]; + struct attribute_group grp; + struct param_attribute attrs[0]; }; -#define to_param_attr(n) container_of(n, struct param_attribute, attr); +#define to_param_attr(n) container_of(n, struct param_attribute, mattr); -static ssize_t param_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) +static ssize_t param_attr_show(struct module_attribute *mattr, + struct module *mod, char *buf) { int count; - struct param_attribute *attribute = to_param_attr(attr); + struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->get) return -EPERM; @@ -390,12 +396,12 @@ static ssize_t param_attr_show(struct kobject *kobj, } /* sysfs always hands a nul-terminated string in buf. We rely on that. 
*/ -static ssize_t param_attr_store(struct kobject *kobj, - struct attribute *attr, +static ssize_t param_attr_store(struct module_attribute *mattr, + struct module *owner, const char *buf, size_t len) { int err; - struct param_attribute *attribute = to_param_attr(attr); + struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->set) return -EPERM; @@ -406,27 +412,6 @@ static ssize_t param_attr_store(struct kobject *kobj, return err; } - -static struct sysfs_ops param_sysfs_ops = { - .show = param_attr_show, - .store = param_attr_store, -}; - -static void param_kobj_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct param_kobject, kobj)); -} - -static struct kobj_type param_ktype = { - .sysfs_ops = &param_sysfs_ops, - .release = &param_kobj_release, -}; - -static struct kset param_kset = { - .subsys = &module_subsys, - .ktype = &param_ktype, -}; - #ifdef CONFIG_MODULES #define __modinit #else @@ -434,54 +419,6 @@ static struct kset param_kset = { #endif /* - * param_add_attribute - actually adds an parameter to sysfs - * @mod: owner of parameter - * @pk: param_kobject the attribute shall be assigned to. - * One per module, one per KBUILD_MODNAME. - * @kp: kernel_param to be added - * @skip: offset where the parameter name start in kp->name. - * Needed for built-in modules - * - * Fill in data into appropriate &pk->attr[], and create sysfs file. 
- */ -static __modinit int param_add_attribute(struct module *mod, - struct param_kobject *pk, - struct kernel_param *kp, - unsigned int skip) -{ - struct param_attribute *a; - int err; - - a = &pk->attr[pk->num_attributes]; - a->attr.name = (char *) &kp->name[skip]; - a->attr.owner = mod; - a->attr.mode = kp->perm; - a->param = kp; - err = sysfs_create_file(&pk->kobj, &a->attr); - if (!err) - pk->num_attributes++; - return err; -} - -/* - * param_sysfs_remove - remove sysfs support for one module or KBUILD_MODNAME - * @pk: struct param_kobject which is to be removed - * - * Called when an error in registration occurs or a module is removed - * from the system. - */ -static __modinit void param_sysfs_remove(struct param_kobject *pk) -{ - unsigned int i; - for (i = 0; i < pk->num_attributes; i++) - sysfs_remove_file(&pk->kobj,&pk->attr[i].attr); - - /* Calls param_kobj_release */ - kobject_unregister(&pk->kobj); -} - - -/* * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME * @mk: struct module_kobject (contains parent kobject) * @kparam: array of struct kernel_param, the actual parameter definitions @@ -492,15 +429,17 @@ static __modinit void param_sysfs_remove(struct param_kobject *pk) * in sysfs. A pointer to the param_kobject is returned on success, * NULL if there's no parameter to export, or other ERR_PTR(err). 
*/ -static __modinit struct param_kobject * +static __modinit struct module_param_attrs * param_sysfs_setup(struct module_kobject *mk, struct kernel_param *kparam, unsigned int num_params, unsigned int name_skip) { - struct param_kobject *pk; + struct module_param_attrs *mp; unsigned int valid_attrs = 0; - unsigned int i; + unsigned int i, size[2]; + struct param_attribute *pattr; + struct attribute **gattr; int err; for (i=0; i<num_params; i++) { @@ -511,42 +450,39 @@ param_sysfs_setup(struct module_kobject *mk, if (!valid_attrs) return NULL; - pk = kmalloc(sizeof(struct param_kobject) - + sizeof(struct param_attribute) * valid_attrs, - GFP_KERNEL); - if (!pk) - return ERR_PTR(-ENOMEM); - memset(pk, 0, sizeof(struct param_kobject) - + sizeof(struct param_attribute) * valid_attrs); + size[0] = ALIGN(sizeof(*mp) + + valid_attrs * sizeof(mp->attrs[0]), + sizeof(mp->grp.attrs[0])); + size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); - err = kobject_set_name(&pk->kobj, "parameters"); - if (err) - goto out; + mp = kmalloc(size[0] + size[1], GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); - pk->kobj.kset = &param_kset; - pk->kobj.parent = &mk->kobj; - err = kobject_register(&pk->kobj); - if (err) - goto out; + mp->grp.name = "parameters"; + mp->grp.attrs = (void *)mp + size[0]; + pattr = &mp->attrs[0]; + gattr = &mp->grp.attrs[0]; for (i = 0; i < num_params; i++) { - if (kparam[i].perm) { - err = param_add_attribute(mk->mod, pk, - &kparam[i], name_skip); - if (err) - goto out_unreg; + struct kernel_param *kp = &kparam[i]; + if (kp->perm) { + pattr->param = kp; + pattr->mattr.show = param_attr_show; + pattr->mattr.store = param_attr_store; + pattr->mattr.attr.name = (char *)&kp->name[name_skip]; + pattr->mattr.attr.owner = mk->mod; + pattr->mattr.attr.mode = kp->perm; + *(gattr++) = &(pattr++)->mattr.attr; } } + *gattr = NULL; - return pk; - -out_unreg: - param_sysfs_remove(pk); - return ERR_PTR(err); - -out: - kfree(pk); - return ERR_PTR(err); + if ((err = 
sysfs_create_group(&mk->kobj, &mp->grp))) { + kfree(mp); + return ERR_PTR(err); + } + return mp; } @@ -565,13 +501,13 @@ int module_param_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { - struct param_kobject *pk; + struct module_param_attrs *mp; - pk = param_sysfs_setup(mod->mkobj, kparam, num_params, 0); - if (IS_ERR(pk)) - return PTR_ERR(pk); + mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); + if (IS_ERR(mp)) + return PTR_ERR(mp); - mod->params_kobject = pk; + mod->param_attrs = mp; return 0; } @@ -584,9 +520,13 @@ int module_param_sysfs_setup(struct module *mod, */ void module_param_sysfs_remove(struct module *mod) { - if (mod->params_kobject) { - param_sysfs_remove(mod->params_kobject); - mod->params_kobject = NULL; + if (mod->param_attrs) { + sysfs_remove_group(&mod->mkobj.kobj, + &mod->param_attrs->grp); + /* We are positive that no one is using any param + * attrs at this point. Deallocate immediately. */ + kfree(mod->param_attrs); + mod->param_attrs = NULL; } } #endif @@ -610,8 +550,10 @@ static void __init kernel_param_sysfs_setup(const char *name, kobject_register(&mk->kobj); /* no need to keep the kobject if no parameter is exported */ - if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) + if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { kobject_unregister(&mk->kobj); + kfree(mk); + } } /* @@ -691,7 +633,7 @@ static ssize_t module_attr_show(struct kobject *kobj, if (!try_module_get(mk->mod)) return -ENODEV; - ret = attribute->show(mk->mod, buf); + ret = attribute->show(attribute, mk->mod, buf); module_put(mk->mod); @@ -710,14 +652,8 @@ static struct sysfs_ops module_sysfs_ops = { }; #endif -static void module_kobj_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct module_kobject, kobj)); -} - static struct kobj_type module_ktype = { .sysfs_ops = &module_sysfs_ops, - .release = &module_kobj_release, }; decl_subsys(module, &module_ktype, NULL); @@ -728,8 +664,6 @@ 
decl_subsys(module, &module_ktype, NULL); static int __init param_sysfs_init(void) { subsystem_register(&module_subsys); - kobject_set_name(&param_kset.kobj, "parameters"); - kset_init(&param_kset); param_sysfs_builtin(); diff --git a/kernel/pid.c b/kernel/pid.c index 185a8bee8168..edba31c681ac 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -60,7 +60,7 @@ typedef struct pidmap { static pidmap_t pidmap_array[PIDMAP_ENTRIES] = { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; -static spinlock_t pidmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); fastcall void free_pidmap(int pid) { diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 33a67e7ad826..9e79eca513ca 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -85,7 +85,7 @@ static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) */ static kmem_cache_t *posix_timers_cache; static struct idr posix_timers_id; -static spinlock_t idr_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(idr_lock); /* * Just because the timer is not in the timer list does NOT mean it is diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ed49ffd12bc6..696387ffe49c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -48,7 +48,7 @@ config SOFTWARE_SUSPEND involved in suspending. Also in this case there is a risk that buffers on disk won't match with saved ones. - For more information take a look at Documentation/power/swsusp.txt. + For more information take a look at <file:Documentation/power/swsusp.txt>. 
config PM_STD_PARTITION string "Default resume partition" diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0f5dc712ad70..b9b3f5881677 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -51,7 +51,7 @@ static void power_down(suspend_disk_method_t mode) local_irq_save(flags); switch(mode) { case PM_DISK_PLATFORM: - device_power_down(PM_SUSPEND_DISK); + device_power_down(PMSG_SUSPEND); error = pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: @@ -144,8 +144,10 @@ static int prepare(void) free_some_memory(); disable_nonboot_cpus(); - if ((error = device_suspend(PM_SUSPEND_DISK))) + if ((error = device_suspend(PMSG_FREEZE))) { + printk("Some devices failed to suspend\n"); goto Finish; + } return 0; Finish: @@ -163,7 +165,7 @@ static int prepare(void) * * If we're going through the firmware, then get it over with quickly. * - * If not, then call swsusp to do it's thing, then figure out how + * If not, then call swsusp to do its thing, then figure out how * to power down the system. */ @@ -201,7 +203,7 @@ int pm_suspend_disk(void) * software_resume - Resume from a saved image. * * Called as a late_initcall (so all devices are discovered and - * initialized), we call pmdisk to see if we have a saved image or not. + * initialized), we call swsusp to see if we have a saved image or not. * If so, we quiesce devices, the restore the saved image. We will * return above (in pm_suspend_disk() ) if everything goes well. 
* Otherwise, we fail gracefully and return to the normally @@ -221,7 +223,7 @@ static int software_resume(void) return 0; } - pr_debug("PM: Reading pmdisk image.\n"); + pr_debug("PM: Reading swsusp image.\n"); if ((error = swsusp_read())) goto Done; @@ -284,7 +286,7 @@ static char * pm_disk_modes[] = { static ssize_t disk_show(struct subsystem * subsys, char * buf) { - return sprintf(buf,"%s\n",pm_disk_modes[pm_disk_mode]); + return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]); } diff --git a/kernel/power/main.c b/kernel/power/main.c index 0aefb03ede09..b7ef95c96230 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -65,7 +65,7 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } - if ((error = device_suspend(state))) + if ((error = device_suspend(PMSG_SUSPEND))) goto Finish; return 0; Finish: @@ -78,13 +78,14 @@ static int suspend_prepare(suspend_state_t state) } -static int suspend_enter(u32 state) +static int suspend_enter(suspend_state_t state) { int error = 0; unsigned long flags; local_irq_save(flags); - if ((error = device_power_down(state))) + + if ((error = device_power_down(PMSG_SUSPEND))) goto Done; error = pm_ops->enter(state); device_power_up(); @@ -99,7 +100,7 @@ static int suspend_enter(u32 state) * @state: State we're coming out of. * * Call platform code to clean up, restart processes, and free the - * console that we've allocated. + * console that we've allocated. This is not called for suspend-to-disk. 
*/ static void suspend_finish(suspend_state_t state) diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 4bf7fe6d2feb..22cdaa42d922 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -67,12 +67,13 @@ #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> +#include <asm/tlbflush.h> #include <asm/io.h> #include "power.h" /* References to section boundaries */ -extern char __nosave_begin, __nosave_end; +extern const void __nosave_begin, __nosave_end; /* Variables to be preserved over suspend */ static int pagedir_order_check; @@ -419,7 +420,7 @@ struct highmem_page { struct highmem_page *next; }; -struct highmem_page *highmem_copy = NULL; +static struct highmem_page *highmem_copy; static int save_highmem_zone(struct zone *zone) { @@ -752,11 +753,11 @@ static int swsusp_alloc(void) return -ENOSPC; if ((error = alloc_pagedir())) { - pr_debug("suspend: Allocating pagedir failed.\n"); + printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return error; } if ((error = alloc_image_pages())) { - pr_debug("suspend: Allocating image pages failed.\n"); + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); swsusp_free(); return error; } @@ -766,7 +767,7 @@ static int swsusp_alloc(void) return 0; } -int suspend_prepare_image(void) +static int suspend_prepare_image(void) { int error; @@ -842,11 +843,22 @@ int swsusp_suspend(void) if ((error = arch_prepare_suspend())) return error; local_irq_disable(); + /* At this point, device_suspend() has been called, but *not* + * device_power_down(). We *must* device_power_down() now. + * Otherwise, drivers for some devices (e.g. interrupt controllers) + * become desynchronized with the actual state of the hardware + * at resume time, and evil weirdness ensues. 
+ */ + if ((error = device_power_down(PMSG_FREEZE))) { + local_irq_enable(); + return error; + } save_processor_state(); error = swsusp_arch_suspend(); /* Restore control flow magically appears here */ restore_processor_state(); restore_highmem(); + device_power_up(); local_irq_enable(); return error; } @@ -866,6 +878,7 @@ int swsusp_resume(void) { int error; local_irq_disable(); + device_power_down(PMSG_FREEZE); /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = swsusp_arch_resume(); @@ -875,6 +888,7 @@ int swsusp_resume(void) BUG_ON(!error); restore_processor_state(); restore_highmem(); + device_power_up(); local_irq_enable(); return error; } @@ -1036,12 +1050,12 @@ static int submit(int rw, pgoff_t page_off, void * page) return error; } -int bio_read_page(pgoff_t page_off, void * page) +static int bio_read_page(pgoff_t page_off, void * page) { return submit(READ, page_off, page); } -int bio_write_page(pgoff_t page_off, void * page) +static int bio_write_page(pgoff_t page_off, void * page) { return submit(WRITE, page_off, page); } @@ -1158,7 +1172,7 @@ static int __init read_pagedir(void) return -ENOMEM; pagedir_nosave = (struct pbe *)addr; - pr_debug("pmdisk: Reading pagedir (%d Pages)\n",n); + pr_debug("swsusp: Reading pagedir (%d Pages)\n",n); for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) { unsigned long offset = swp_offset(swsusp_info.pagedir[i]); diff --git a/kernel/printk.c b/kernel/printk.c index 4e9fd492f30e..d914a90d6206 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -78,7 +78,7 @@ static int console_locked; * It is also used in interesting ways to provide interlocking in * release_console_sem(). 
*/ -static spinlock_t logbuf_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(logbuf_lock); static char __log_buf[__LOG_BUF_LEN]; static char *log_buf = __log_buf; @@ -284,6 +284,7 @@ int do_syslog(int type, char __user * buf, int len) error = __put_user(c,buf); buf++; i++; + cond_resched(); spin_lock_irq(&logbuf_lock); } spin_unlock_irq(&logbuf_lock); @@ -325,6 +326,7 @@ int do_syslog(int type, char __user * buf, int len) c = LOG_BUF(j); spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); + cond_resched(); spin_lock_irq(&logbuf_lock); } spin_unlock_irq(&logbuf_lock); @@ -340,6 +342,7 @@ int do_syslog(int type, char __user * buf, int len) error = -EFAULT; break; } + cond_resched(); } } break; @@ -642,8 +645,9 @@ void release_console_sem(void) _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock_irqrestore(&logbuf_lock, flags); + spin_unlock(&logbuf_lock); call_console_drivers(_con_start, _log_end); + local_irq_restore(flags); } console_locked = 0; console_may_schedule = 0; @@ -871,7 +875,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) */ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { - static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(ratelimit_lock); static unsigned long toks = 10*5*HZ; static unsigned long last_msg; static int missed; diff --git a/kernel/profile.c b/kernel/profile.c index ff62fa98328a..a38fa70075fe 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -83,7 +83,7 @@ void __init profile_init(void) #ifdef CONFIG_PROFILING static DECLARE_RWSEM(profile_rwsem); -static rwlock_t handoff_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(handoff_lock); static struct notifier_block * task_exit_notifier; static struct notifier_block * task_free_notifier; static struct notifier_block * munmap_notifier; @@ -326,17 +326,15 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, node = cpu_to_node(cpu); 
per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, GFP_KERNEL, 0); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) return NOTIFY_BAD; - clear_highpage(page); per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, GFP_KERNEL, 0); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) goto out_free; - clear_highpage(page); per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); } break; @@ -510,16 +508,14 @@ static int __init create_hash_tables(void) int node = cpu_to_node(cpu); struct page *page; - page = alloc_pages_node(node, GFP_KERNEL, 0); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) goto out_cleanup; - clear_highpage(page); per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, GFP_KERNEL, 0); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) goto out_cleanup; - clear_highpage(page); per_cpu(cpu_profile_hits, cpu)[0] = (struct profile_hit *)page_address(page); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 60801c692810..136a8feba91e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -38,10 +38,24 @@ void __ptrace_link(task_t *child, task_t *new_parent) SET_LINKS(child); } -static inline int pending_resume_signal(struct sigpending *pending) +/* + * Turn a tracing stop into a normal stop now, since with no tracer there + * would be no way to wake it up with SIGCONT or SIGKILL. If there was a + * signal sent that would resume the child, but didn't because it was in + * TASK_TRACED, resume it now. + * Requires that irqs be disabled. 
+ */ +void ptrace_untrace(task_t *child) { -#define M(sig) (1UL << ((sig)-1)) - return sigtestsetmask(&pending->signal, M(SIGCONT) | M(SIGKILL)); + spin_lock(&child->sighand->siglock); + if (child->state == TASK_TRACED) { + if (child->signal->flags & SIGNAL_STOP_STOPPED) { + child->state = TASK_STOPPED; + } else { + signal_wake_up(child, 1); + } + } + spin_unlock(&child->sighand->siglock); } /* @@ -55,29 +69,15 @@ void __ptrace_unlink(task_t *child) if (!child->ptrace) BUG(); child->ptrace = 0; - if (list_empty(&child->ptrace_list)) - return; - list_del_init(&child->ptrace_list); - REMOVE_LINKS(child); - child->parent = child->real_parent; - SET_LINKS(child); - - if (child->state == TASK_TRACED) { - /* - * Turn a tracing stop into a normal stop now, - * since with no tracer there would be no way - * to wake it up with SIGCONT or SIGKILL. - * If there was a signal sent that would resume the child, - * but didn't because it was in TASK_TRACED, resume it now. - */ - spin_lock(&child->sighand->siglock); - child->state = TASK_STOPPED; - if (pending_resume_signal(&child->pending) || - pending_resume_signal(&child->signal->shared_pending)) { - signal_wake_up(child, 1); - } - spin_unlock(&child->sighand->siglock); + if (!list_empty(&child->ptrace_list)) { + list_del_init(&child->ptrace_list); + REMOVE_LINKS(child); + child->parent = child->real_parent; + SET_LINKS(child); } + + if (child->state == TASK_TRACED) + ptrace_untrace(child); } /* @@ -319,18 +319,33 @@ static int ptrace_setoptions(struct task_struct *child, long data) static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) { - if (child->last_siginfo == NULL) - return -EINVAL; - return copy_siginfo_to_user(data, child->last_siginfo); + siginfo_t lastinfo; + + spin_lock_irq(&child->sighand->siglock); + if (likely(child->last_siginfo != NULL)) { + memcpy(&lastinfo, child->last_siginfo, sizeof (siginfo_t)); + spin_unlock_irq(&child->sighand->siglock); + return copy_siginfo_to_user(data, 
&lastinfo); + } + spin_unlock_irq(&child->sighand->siglock); + return -EINVAL; } static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) { - if (child->last_siginfo == NULL) - return -EINVAL; - if (copy_from_user(child->last_siginfo, data, sizeof (siginfo_t)) != 0) + siginfo_t newinfo; + + if (copy_from_user(&newinfo, data, sizeof (siginfo_t)) != 0) return -EFAULT; - return 0; + + spin_lock_irq(&child->sighand->siglock); + if (likely(child->last_siginfo != NULL)) { + memcpy(child->last_siginfo, &newinfo, sizeof (siginfo_t)); + spin_unlock_irq(&child->sighand->siglock); + return 0; + } + spin_unlock_irq(&child->sighand->siglock); + return -EINVAL; } int ptrace_request(struct task_struct *child, long request, diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 80cac1cd0859..f0ae3c3c013e 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -49,9 +49,9 @@ /* Definition for rcupdate control block. */ struct rcu_ctrlblk rcu_ctrlblk = - { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }; + { .cur = -300, .completed = -300 }; struct rcu_ctrlblk rcu_bh_ctrlblk = - { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }; + { .cur = -300, .completed = -300 }; /* Bookkeeping of the progress of the grace period */ struct rcu_state { @@ -60,9 +60,9 @@ struct rcu_state { /* for current batch to proceed. */ }; -struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = +static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; -struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = +static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; @@ -185,10 +185,13 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, rcp->completed == rcp->cur) { /* Can't change, since spin lock held. 
*/ cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); - write_seqcount_begin(&rcp->lock); + rcp->next_pending = 0; + /* next_pending == 0 must be visible in __rcu_process_callbacks() + * before it can see new value of cur. + */ + smp_wmb(); rcp->cur++; - write_seqcount_end(&rcp->lock); } } @@ -216,9 +219,9 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) { if (rdp->quiescbatch != rcp->cur) { - /* new grace period: record qsctr value. */ + /* start new grace period: */ rdp->qs_pending = 1; - rdp->last_qsctr = rdp->qsctr; + rdp->passed_quiesc = 0; rdp->quiescbatch = rcp->cur; return; } @@ -231,11 +234,10 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; /* - * Races with local timer interrupt - in the worst case - * we may miss one quiescent state of that CPU. That is - * tolerable. So no need to disable interrupts. + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. 
*/ - if (rdp->qsctr == rdp->last_qsctr) + if (!rdp->passed_quiesc) return; rdp->qs_pending = 0; @@ -319,8 +321,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, local_irq_disable(); if (rdp->nxtlist && !rdp->curlist) { - int next_pending, seq; - rdp->curlist = rdp->nxtlist; rdp->curtail = rdp->nxttail; rdp->nxtlist = NULL; @@ -330,14 +330,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, /* * start the next batch of callbacks */ - do { - seq = read_seqcount_begin(&rcp->lock); - /* determine batch number */ - rdp->batch = rcp->cur + 1; - next_pending = rcp->next_pending; - } while (read_seqcount_retry(&rcp->lock, seq)); - - if (!next_pending) { + + /* determine batch number */ + rdp->batch = rcp->cur + 1; + /* see the comment and corresponding wmb() in + * the rcu_start_batch() + */ + smp_rmb(); + + if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ spin_lock(&rsp->lock); rcu_start_batch(rcp, rsp, 1); diff --git a/kernel/resource.c b/kernel/resource.c index 5f013dc4f649..72596bc6fdaf 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -39,7 +39,7 @@ struct resource iomem_resource = { EXPORT_SYMBOL(iomem_resource); -static rwlock_t resource_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(resource_lock); #ifdef CONFIG_PROC_FS diff --git a/kernel/sched.c b/kernel/sched.c index 43bfde70a34c..099f946ed40c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -36,6 +36,7 @@ #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/smp.h> +#include <linux/threads.h> #include <linux/timer.h> #include <linux/rcupdate.h> #include <linux/cpu.h> @@ -48,12 +49,6 @@ #include <asm/unistd.h> -#ifdef CONFIG_NUMA -#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) -#else -#define cpu_to_node_mask(cpu) (cpu_online_map) -#endif - /* * Convert user-nice values [ -20 ... 0 ... 
19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -97,7 +92,6 @@ #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define CREDIT_LIMIT 100 /* * If a task is 'interactive' then we reinsert it in the active @@ -131,12 +125,14 @@ (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ MAX_SLEEP_AVG) +#define GRANULARITY (10 * HZ / 1000 ? : 1) + #ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ num_online_cpus()) #else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) #endif @@ -153,12 +149,6 @@ (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) - -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) - #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) @@ -448,11 +438,21 @@ static runqueue_t *this_rq_lock(void) return rq; } -static inline void rq_unlock(runqueue_t *rq) - __releases(rq->lock) +#ifdef CONFIG_SCHED_SMT +static int cpu_and_siblings_are_idle(int cpu) { - spin_unlock_irq(&rq->lock); + int sib; + for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { + if (idle_cpu(sib)) + continue; + return 0; + } + + return 1; } +#else +#define cpu_and_siblings_are_idle(A) idle_cpu(A) +#endif #ifdef CONFIG_SCHEDSTATS /* @@ -581,10 +581,14 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) } /* - * Used by the migration code - we pull tasks from the head of the - * remote queue so we want these tasks to show up at the head of the - * local queue: + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. 
*/ +static void requeue_task(struct task_struct *p, prio_array_t *array) +{ + list_move_tail(&p->run_list, array->queue + p->prio); +} + static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) { list_add(&p->run_list, array->queue + p->prio); @@ -663,8 +667,6 @@ static void recalc_task_prio(task_t *p, unsigned long long now) sleep_time > INTERACTIVE_SLEEP(p)) { p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - DEF_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; } else { /* * The lower the sleep avg a task has the more @@ -673,19 +675,11 @@ static void recalc_task_prio(task_t *p, unsigned long long now) sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. + * Tasks waking from uninterruptible sleep are + * limited in their sleep_avg rise as they + * are likely to be waiting on I/O */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); - - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { + if (p->activated == -1 && p->mm) { if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) sleep_time = 0; else if (p->sleep_avg + sleep_time >= @@ -705,11 +699,8 @@ static void recalc_task_prio(task_t *p, unsigned long long now) */ p->sleep_avg += sleep_time; - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { + if (p->sleep_avg > NS_MAX_SLEEP_AVG) p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } } } @@ -934,9 +925,10 @@ static inline unsigned long target_load(int cpu) #endif /* - * wake_idle() is useful especially on SMT architectures to wake a - * task onto an idle sibling if we would otherwise wake it onto a - * busy sibling. 
+ * wake_idle() will wake a task on an idle cpu if task->cpu is + * not idle and an idle cpu is available. The span of cpus to + * search starts with cpus closest then further out as needed, + * so we always favor a closer, idle cpu. * * Returns the CPU we should wake onto. */ @@ -944,24 +936,23 @@ static inline unsigned long target_load(int cpu) static int wake_idle(int cpu, task_t *p) { cpumask_t tmp; - runqueue_t *rq = cpu_rq(cpu); struct sched_domain *sd; int i; if (idle_cpu(cpu)) return cpu; - sd = rq->sd; - if (!(sd->flags & SD_WAKE_IDLE)) - return cpu; - - cpus_and(tmp, sd->span, p->cpus_allowed); - - for_each_cpu_mask(i, tmp) { - if (idle_cpu(i)) - return i; + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { + cpus_and(tmp, sd->span, cpu_online_map); + cpus_and(tmp, tmp, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) + return i; + } + } + else break; } - return cpu; } #else @@ -1073,7 +1064,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) out_set_cpu: schedstat_inc(rq, ttwu_attempts); new_cpu = wake_idle(new_cpu, p); - if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { + if (new_cpu != cpu) { schedstat_inc(rq, ttwu_moved); set_task_cpu(p, new_cpu); task_rq_unlock(rq, &flags); @@ -1126,7 +1117,7 @@ out: int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1191,7 +1182,7 @@ void fastcall sched_fork(task_t *p) */ current->time_slice = 1; preempt_disable(); - scheduler_tick(0, 0); + scheduler_tick(); local_irq_enable(); preempt_enable(); } else @@ -1227,8 +1218,6 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->interactive_credit = 0; - p->prio = effective_prio(p); if (likely(cpu == this_cpu)) { 
@@ -1667,13 +1656,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; - /* Aggressive migration if we've failed balancing */ - if (idle == NEWLY_IDLE || - sd->nr_balance_failed < sd->cache_nice_tries) { - if (task_hot(p, rq->timestamp_last_tick, sd)) - return 0; - } + /* + * Aggressive migration if: + * 1) the [whole] cpu is idle, or + * 2) too many balance attempts have failed. + */ + + if (cpu_and_siblings_are_idle(this_cpu) || \ + sd->nr_balance_failed > sd->cache_nice_tries) + return 1; + if (task_hot(p, rq->timestamp_last_tick, sd)) + return 0; return 1; } @@ -2009,7 +2003,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, if (sd->balance_interval < sd->max_interval) sd->balance_interval++; } else { - sd->nr_balance_failed = 0; + sd->nr_balance_failed = 0; /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; @@ -2088,23 +2082,6 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) } } -#ifdef CONFIG_SCHED_SMT -static int cpu_and_siblings_are_idle(int cpu) -{ - int sib; - for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { - if (idle_cpu(sib)) - continue; - return 0; - } - - return 1; -} -#else -#define cpu_and_siblings_are_idle(A) idle_cpu(A) -#endif - - /* * active_load_balance is run by migration threads. It pushes running tasks * off the busiest CPU onto idle CPUs. 
It requires at least 1 task to be @@ -2117,7 +2094,9 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) { struct sched_domain *sd; struct sched_group *cpu_group; + runqueue_t *target_rq; cpumask_t visited_cpus; + int cpu; schedstat_inc(busiest_rq, alb_cnt); /* @@ -2126,46 +2105,43 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) */ visited_cpus = CPU_MASK_NONE; for_each_domain(busiest_cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE) || busiest_rq->nr_running <= 1) - break; /* no more domains to search or no more tasks to move */ + if (!(sd->flags & SD_LOAD_BALANCE)) + /* no more domains to search */ + break; cpu_group = sd->groups; - do { /* sched_groups should either use list_heads or be merged into the domains structure */ - int cpu, target_cpu = -1; - runqueue_t *target_rq; - + do { for_each_cpu_mask(cpu, cpu_group->cpumask) { - if (cpu_isset(cpu, visited_cpus) || cpu == busiest_cpu || - !cpu_and_siblings_are_idle(cpu)) { - cpu_set(cpu, visited_cpus); + if (busiest_rq->nr_running <= 1) + /* no more tasks left to move */ + return; + if (cpu_isset(cpu, visited_cpus)) + continue; + cpu_set(cpu, visited_cpus); + if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) continue; - } - target_cpu = cpu; - break; - } - if (target_cpu == -1) - goto next_group; /* failed to find a suitable target cpu in this domain */ - - target_rq = cpu_rq(target_cpu); - /* - * This condition is "impossible", if it occurs we need to fix it - * Reported by Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE)) { - schedstat_inc(busiest_rq, alb_lost); - schedstat_inc(target_rq, alb_gained); - } else { - schedstat_inc(busiest_rq, alb_failed); + target_rq = cpu_rq(cpu); + /* + * This condition is "impossible", if it occurs + * we need to fix it. 
Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + if (move_tasks(target_rq, cpu, busiest_rq, + 1, sd, SCHED_IDLE)) { + schedstat_inc(busiest_rq, alb_lost); + schedstat_inc(target_rq, alb_gained); + } else { + schedstat_inc(busiest_rq, alb_failed); + } + spin_unlock(&target_rq->lock); } - spin_unlock(&target_rq->lock); -next_group: cpu_group = cpu_group->next; - } while (cpu_group != sd->groups && busiest_rq->nr_running > 1); + } while (cpu_group != sd->groups); } } @@ -2275,48 +2251,172 @@ EXPORT_PER_CPU_SYMBOL(kstat); ((rq)->curr->static_prio > (rq)->best_expired_prio)) /* + * Do the virtual cpu time signal calculations. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + */ +static inline void account_it_virt(struct task_struct * p, cputime_t cputime) +{ + cputime_t it_virt = p->it_virt_value; + + if (cputime_gt(it_virt, cputime_zero) && + cputime_gt(cputime, cputime_zero)) { + if (cputime_ge(cputime, it_virt)) { + it_virt = cputime_add(it_virt, p->it_virt_incr); + send_sig(SIGVTALRM, p, 1); + } + it_virt = cputime_sub(it_virt, cputime); + p->it_virt_value = it_virt; + } +} + +/* + * Do the virtual profiling signal calculations. 
+ * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user and kernel space since the last update + */ +static void account_it_prof(struct task_struct *p, cputime_t cputime) +{ + cputime_t it_prof = p->it_prof_value; + + if (cputime_gt(it_prof, cputime_zero) && + cputime_gt(cputime, cputime_zero)) { + if (cputime_ge(cputime, it_prof)) { + it_prof = cputime_add(it_prof, p->it_prof_incr); + send_sig(SIGPROF, p, 1); + } + it_prof = cputime_sub(it_prof, cputime); + p->it_prof_value = it_prof; + } +} + +/* + * Check if the process went over its cputime resource limit after + * some cpu time got added to utime/stime. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user and kernel space since the last update + */ +static void check_rlimit(struct task_struct *p, cputime_t cputime) +{ + cputime_t total, tmp; + + total = cputime_add(p->utime, p->stime); + tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_cur); + if (unlikely(cputime_gt(total, tmp))) { + /* Send SIGXCPU every second. */ + tmp = cputime_sub(total, cputime); + if (cputime_to_secs(tmp) < cputime_to_secs(total)) + send_sig(SIGXCPU, p, 1); + /* and SIGKILL when we go over max.. */ + tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max); + if (cputime_gt(total, tmp)) + send_sig(SIGKILL, p, 1); + } +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in user space since the last update + */ +void account_user_time(struct task_struct *p, cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->utime = cputime_add(p->utime, cputime); + + /* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). */ + check_rlimit(p, cputime); + account_it_virt(p, cputime); + account_it_prof(p, cputime); + + /* Add user time to cpustat. 
*/ + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = this_rq(); + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); + + /* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */ + if (likely(p->signal && p->exit_state < EXIT_ZOMBIE)) { + check_rlimit(p, cputime); + account_it_prof(p, cputime); + } + + /* Add system time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (p != rq->idle) + cpustat->system = cputime64_add(cpustat->system, tmp); + else if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); +} + +/* + * Account for involuntary wait time. 
+ * @p: the process from which the cpu time has been stolen + * @steal: the cpu time spent in involuntary wait + */ +void account_steal_time(struct task_struct *p, cputime_t steal) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp = cputime_to_cputime64(steal); + runqueue_t *rq = this_rq(); + + if (p == rq->idle) { + p->stime = cputime_add(p->stime, steal); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); + } else + cpustat->steal = cputime64_add(cpustat->steal, tmp); +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * * It also gets called by the fork code, when changing the parent's * timeslices. */ -void scheduler_tick(int user_ticks, int sys_ticks) +void scheduler_tick(void) { int cpu = smp_processor_id(); - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; runqueue_t *rq = this_rq(); task_t *p = current; rq->timestamp_last_tick = sched_clock(); - if (rcu_pending(cpu)) - rcu_check_callbacks(cpu, user_ticks); - - /* note: this timer irq context must be accounted for as well */ - if (hardirq_count() - HARDIRQ_OFFSET) { - cpustat->irq += sys_ticks; - sys_ticks = 0; - } else if (softirq_count()) { - cpustat->softirq += sys_ticks; - sys_ticks = 0; - } - if (p == rq->idle) { - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait += sys_ticks; - else - cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; rebalance_tick(cpu, rq, SCHED_IDLE); return; } - if (TASK_NICE(p) > 0) - cpustat->nice += user_ticks; - else - cpustat->user += user_ticks; - cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { @@ -2342,8 +2442,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) set_tsk_need_resched(p); /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - 
enqueue_task(p, rq->active); + requeue_task(p, rq->active); } goto out_unlock; } @@ -2384,10 +2483,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) (p->time_slice >= TIMESLICE_GRANULARITY(p)) && (p->array == rq->active)) { - dequeue_task(p, rq->active); + requeue_task(p, rq->active); set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); } } out_unlock: @@ -2521,6 +2618,38 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) } #endif +#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) + +void fastcall add_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(((int)preempt_count() < 0)); + preempt_count() += val; + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +} +EXPORT_SYMBOL(add_preempt_count); + +void fastcall sub_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(val > preempt_count()); + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + /* * schedule() is the main scheduler function. */ @@ -2540,7 +2669,7 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { + if (likely(!current->exit_state)) { if (unlikely(in_atomic())) { printk(KERN_ERR "scheduling while atomic: " "%s/0x%08x/%d\n", @@ -2561,7 +2690,7 @@ need_resched_nonpreemptible: * The idle thread is not allowed to schedule! * Remove this check after it has been exercised a bit. 
*/ - if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) { + if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); } @@ -2574,21 +2703,16 @@ need_resched_nonpreemptible: run_time = NS_MAX_SLEEP_AVG; /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status + * Tasks charged proportionately less run_time at high sleep_avg to + * delay them losing their interactive status */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? : 1); + run_time /= (CURRENT_BONUS(prev) ? : 1); spin_lock_irq(&rq->lock); - if (unlikely(current->flags & PF_DEAD)) - current->state = EXIT_DEAD; - /* - * if entering off of a kernel preemption go straight - * to picking the next task. - */ + if (unlikely(prev->flags & PF_DEAD)) + prev->state = EXIT_DEAD; + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; @@ -2670,11 +2794,8 @@ switch_tasks: rcu_qsctr_inc(task_cpu(prev)); prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { + if ((long)prev->sleep_avg <= 0) prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); @@ -2711,7 +2832,10 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); - +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. 
@@ -2720,9 +2844,21 @@ asmlinkage void __sched preempt_schedule(void) return; need_resched: - ti->preempt_count = PREEMPT_ACTIVE; + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif schedule(); - ti->preempt_count = 0; +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); /* we could miss a preemption opportunity between schedule and now */ barrier(); @@ -2955,7 +3091,7 @@ void set_user_nice(task_t *p, long nice) */ rq = task_rq_lock(p, &flags); /* - * The RT priorities are set via setscheduler(), but we still + * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is * not SCHED_NORMAL: @@ -3055,6 +3191,15 @@ int task_nice(const task_t *p) return TASK_NICE(p); } +/* + * The only users of task_nice are binfmt_elf and binfmt_elf32. + * binfmt_elf is no longer modular, but binfmt_elf32 still is. + * Therefore, task_nice is needed if there is a compat_mode. + */ +#ifdef CONFIG_COMPAT +EXPORT_SYMBOL_GPL(task_nice); +#endif + /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. @@ -3096,67 +3241,48 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) p->prio = p->static_prio; } -/* - * setscheduler - change the scheduling policy and/or RT priority of a thread. +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of + * a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. 
*/ -static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) +int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - struct sched_param lp; - int retval = -EINVAL; + int retval; int oldprio, oldpolicy = -1; prio_array_t *array; unsigned long flags; runqueue_t *rq; - task_t *p; - - if (!param || pid < 0) - goto out_nounlock; - retval = -EFAULT; - if (copy_from_user(&lp, param, sizeof(struct sched_param))) - goto out_nounlock; - - /* - * We play safe to avoid deadlocks. - */ - read_lock_irq(&tasklist_lock); - - p = find_process_by_pid(pid); - - retval = -ESRCH; - if (!p) - goto out_unlock; recheck: /* double check policy once rq lock held */ if (policy < 0) policy = oldpolicy = p->policy; - else { - retval = -EINVAL; - if (policy != SCHED_FIFO && policy != SCHED_RR && + else if (policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL) - goto out_unlock; - } + return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. */ - retval = -EINVAL; - if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) - goto out_unlock; - if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) - goto out_unlock; + if (param->sched_priority < 0 || + param->sched_priority > MAX_USER_RT_PRIO-1) + return -EINVAL; + if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) + return -EINVAL; - retval = -EPERM; if ((policy == SCHED_FIFO || policy == SCHED_RR) && !capable(CAP_SYS_NICE)) - goto out_unlock; + return -EPERM; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) - goto out_unlock; + return -EPERM; - retval = security_task_setscheduler(p, policy, &lp); + retval = security_task_setscheduler(p, policy, param); if (retval) - goto out_unlock; + return retval; /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. 
@@ -3170,12 +3296,11 @@ recheck: } array = p->array; if (array) - deactivate_task(p, task_rq(p)); - retval = 0; + deactivate_task(p, rq); oldprio = p->prio; - __setscheduler(p, policy, lp.sched_priority); + __setscheduler(p, policy, param->sched_priority); if (array) { - __activate_task(p, task_rq(p)); + __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3188,22 +3313,41 @@ recheck: resched_task(rq->curr); } task_rq_unlock(rq, &flags); -out_unlock: + return 0; +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + +static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + int retval; + struct sched_param lparam; + struct task_struct *p; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + read_lock_irq(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) { + read_unlock_irq(&tasklist_lock); + return -ESRCH; + } + retval = sched_setscheduler(p, policy, &lparam); read_unlock_irq(&tasklist_lock); -out_nounlock: return retval; } /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. - * @policy: new policy + * @policy: new policy. * @param: structure containing the new RT priority. 
*/ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { - return setscheduler(pid, policy, param); + return do_sched_setscheduler(pid, policy, param); } /** @@ -3213,7 +3357,7 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, */ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) { - return setscheduler(pid, -1, param); + return do_sched_setscheduler(pid, -1, param); } /** @@ -3444,8 +3588,14 @@ asmlinkage long sys_sched_yield(void) } else if (!rq->expired->nr_active) schedstat_inc(rq, yld_exp_empty); - dequeue_task(current, array); - enqueue_task(current, target); + if (array != target) { + dequeue_task(current, array); + enqueue_task(current, target); + } else + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); /* * Since we are going to call schedule() anyway, there's @@ -3460,13 +3610,71 @@ asmlinkage long sys_sched_yield(void) return 0; } -void __sched __cond_resched(void) +static inline void __cond_resched(void) { - set_current_state(TASK_RUNNING); - schedule(); + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + } while (need_resched()); +} + +int __sched cond_resched(void) +{ + if (need_resched()) { + __cond_resched(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched); + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). 
+ */ +int cond_resched_lock(spinlock_t * lock) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) + if (lock->break_lock) { + lock->break_lock = 0; + spin_unlock(lock); + cpu_relax(); + spin_lock(lock); + } +#endif + if (need_resched()) { + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + __cond_resched(); + spin_lock(lock); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_lock); + +int __sched cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (need_resched()) { + __local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; } -EXPORT_SYMBOL(__cond_resched); +EXPORT_SYMBOL(cond_resched_softirq); + /** * yield - yield the current processor to other threads. @@ -3491,7 +3699,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = this_rq(); + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); atomic_inc(&rq->nr_iowait); schedule(); @@ -3502,7 +3710,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = this_rq(); + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); long ret; atomic_inc(&rq->nr_iowait); @@ -3699,7 +3907,6 @@ void __devinit init_idle(task_t *idle, int cpu) unsigned long flags; idle->sleep_avg = 0; - idle->interactive_credit = 0; idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; @@ -3711,7 +3918,7 @@ void __devinit init_idle(task_t *idle, int cpu) spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! 
*/ -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) idle->thread_info->preempt_count = (idle->lock_depth >= 0); #else idle->thread_info->preempt_count = 0; @@ -3879,8 +4086,7 @@ static int migration_thread(void * data) if (req->type == REQ_MOVE_TASK) { spin_unlock(&rq->lock); - __migrate_task(req->task, smp_processor_id(), - req->dest_cpu); + __migrate_task(req->task, cpu, req->dest_cpu); local_irq_enable(); } else if (req->type == REQ_SET_DOMAIN) { rq->sd = req->sd; @@ -4004,6 +4210,20 @@ void sched_idle_next(void) spin_unlock_irqrestore(&rq->lock, flags); } +/* Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); +} + static void migrate_dead(unsigned int dead_cpu, task_t *tsk) { struct runqueue *rq = cpu_rq(dead_cpu); @@ -4136,6 +4356,94 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP +#define SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + do { + int i; + char str[NR_CPUS]; + struct sched_group *group = sd->groups; + cpumask_t groupmask; + + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (i = 0; i < level + 1; i++) + printk(" "); + printk("domain %d: ", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + break; + } + + printk("span %s\n", str); + + if (!cpu_isset(cpu, sd->span)) + printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + if (!cpu_isset(cpu, group->cpumask)) + printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + + 
printk(KERN_DEBUG); + for (i = 0; i < level + 2; i++) + printk(" "); + printk("groups:"); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!group->cpu_power) { + printk("\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + } + + if (!cpus_weight(group->cpumask)) { + printk("\n"); + printk(KERN_ERR "ERROR: empty group\n"); + } + + if (cpus_intersects(groupmask, group->cpumask)) { + printk("\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + } + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != sd->groups); + printk("\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + level++; + sd = sd->parent; + + if (sd) { + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); + } + + } while (sd); +} +#else +#define sched_domain_debug(sd, cpu) {} +#endif + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. 
@@ -4147,6 +4455,8 @@ void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) runqueue_t *rq = cpu_rq(cpu); int local = 1; + sched_domain_debug(sd, cpu); + spin_lock_irqsave(&rq->lock, flags); if (cpu == smp_processor_id() || !cpu_online(cpu)) { @@ -4178,7 +4488,8 @@ static int __init isolated_cpu_setup(char *str) str = get_options(str, ARRAY_SIZE(ints), ints); cpus_clear(cpu_isolated_map); for (i = 1; i <= ints[0]; i++) - cpu_set(ints[i], cpu_isolated_map); + if (ints[i] < NR_CPUS) + cpu_set(ints[i], cpu_isolated_map); return 1; } @@ -4421,96 +4732,6 @@ static void __devinit arch_destroy_sched_domains(void) #endif /* ARCH_HAS_SCHED_DOMAIN */ -#define SCHED_DOMAIN_DEBUG -#ifdef SCHED_DOMAIN_DEBUG -static void sched_domain_debug(void) -{ - int i; - - for_each_online_cpu(i) { - runqueue_t *rq = cpu_rq(i); - struct sched_domain *sd; - int level = 0; - - sd = rq->sd; - - printk(KERN_DEBUG "CPU%d:\n", i); - - do { - int j; - char str[NR_CPUS]; - struct sched_group *group = sd->groups; - cpumask_t groupmask; - - cpumask_scnprintf(str, NR_CPUS, sd->span); - cpus_clear(groupmask); - - printk(KERN_DEBUG); - for (j = 0; j < level + 1; j++) - printk(" "); - printk("domain %d: ", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance"); - if (sd->parent) - printk(" ERROR !SD_LOAD_BALANCE domain has parent"); - printk("\n"); - break; - } - - printk("span %s\n", str); - - if (!cpu_isset(i, sd->span)) - printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); - if (!cpu_isset(i, group->cpumask)) - printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); - - printk(KERN_DEBUG); - for (j = 0; j < level + 2; j++) - printk(" "); - printk("groups:"); - do { - if (!group) { - printk(" ERROR: NULL"); - break; - } - - if (!group->cpu_power) - printk(KERN_DEBUG "ERROR group->cpu_power not set\n"); - - if (!cpus_weight(group->cpumask)) - printk(" ERROR empty group:"); - - if (cpus_intersects(groupmask, group->cpumask)) - 
printk(" ERROR repeated CPUs:"); - - cpus_or(groupmask, groupmask, group->cpumask); - - cpumask_scnprintf(str, NR_CPUS, group->cpumask); - printk(" %s", str); - - group = group->next; - } while (group != sd->groups); - printk("\n"); - - if (!cpus_equal(sd->span, groupmask)) - printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); - - level++; - sd = sd->parent; - - if (sd) { - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); - } - - } while (sd); - } -} -#else -#define sched_domain_debug() {} -#endif - /* * Initial dummy domain for early boot and for hotplug cpu. Being static, * it is initialized to zero, so all balancing flags are cleared which is @@ -4553,8 +4774,6 @@ static int update_sched_domains(struct notifier_block *nfb, /* The hotplug lock is already held by cpu_up/cpu_down */ arch_init_sched_domains(); - sched_domain_debug(); - return NOTIFY_OK; } #endif @@ -4563,7 +4782,6 @@ void __init sched_init_smp(void) { lock_cpu_hotplug(); arch_init_sched_domains(); - sched_domain_debug(); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); @@ -4640,7 +4858,7 @@ void __might_sleep(char *file, int line) static unsigned long prev_jiffy; /* ratelimiting */ if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING) { + system_state == SYSTEM_RUNNING && !oops_in_progress) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; diff --git a/kernel/signal.c b/kernel/signal.c index cc50f2504365..d98e9624ea30 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -153,11 +153,6 @@ static kmem_cache_t *sigqueue_cachep; (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) -#define sig_avoid_stop_race() \ - (sigtestsetmask(¤t->pending.signal, M(SIGCONT) | M(SIGKILL)) || \ - 
sigtestsetmask(¤t->signal->shared_pending.signal, \ - M(SIGCONT) | M(SIGKILL))) - static int sig_ignored(struct task_struct *t, int sig) { void __user * handler; @@ -380,8 +375,8 @@ void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime += tsk->utime; - sig->stime += tsk->stime; + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -551,6 +546,21 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); + if (signr && unlikely(sig_kernel_stop(signr))) { + /* + * Set a marker that we have dequeued a stop signal. Our + * caller might release the siglock and then the pending + * stop signal it is about to process is no longer in the + * pending bitmasks, but must still be cleared by a SIGCONT + * (and overruled by a SIGKILL). So those cases clear this + * shared flag after we've set it. Note that this flag may + * remain set after the signal we return is ignored or + * handled. That doesn't matter because its only purpose + * is to alert stop-signal processing code when another + * processor has come along and cleared the flag. + */ + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + } if ( signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ @@ -577,15 +587,15 @@ void signal_wake_up(struct task_struct *t, int resume) set_tsk_thread_flag(t, TIF_SIGPENDING); /* - * If resume is set, we want to wake it up in the TASK_STOPPED case. - * We don't check for TASK_STOPPED because there is a race with it + * For SIGKILL, we want to wake it up in the stopped/traced case. + * We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. 
- * By calling wake_up_process any time resume is set, we ensure - * the process will wake up and handle its stop or death signal. + * By using wake_up_state, we ensure the process will wake up and + * handle its death signal. */ mask = TASK_INTERRUPTIBLE; if (resume) - mask |= TASK_STOPPED; + mask |= TASK_STOPPED | TASK_TRACED; if (!wake_up_state(t, mask)) kick_process(t); } @@ -651,6 +661,12 @@ static void handle_stop_signal(int sig, struct task_struct *p) { struct task_struct *t; + if (p->flags & SIGNAL_GROUP_EXIT) + /* + * The process is in the middle of dying already. + */ + return; + if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. @@ -680,7 +696,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) * the SIGCHLD was pending on entry to this kill. */ p->signal->group_stop_count = 0; - p->signal->stop_state = 1; + p->signal->flags = SIGNAL_STOP_CONTINUED; spin_unlock(&p->sighand->siglock); if (p->ptrace & PT_PTRACED) do_notify_parent_cldstop(p, p->parent, @@ -722,12 +738,12 @@ static void handle_stop_signal(int sig, struct task_struct *p) t = next_thread(t); } while (t != p); - if (p->signal->stop_state > 0) { + if (p->signal->flags & SIGNAL_STOP_STOPPED) { /* * We were in fact stopped, and are now continued. * Notify the parent with CLD_CONTINUED. */ - p->signal->stop_state = -1; + p->signal->flags = SIGNAL_STOP_CONTINUED; p->signal->group_exit_code = 0; spin_unlock(&p->sighand->siglock); if (p->ptrace & PT_PTRACED) @@ -739,7 +755,20 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->group_leader->real_parent, CLD_CONTINUED); spin_lock(&p->sighand->siglock); + } else { + /* + * We are not stopped, but there could be a stop + * signal in the middle of being processed after + * being removed from the queue. Clear that too. + */ + p->signal->flags = 0; } + } else if (sig == SIGKILL) { + /* + * Make sure that any pending stop signal already dequeued + * is undone by the wakeup for SIGKILL. 
+ */ + p->signal->flags = 0; } } @@ -905,12 +934,12 @@ __group_complete_signal(int sig, struct task_struct *p) struct task_struct *t; /* - * Don't bother zombies and stopped tasks (but - * SIGKILL will punch through stopped state) + * Don't bother traced and stopped tasks (but + * SIGKILL will punch through that). */ - mask = EXIT_DEAD | EXIT_ZOMBIE | TASK_TRACED; - if (sig != SIGKILL) - mask |= TASK_STOPPED; + mask = TASK_STOPPED | TASK_TRACED; + if (sig == SIGKILL) + mask = 0; /* * Now find a thread we can wake up to take the signal off the queue. @@ -953,7 +982,7 @@ __group_complete_signal(int sig, struct task_struct *p) * Found a killable thread. If the signal will be fatal, * then start taking the whole group down immediately. */ - if (sig_fatal(p, sig) && !p->signal->group_exit && + if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { /* @@ -966,7 +995,7 @@ __group_complete_signal(int sig, struct task_struct *p) * running and doing things after a slower * thread has the fatal signal pending. */ - p->signal->group_exit = 1; + p->signal->flags = SIGNAL_GROUP_EXIT; p->signal->group_exit_code = sig; p->signal->group_stop_count = 0; t = p; @@ -1055,6 +1084,7 @@ void zap_other_threads(struct task_struct *p) { struct task_struct *t; + p->signal->flags = SIGNAL_GROUP_EXIT; p->signal->group_stop_count = 0; if (thread_group_empty(p)) @@ -1064,7 +1094,7 @@ void zap_other_threads(struct task_struct *p) /* * Don't bother with already dead threads */ - if (t->exit_state & (EXIT_ZOMBIE|EXIT_DEAD)) + if (t->exit_state) continue; /* @@ -1407,28 +1437,12 @@ out: } /* - * Joy. Or not. Pthread wants us to wake up every thread - * in our parent group. + * Wake up any threads in the parent blocked in wait* syscalls. 
*/ -static void __wake_up_parent(struct task_struct *p, +static inline void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { - struct task_struct *tsk = parent; - - /* - * Fortunately this is not necessary for thread groups: - */ - if (p->tgid == tsk->tgid) { - wake_up_interruptible_sync(&tsk->wait_chldexit); - return; - } - - do { - wake_up_interruptible_sync(&tsk->wait_chldexit); - tsk = next_thread(tsk); - if (tsk->signal != parent->signal) - BUG(); - } while (tsk != parent); + wake_up_interruptible_sync(&parent->signal->wait_chldexit); } /* @@ -1442,8 +1456,7 @@ void do_notify_parent(struct task_struct *tsk, int sig) unsigned long flags; struct sighand_struct *psig; - if (sig == -1) - BUG(); + BUG_ON(sig == -1); /* do_notify_parent_cldstop should have been called instead. */ BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED)); @@ -1457,8 +1470,10 @@ void do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; /* FIXME: find out whether or not this is supposed to be c*time. */ - info.si_utime = tsk->utime + tsk->signal->utime; - info.si_stime = tsk->stime + tsk->signal->stime; + info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime, + tsk->signal->utime)); + info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime, + tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) @@ -1514,8 +1529,8 @@ do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, info.si_uid = tsk->uid; /* FIXME: find out whether or not this is supposed to be c*time. */ - info.si_utime = tsk->utime; - info.si_stime = tsk->stime; + info.si_utime = cputime_to_jiffies(tsk->utime); + info.si_stime = cputime_to_jiffies(tsk->stime); info.si_code = why; switch (why) { @@ -1551,11 +1566,12 @@ do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, * We always set current->last_siginfo while stopped here. 
* That makes it a way to test a stopped process for * being ptrace-stopped vs being job-control-stopped. + * + * If we actually decide not to stop at all because the tracer is gone, + * we leave nostop_code in current->exit_code. */ -static void ptrace_stop(int exit_code, siginfo_t *info) +static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) { - BUG_ON(!(current->ptrace & PT_PTRACED)); - /* * If there is a group stop in progress, * we must participate in the bookkeeping. @@ -1570,9 +1586,24 @@ static void ptrace_stop(int exit_code, siginfo_t *info) set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, current->parent, CLD_TRAPPED); - read_unlock(&tasklist_lock); - schedule(); + if (likely(current->ptrace & PT_PTRACED) && + likely(current->parent != current->real_parent || + !(current->ptrace & PT_ATTACHED)) && + (likely(current->parent->signal != current->signal) || + !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { + do_notify_parent_cldstop(current, current->parent, + CLD_TRAPPED); + read_unlock(&tasklist_lock); + schedule(); + } else { + /* + * By the time we got the lock, our tracer went away. + * Don't stop here. + */ + read_unlock(&tasklist_lock); + set_current_state(TASK_RUNNING); + current->exit_code = nostop_code; + } /* * We are back. Now reacquire the siglock before touching @@ -1603,7 +1634,7 @@ void ptrace_notify(int exit_code) /* Let the debugger run. */ spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, &info); + ptrace_stop(exit_code, 0, &info); spin_unlock_irq(¤t->sighand->siglock); } @@ -1641,15 +1672,18 @@ finish_stop(int stop_count) /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. + * Returns nonzero if we've actually stopped and released the siglock. + * Returns zero if we didn't stop and still hold the siglock. 
*/ -static void +static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; struct sighand_struct *sighand = current->sighand; int stop_count = -1; - /* spin_lock_irq(&sighand->siglock) is now done in caller */ + if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) + return 0; if (sig->group_stop_count > 0) { /* @@ -1661,7 +1695,7 @@ do_signal_stop(int signr) current->exit_code = signr; set_current_state(TASK_STOPPED); if (stop_count == 0) - sig->stop_state = 1; + sig->flags = SIGNAL_STOP_STOPPED; spin_unlock_irq(&sighand->siglock); } else if (thread_group_empty(current)) { @@ -1670,7 +1704,7 @@ do_signal_stop(int signr) */ current->exit_code = current->signal->group_exit_code = signr; set_current_state(TASK_STOPPED); - sig->stop_state = 1; + sig->flags = SIGNAL_STOP_STOPPED; spin_unlock_irq(&sighand->siglock); } else { @@ -1691,25 +1725,16 @@ do_signal_stop(int signr) read_lock(&tasklist_lock); spin_lock_irq(&sighand->siglock); - if (unlikely(sig->group_exit)) { + if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) { /* - * There is a group exit in progress now. - * We'll just ignore the stop and process the - * associated fatal signal. + * Another stop or continue happened while we + * didn't have the lock. We can just swallow this + * signal now. If we raced with a SIGCONT, that + * should have just cleared it now. If we raced + * with another processor delivering a stop signal, + * then the SIGCONT that wakes us up should clear it. */ - spin_unlock_irq(&sighand->siglock); - read_unlock(&tasklist_lock); - return; - } - - if (unlikely(sig_avoid_stop_race())) { - /* - * Either a SIGCONT or a SIGKILL signal was - * posted in the siglock-not-held window. 
- */ - spin_unlock_irq(&sighand->siglock); - read_unlock(&tasklist_lock); - return; + return 0; } if (sig->group_stop_count == 0) { @@ -1737,13 +1762,14 @@ do_signal_stop(int signr) current->exit_code = signr; set_current_state(TASK_STOPPED); if (stop_count == 0) - sig->stop_state = 1; + sig->flags = SIGNAL_STOP_STOPPED; spin_unlock_irq(&sighand->siglock); read_unlock(&tasklist_lock); } finish_stop(stop_count); + return 1; } /* @@ -1765,7 +1791,7 @@ static inline int handle_group_stop(void) return 0; } - if (current->signal->group_exit) + if (current->signal->flags & SIGNAL_GROUP_EXIT) /* * Group stop is so another thread can do a core dump, * or else we are racing against a death signal. @@ -1779,7 +1805,7 @@ static inline int handle_group_stop(void) */ stop_count = --current->signal->group_stop_count; if (stop_count == 0) - current->signal->stop_state = 1; + current->signal->flags = SIGNAL_STOP_STOPPED; current->exit_code = current->signal->group_exit_code; set_current_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); @@ -1811,7 +1837,7 @@ relock: ptrace_signal_deliver(regs, cookie); /* Let the debugger run. */ - ptrace_stop(signr, info); + ptrace_stop(signr, signr, info); /* We're back. Did the debugger cancel the sig? */ signr = current->exit_code; @@ -1873,28 +1899,27 @@ relock: * This allows an intervening SIGCONT to be posted. * We need to check for that and bail out if necessary. 
*/ - if (signr == SIGSTOP) { - do_signal_stop(signr); /* releases siglock */ - goto relock; - } - spin_unlock_irq(¤t->sighand->siglock); + if (signr != SIGSTOP) { + spin_unlock_irq(¤t->sighand->siglock); - /* signals can be posted during this window */ + /* signals can be posted during this window */ - if (is_orphaned_pgrp(process_group(current))) - goto relock; + if (is_orphaned_pgrp(process_group(current))) + goto relock; - spin_lock_irq(¤t->sighand->siglock); - if (unlikely(sig_avoid_stop_race())) { - /* - * Either a SIGCONT or a SIGKILL signal was - * posted in the siglock-not-held window. - */ - continue; + spin_lock_irq(¤t->sighand->siglock); } - do_signal_stop(signr); /* releases siglock */ - goto relock; + if (likely(do_signal_stop(signr))) { + /* It released the siglock. */ + goto relock; + } + + /* + * We didn't actually stop, due to a race + * with SIGCONT or something like that. + */ + continue; } spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/softirq.c b/kernel/softirq.c index 7572ca9ece74..582a1e8091bc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -142,7 +142,7 @@ void local_bh_enable(void) * Keep preemption disabled until we are done with * softirq processing: */ - preempt_count() -= SOFTIRQ_OFFSET - 1; + sub_preempt_count(SOFTIRQ_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); @@ -152,6 +152,24 @@ void local_bh_enable(void) } EXPORT_SYMBOL(local_bh_enable); +#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED +# define invoke_softirq() __do_softirq() +#else +# define invoke_softirq() do_softirq() +#endif + +/* + * Exit an interrupt context. Process softirqs if needed and possible: + */ +void irq_exit(void) +{ + account_system_vtime(current); + sub_preempt_count(IRQ_EXIT_OFFSET); + if (!in_interrupt() && local_softirq_pending()) + invoke_softirq(); + preempt_enable_no_resched(); +} + /* * This function must run with irqs disabled! 
*/ diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 476da1fd86f4..beacf8b7cee7 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -2,6 +2,8 @@ * Copyright (2004) Linus Torvalds * * Author: Zwane Mwaikambo <zwane@fsmlabs.com> + * + * Copyright (2004) Ingo Molnar */ #include <linux/config.h> @@ -11,6 +13,17 @@ #include <linux/interrupt.h> #include <linux/module.h> +/* + * Generic declaration of the raw read_trylock() function, + * architectures are supposed to optimize this: + */ +int __lockfunc generic_raw_read_trylock(rwlock_t *lock) +{ + _raw_read_lock(lock); + return 1; +} +EXPORT_SYMBOL(generic_raw_read_trylock); + int __lockfunc _spin_trylock(spinlock_t *lock) { preempt_disable(); @@ -22,86 +35,29 @@ int __lockfunc _spin_trylock(spinlock_t *lock) } EXPORT_SYMBOL(_spin_trylock); -int __lockfunc _write_trylock(rwlock_t *lock) +int __lockfunc _read_trylock(rwlock_t *lock) { preempt_disable(); - if (_raw_write_trylock(lock)) + if (_raw_read_trylock(lock)) return 1; preempt_enable(); return 0; } -EXPORT_SYMBOL(_write_trylock); - -#ifdef CONFIG_PREEMPT -/* - * This could be a long-held lock. If another CPU holds it for a long time, - * and that CPU is not asked to reschedule then *this* CPU will spin on the - * lock for a long time, even if *this* CPU is asked to reschedule. - * - * So what we do here, in the slow (contended) path is to spin on the lock by - * hand while permitting preemption. - * - * Called inside preempt_disable(). 
- */ -static inline void __preempt_spin_lock(spinlock_t *lock) -{ - if (preempt_count() > 1) { - _raw_spin_lock(lock); - return; - } - - do { - preempt_enable(); - while (spin_is_locked(lock)) - cpu_relax(); - preempt_disable(); - } while (!_raw_spin_trylock(lock)); -} +EXPORT_SYMBOL(_read_trylock); -void __lockfunc _spin_lock(spinlock_t *lock) +int __lockfunc _write_trylock(rwlock_t *lock) { preempt_disable(); - if (unlikely(!_raw_spin_trylock(lock))) - __preempt_spin_lock(lock); -} - -static inline void __preempt_write_lock(rwlock_t *lock) -{ - if (preempt_count() > 1) { - _raw_write_lock(lock); - return; - } - - do { - preempt_enable(); - while (rwlock_is_locked(lock)) - cpu_relax(); - preempt_disable(); - } while (!_raw_write_trylock(lock)); -} + if (_raw_write_trylock(lock)) + return 1; -void __lockfunc _write_lock(rwlock_t *lock) -{ - preempt_disable(); - if (unlikely(!_raw_write_trylock(lock))) - __preempt_write_lock(lock); -} -#else -void __lockfunc _spin_lock(spinlock_t *lock) -{ - preempt_disable(); - _raw_spin_lock(lock); + preempt_enable(); + return 0; } +EXPORT_SYMBOL(_write_trylock); -void __lockfunc _write_lock(rwlock_t *lock) -{ - preempt_disable(); - _raw_write_lock(lock); -} -#endif -EXPORT_SYMBOL(_spin_lock); -EXPORT_SYMBOL(_write_lock); +#ifndef CONFIG_PREEMPT void __lockfunc _read_lock(rwlock_t *lock) { @@ -110,27 +66,6 @@ void __lockfunc _read_lock(rwlock_t *lock) } EXPORT_SYMBOL(_read_lock); -void __lockfunc _spin_unlock(spinlock_t *lock) -{ - _raw_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(_spin_unlock); - -void __lockfunc _write_unlock(rwlock_t *lock) -{ - _raw_write_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(_write_unlock); - -void __lockfunc _read_unlock(rwlock_t *lock) -{ - _raw_read_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(_read_unlock); - unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) { unsigned long flags; @@ -212,6 +147,132 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) } 
EXPORT_SYMBOL(_write_lock_bh); +void __lockfunc _spin_lock(spinlock_t *lock) +{ + preempt_disable(); + _raw_spin_lock(lock); +} + +EXPORT_SYMBOL(_spin_lock); + +void __lockfunc _write_lock(rwlock_t *lock) +{ + preempt_disable(); + _raw_write_lock(lock); +} + +EXPORT_SYMBOL(_write_lock); + +#else /* CONFIG_PREEMPT: */ + +/* + * This could be a long-held lock. We both prepare to spin for a long + * time (making _this_ CPU preemptable if possible), and we also signal + * towards that other CPU that it should break the lock ASAP. + * + * (We do this in a function because inlining it would be excessive.) + */ + +#define BUILD_LOCK_OPS(op, locktype, is_locked_fn) \ +void __lockfunc _##op##_lock(locktype *lock) \ +{ \ + preempt_disable(); \ + for (;;) { \ + if (likely(_raw_##op##_trylock(lock))) \ + break; \ + preempt_enable(); \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (is_locked_fn(lock) && (lock)->break_lock) \ + cpu_relax(); \ + preempt_disable(); \ + } \ +} \ + \ +EXPORT_SYMBOL(_##op##_lock); \ + \ +unsigned long __lockfunc _##op##_lock_irqsave(locktype *lock) \ +{ \ + unsigned long flags; \ + \ + preempt_disable(); \ + for (;;) { \ + local_irq_save(flags); \ + if (likely(_raw_##op##_trylock(lock))) \ + break; \ + local_irq_restore(flags); \ + \ + preempt_enable(); \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (is_locked_fn(lock) && (lock)->break_lock) \ + cpu_relax(); \ + preempt_disable(); \ + } \ + return flags; \ +} \ + \ +EXPORT_SYMBOL(_##op##_lock_irqsave); \ + \ +void __lockfunc _##op##_lock_irq(locktype *lock) \ +{ \ + _##op##_lock_irqsave(lock); \ +} \ + \ +EXPORT_SYMBOL(_##op##_lock_irq); \ + \ +void __lockfunc _##op##_lock_bh(locktype *lock) \ +{ \ + unsigned long flags; \ + \ + /* */ \ + /* Careful: we must exclude softirqs too, hence the */ \ + /* irq-disabling. 
We use the generic preemption-aware */ \ + /* function: */ \ + /**/ \ + flags = _##op##_lock_irqsave(lock); \ + local_bh_disable(); \ + local_irq_restore(flags); \ +} \ + \ +EXPORT_SYMBOL(_##op##_lock_bh) + +/* + * Build preemption-friendly versions of the following + * lock-spinning functions: + * + * _[spin|read|write]_lock() + * _[spin|read|write]_lock_irq() + * _[spin|read|write]_lock_irqsave() + * _[spin|read|write]_lock_bh() + */ +BUILD_LOCK_OPS(spin, spinlock_t, spin_is_locked); +BUILD_LOCK_OPS(read, rwlock_t, rwlock_is_locked); +BUILD_LOCK_OPS(write, rwlock_t, spin_is_locked); + +#endif /* CONFIG_PREEMPT */ + +void __lockfunc _spin_unlock(spinlock_t *lock) +{ + _raw_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(_spin_unlock); + +void __lockfunc _write_unlock(rwlock_t *lock) +{ + _raw_write_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(_write_unlock); + +void __lockfunc _read_unlock(rwlock_t *lock) +{ + _raw_read_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(_read_unlock); + void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { _raw_spin_unlock(lock); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2ceea25f67f6..e31b1cb8e503 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -95,7 +95,7 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { - if (i == smp_processor_id()) + if (i == _smp_processor_id()) continue; ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); if (ret < 0) @@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, /* If they don't care which CPU fn runs on, bind to any online one. 
*/ if (cpu == NR_CPUS) - cpu = smp_processor_id(); + cpu = _smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { diff --git a/kernel/sys.c b/kernel/sys.c index fdc29f17ac93..6e354fd380e7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -23,6 +23,7 @@ #include <linux/security.h> #include <linux/dcookies.h> #include <linux/suspend.h> +#include <linux/tty.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -88,7 +89,7 @@ int cad_pid = 1; */ static struct notifier_block *reboot_notifier_list; -rwlock_t notifier_lock = RW_LOCK_UNLOCKED; +DEFINE_RWLOCK(notifier_lock); /** * notifier_chain_register - Add notifier to a notifier chain @@ -892,15 +893,15 @@ asmlinkage long sys_times(struct tms __user * tbuf) struct tms tmp; struct task_struct *tsk = current; struct task_struct *t; - unsigned long utime, stime, cutime, cstime; + cputime_t utime, stime, cutime, cstime; read_lock(&tasklist_lock); utime = tsk->signal->utime; stime = tsk->signal->stime; t = tsk; do { - utime += t->utime; - stime += t->stime; + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); t = next_thread(t); } while (t != tsk); @@ -919,10 +920,10 @@ asmlinkage long sys_times(struct tms __user * tbuf) spin_unlock_irq(&tsk->sighand->siglock); read_unlock(&tasklist_lock); - tmp.tms_utime = jiffies_to_clock_t(utime); - tmp.tms_stime = jiffies_to_clock_t(stime); - tmp.tms_cutime = jiffies_to_clock_t(cutime); - tmp.tms_cstime = jiffies_to_clock_t(cstime); + tmp.tms_utime = cputime_to_clock_t(utime); + tmp.tms_stime = cputime_to_clock_t(stime); + tmp.tms_cutime = cputime_to_clock_t(cutime); + tmp.tms_cstime = cputime_to_clock_t(cstime); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@ -1075,6 +1076,7 @@ asmlinkage long sys_setsid(void) if (!thread_group_leader(current)) return -EINVAL; + down(&tty_sem); write_lock_irq(&tasklist_lock); pid = find_pid(PIDTYPE_PGID, current->pid); @@ -1088,6 +1090,7 @@ asmlinkage long 
sys_setsid(void) err = process_group(current); out: write_unlock_irq(&tasklist_lock); + up(&tty_sem); return err; } @@ -1525,7 +1528,7 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; - unsigned long utime, stime; + cputime_t utime, stime; memset((char *) r, 0, sizeof *r); @@ -1542,12 +1545,12 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; spin_unlock_irqrestore(&p->sighand->siglock, flags); - jiffies_to_timeval(utime, &r->ru_utime); - jiffies_to_timeval(stime, &r->ru_stime); + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); break; case RUSAGE_SELF: spin_lock_irqsave(&p->sighand->siglock, flags); - utime = stime = 0; + utime = stime = cputime_zero; goto sum_group; case RUSAGE_BOTH: spin_lock_irqsave(&p->sighand->siglock, flags); @@ -1558,16 +1561,16 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; sum_group: - utime += p->signal->utime; - stime += p->signal->stime; + utime = cputime_add(utime, p->signal->utime); + stime = cputime_add(stime, p->signal->stime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; r->ru_majflt += p->signal->maj_flt; t = p; do { - utime += t->utime; - stime += t->stime; + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; @@ -1575,8 +1578,8 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r) t = next_thread(t); } while (t != p); spin_unlock_irqrestore(&p->sighand->siglock, flags); - jiffies_to_timeval(utime, &r->ru_utime); - jiffies_to_timeval(stime, &r->ru_stime); + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); break; default: BUG(); @@ 
-1689,6 +1692,15 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, set_task_comm(me, ncomm); return 0; } + case PR_GET_NAME: { + struct task_struct *me = current; + unsigned char tcomm[sizeof(me->comm)]; + + get_task_comm(tcomm, me); + if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) + return -EFAULT; + return 0; + } default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 568b4579ef24..85503726d60b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -52,7 +52,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int panic_timeout; extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; @@ -765,6 +764,7 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, +#ifdef CONFIG_MMU { .ctl_name = VM_MAX_MAP_COUNT, .procname = "max_map_count", @@ -773,6 +773,7 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#endif { .ctl_name = VM_LAPTOP_MODE, .procname = "laptop_mode", @@ -914,6 +915,7 @@ static ctl_table fs_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_MMU { .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", @@ -938,6 +940,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif { .ctl_name = 0 } }; diff --git a/kernel/time.c b/kernel/time.c index b6d01cf709c4..d5400f6af052 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -33,6 +33,7 @@ #include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/security.h> +#include <linux/fs.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -52,12 +53,10 @@ EXPORT_SYMBOL(sys_tz); * sys_gettimeofday(). Is this for backwards compatibility? If so, * why not move it into the appropriate arch directory (for those * architectures that need it). - * - * XXX This function is NOT 64-bit clean! 
*/ -asmlinkage long sys_time(int __user * tloc) +asmlinkage long sys_time(time_t __user * tloc) { - int i; + time_t i; struct timeval tv; do_gettimeofday(&tv); @@ -417,7 +416,7 @@ asmlinkage long sys_adjtimex(struct timex __user *txc_p) return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } -struct timespec current_kernel_time(void) +inline struct timespec current_kernel_time(void) { struct timespec now; unsigned long seq; @@ -433,6 +432,50 @@ struct timespec current_kernel_time(void) EXPORT_SYMBOL(current_kernel_time); +/** + * current_fs_time - Return FS time + * @sb: Superblock. + * + * Return the current time truncated to the time granuality supported by + * the fs. + */ +struct timespec current_fs_time(struct super_block *sb) +{ + struct timespec now = current_kernel_time(); + return timespec_trunc(now, sb->s_time_gran); +} +EXPORT_SYMBOL(current_fs_time); + +/** + * timespec_trunc - Truncate timespec to a granuality + * @t: Timespec + * @gran: Granuality in ns. + * + * Truncate a timespec to a granuality. gran must be smaller than a second. + * Always rounds down. + * + * This function should be only used for timestamps returned by + * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because + * it doesn't handle the better resolution of the later. + */ +struct timespec timespec_trunc(struct timespec t, unsigned gran) +{ + /* + * Division is pretty slow so avoid it for common cases. + * Currently current_kernel_time() never returns better than + * jiffies resolution. Exploit that. 
+ */ + if (gran <= jiffies_to_usecs(1) * 1000) { + /* nothing */ + } else if (gran == 1000000000) { + t.tv_nsec = 0; + } else { + t.tv_nsec -= t.tv_nsec % gran; + } + return t; +} +EXPORT_SYMBOL(timespec_trunc); + #ifdef CONFIG_TIME_INTERPOLATION void getnstimeofday (struct timespec *tv) { diff --git a/kernel/timer.c b/kernel/timer.c index d0eed9b563c4..6bb47b0e4983 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -465,7 +465,14 @@ repeat: smp_wmb(); timer->base = NULL; spin_unlock_irq(&base->lock); - fn(data); + { + u32 preempt_count = preempt_count(); + fn(data); + if (preempt_count != preempt_count()) { + printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); + BUG(); + } + } spin_lock_irq(&base->lock); goto repeat; } @@ -554,7 +561,7 @@ unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ /* * The current time * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged at zero + * for sub jiffie times) to get to monotonic time. Monotonic is pegged * at zero at system boot time, so wall_to_monotonic will be negative, * however, we will ALWAYS keep the tv_nsec part positive so we can use * the usual normalization. @@ -799,59 +806,6 @@ static void update_wall_time(unsigned long ticks) } while (ticks); } -static inline void do_process_times(struct task_struct *p, - unsigned long user, unsigned long system) -{ - unsigned long psecs; - - psecs = (p->utime += user); - psecs += (p->stime += system); - if (p->signal && !unlikely(p->state & (EXIT_DEAD|EXIT_ZOMBIE)) && - psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) { - /* Send SIGXCPU every second.. */ - if (!(psecs % HZ)) - send_sig(SIGXCPU, p, 1); - /* and SIGKILL when we go over max.. 
*/ - if (psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_max) - send_sig(SIGKILL, p, 1); - } -} - -static inline void do_it_virt(struct task_struct * p, unsigned long ticks) -{ - unsigned long it_virt = p->it_virt_value; - - if (it_virt) { - it_virt -= ticks; - if (!it_virt) { - it_virt = p->it_virt_incr; - send_sig(SIGVTALRM, p, 1); - } - p->it_virt_value = it_virt; - } -} - -static inline void do_it_prof(struct task_struct *p) -{ - unsigned long it_prof = p->it_prof_value; - - if (it_prof) { - if (--it_prof == 0) { - it_prof = p->it_prof_incr; - send_sig(SIGPROF, p, 1); - } - p->it_prof_value = it_prof; - } -} - -static void update_one_process(struct task_struct *p, unsigned long user, - unsigned long system, int cpu) -{ - do_process_times(p, user, system); - do_it_virt(p, user); - do_it_prof(p); -} - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. @@ -859,11 +813,17 @@ static void update_one_process(struct task_struct *p, unsigned long user, void update_process_times(int user_tick) { struct task_struct *p = current; - int cpu = smp_processor_id(), system = user_tick ^ 1; + int cpu = smp_processor_id(); - update_one_process(p, user_tick, system, cpu); + /* Note: this timer irq context must be accounted for as well. 
*/ + if (user_tick) + account_user_time(p, jiffies_to_cputime(1)); + else + account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); run_local_timers(); - scheduler_tick(user_tick, system); + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_tick); + scheduler_tick(); } /* @@ -1438,7 +1398,7 @@ void __init init_timers(void) struct time_interpolator *time_interpolator; static struct time_interpolator *time_interpolator_list; -static spinlock_t time_interpolator_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(time_interpolator_lock); static inline u64 time_interpolator_get_cycles(unsigned int src) { diff --git a/kernel/user.c b/kernel/user.c index 693487dc940e..18f63146602a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -26,7 +26,7 @@ static kmem_cache_t *uid_cachep; static struct list_head uidhash_table[UIDHASH_SZ]; -static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(uidhash_lock); struct user_struct root_user = { .__count = ATOMIC_INIT(1), diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee77ccd01d04..3993f7bdf5c2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -8,7 +8,7 @@ * * Derived from the taskqueue/keventd code by: * - * David Woodhouse <dwmw2@redhat.com> + * David Woodhouse <dwmw2@infradead.org> * Andrew Morton <andrewm@uow.edu.au> * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> @@ -64,7 +64,7 @@ struct workqueue_struct { /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove threads to each one as cpus come/go. */ -static spinlock_t workqueue_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); /* If it's single threaded, it isn't in the list of workqueues. */ @@ -188,7 +188,7 @@ static int worker_thread(void *__cwq) current->flags |= PF_NOFREEZE; - set_user_nice(current, -10); + set_user_nice(current, -5); /* Block and flush all signals */ sigfillset(&blocked); |
