diff options
Diffstat (limited to 'kernel')
86 files changed, 6740 insertions, 2855 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 248e1c396f8b..4af15802ccd4 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -7,7 +7,7 @@ choice default HZ_250 help Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 HZ but 100 HZ may be more + to have the timer interrupt run at 1000 Hz but 100 Hz may be more beneficial for servers and NUMA systems that do not need to have a fast response for user interaction and that may experience bus contention and cacheline bounces as a result of timer interrupts. @@ -19,21 +19,30 @@ choice config HZ_100 bool "100 HZ" help - 100 HZ is a typical choice for servers, SMP and NUMA systems + 100 Hz is a typical choice for servers, SMP and NUMA systems with lots of processors that may show reduced performance if too many timer interrupts are occurring. config HZ_250 bool "250 HZ" help - 250 HZ is a good compromise choice allowing server performance + 250 Hz is a good compromise choice allowing server performance while also showing good interactive responsiveness even - on SMP and NUMA systems. + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + + config HZ_300 + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance + while also showing good interactive responsiveness even + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. config HZ_1000 bool "1000 HZ" help - 1000 HZ is the preferred choice for desktop systems and other + 1000 Hz is the preferred choice for desktop systems and other systems requiring fast interactive responses to events. endchoice @@ -42,5 +51,6 @@ config HZ int default 100 if HZ_100 default 250 if HZ_250 + default 300 if HZ_300 default 1000 if HZ_1000 diff --git a/kernel/Makefile b/kernel/Makefile index d62ec66c1af2..5e3f3b75563a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ @@ -48,8 +48,9 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o -obj-$(CONFIG_TASKSTATS) += taskstats.o +obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index 2a7c933651c7..70d0d88e5554 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -89,7 +89,8 @@ struct acct_glbs { struct timer_list timer; }; -static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; +static struct acct_glbs acct_globals __cacheline_aligned = + {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; /* * Called whenever the timer says to check the free space. @@ -117,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry, &sbuf)) + if (vfs_statfs(file->f_path.dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file) add_timer(&acct_globals.timer); } if (old_acct) { - mnt_unpin(old_acct->f_vfsmnt); + mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); do_acct_process(old_acct); filp_close(old_acct, NULL); @@ -211,7 +212,7 @@ static int acct_on(char *name) if (IS_ERR(file)) return PTR_ERR(file); - if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { filp_close(file, NULL); return -EACCES; } @@ -228,11 +229,11 @@ static int acct_on(char *name) } spin_lock(&acct_globals.lock); - mnt_pin(file->f_vfsmnt); + mnt_pin(file->f_path.mnt); acct_file_reopen(file); spin_unlock(&acct_globals.lock); - mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ + mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ return 0; } @@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name) void acct_auto_close_mnt(struct vfsmount *m) { spin_lock(&acct_globals.lock); - if (acct_globals.file && acct_globals.file->f_vfsmnt == m) + if (acct_globals.file && acct_globals.file->f_path.mnt == m) acct_file_reopen(NULL); spin_unlock(&acct_globals.lock); } @@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb) { spin_lock(&acct_globals.lock); if (acct_globals.file && - acct_globals.file->f_vfsmnt->mnt_sb == sb) { + acct_globals.file->f_path.mnt->mnt_sb == sb) { acct_file_reopen(NULL); } spin_unlock(&acct_globals.lock); @@ -427,6 +428,7 @@ static void do_acct_process(struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + struct tty_struct *tty; /* * First check to see if there is enough free_space to continue @@ -483,12 +485,9 @@ static void do_acct_process(struct file *file) ac.ac_ppid = current->parent->tgid; #endif - read_lock(&tasklist_lock); /* pin current->signal */ - ac.ac_tty = current->signal->tty ? - old_encode_dev(tty_devnum(current->signal->tty)) : 0; - read_unlock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; + ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; @@ -598,33 +597,3 @@ void acct_process(void) do_acct_process(file); fput(file); } - - -/** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting - */ -void acct_update_integrals(struct task_struct *tsk) -{ - if (likely(tsk->mm)) { - long delta = - cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; - - if (delta == 0) - return; - tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - } -} - -/** - * acct_clear_integrals - clear the mm integral fields in task_struct - * @tsk: task_struct whose accounting fields are cleared - */ -void acct_clear_integrals(struct task_struct *tsk) -{ - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; -} diff --git a/kernel/audit.c b/kernel/audit.c index f9889ee77825..d9b690ac684b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -57,6 +57,7 @@ #include <linux/netlink.h> #include <linux/selinux.h> #include <linux/inotify.h> +#include <linux/freezer.h> #include "audit.h" @@ -340,7 +341,7 @@ static int kauditd_thread(void *dummy) { struct sk_buff *skb; - while (1) { + while (!kthread_should_stop()) { skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); if (skb) { @@ -369,6 +370,7 @@ static int kauditd_thread(void *dummy) remove_wait_queue(&kauditd_wait, &wait); } } + return 0; } int audit_send_list(void *_dest) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 1a58a81fb09d..2e896f8ae29e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -411,7 +411,6 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_FSGID: case AUDIT_LOGINUID: case AUDIT_PERS: - case AUDIT_ARCH: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: @@ -423,6 +422,14 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_ARG2: case AUDIT_ARG3: break; + /* arch is only allowed to be = or != */ + case AUDIT_ARCH: + if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) + && (f->op != AUDIT_NEGATE) && (f->op)) { + err = -EINVAL; + goto exit_free; + } + break; case AUDIT_PERM: if (f->val & ~15) goto exit_free; @@ -629,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) struct audit_rule *rule; int i; - rule = kmalloc(sizeof(*rule), GFP_KERNEL); + rule = kzalloc(sizeof(*rule), GFP_KERNEL); if (unlikely(!rule)) return NULL; - memset(rule, 0, sizeof(*rule)); rule->flags = krule->flags | krule->listnr; rule->action = krule->action; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fb83c5cb8c32..298897559ca4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -64,6 +64,7 @@ #include <linux/tty.h> #include <linux/selinux.h> #include <linux/binfmts.h> +#include <linux/highmem.h> #include <linux/syscalls.h> #include "audit.h" @@ -278,8 +279,11 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(tsk->pid, f->op, f->val); break; case AUDIT_PPID: - if (ctx) + if (ctx) { + if (!ctx->ppid) + ctx->ppid = sys_getppid(); result = audit_comparator(ctx->ppid, f->op, f->val); + } break; case AUDIT_UID: result = audit_comparator(tsk->uid, f->op, f->val); @@ -727,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context) printk(KERN_ERR "audit: freed %d contexts\n", count); } -static void audit_log_task_context(struct audit_buffer *ab) +void audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; ssize_t len = 0; @@ -756,6 +760,8 @@ error_path: return; } +EXPORT_SYMBOL(audit_log_task_context); + static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { char name[sizeof(tsk->comm)]; @@ -775,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_dentry, - vma->vm_file->f_vfsmnt); + vma->vm_file->f_path.dentry, + vma->vm_file->f_path.mnt); break; } vma = vma->vm_next; @@ -795,7 +801,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts /* tsk == current */ context->pid = tsk->pid; - context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ + if (!context->ppid) + context->ppid = sys_getppid(); context->uid = tsk->uid; context->gid = tsk->gid; context->euid = tsk->euid; @@ -817,10 +824,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); + + mutex_lock(&tty_mutex); + read_lock(&tasklist_lock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; + read_unlock(&tasklist_lock); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" @@ -838,6 +849,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->gid, context->euid, context->suid, context->fsuid, context->egid, context->sgid, context->fsgid, tty); + + mutex_unlock(&tty_mutex); + audit_log_task_info(ab, tsk); if (context->filterkey) { audit_log_format(ab, " key="); @@ -1132,6 +1146,7 @@ void audit_syscall_entry(int arch, int major, context->ctime = CURRENT_TIME; context->in_syscall = 1; context->auditable = !!(state == AUDIT_RECORD_CONTEXT); + context->ppid = 0; } /** @@ -1347,7 +1362,13 @@ void __audit_inode_child(const char *dname, const struct inode *inode, } update_context: - idx = context->name_count++; + idx = context->name_count; + if (context->name_count == AUDIT_NAMES) { + printk(KERN_DEBUG "name_count maxed and losing %s\n", + found_name ?: "(null)"); + return; + } + context->name_count++; #if AUDIT_DEBUG context->ino_count++; #endif @@ -1365,7 +1386,16 @@ update_context: /* A parent was not found in audit_names, so copy the inode data for the * provided parent. */ if (!found_name) { - idx = context->name_count++; + idx = context->name_count; + if (context->name_count == AUDIT_NAMES) { + printk(KERN_DEBUG + "name_count maxed and losing parent inode data: dev=%02x:%02x, inode=%lu", + MAJOR(parent->i_sb->s_dev), + MINOR(parent->i_sb->s_dev), + parent->i_ino); + return; + } + context->name_count++; #if AUDIT_DEBUG context->ino_count++; #endif @@ -1462,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx) return ctx ? ctx->loginuid : -1; } +EXPORT_SYMBOL(audit_get_loginuid); + /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag diff --git a/kernel/capability.c b/kernel/capability.c index c7685ad00a97..edb845a6e84a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective, int found = 0; do_each_thread(g, target) { - if (target == current || target->pid == 1) + if (target == current || is_init(target)) continue; found = 1; if (security_capset_check(target, effective, inheritable, diff --git a/kernel/compat.c b/kernel/compat.c index 126dee9530aa..6952dd057300 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -22,6 +22,7 @@ #include <linux/security.h> #include <linux/timex.h> #include <linux/migrate.h> +#include <linux/posix-timers.h> #include <asm/uaccess.h> @@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock, return err; } +static long compat_clock_nanosleep_restart(struct restart_block *restart) +{ + long err; + mm_segment_t oldfs; + struct timespec tu; + struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); + + restart->arg1 = (unsigned long) &tu; + oldfs = get_fs(); + set_fs(KERNEL_DS); + err = clock_nanosleep_restart(restart); + set_fs(oldfs); + + if ((err == -ERESTART_RESTARTBLOCK) && rmtp && + put_compat_timespec(&tu, rmtp)) + return -EFAULT; + + if (err == -ERESTART_RESTARTBLOCK) { + restart->fn = compat_clock_nanosleep_restart; + restart->arg1 = (unsigned long) rmtp; + } + return err; +} + long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) @@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, long err; mm_segment_t oldfs; struct timespec in, out; + struct restart_block *restart; if (get_compat_timespec(&in, rqtp)) return -EFAULT; @@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, (struct timespec __user *) &in, (struct timespec __user *) &out); set_fs(oldfs); + if ((err == -ERESTART_RESTARTBLOCK) && rmtp && put_compat_timespec(&out, rmtp)) return -EFAULT; + + if (err == -ERESTART_RESTARTBLOCK) { + restart = ¤t_thread_info()->restart_block; + restart->fn = compat_clock_nanosleep_restart; + restart->arg1 = (unsigned long) rmtp; + } return err; } @@ -645,7 +678,7 @@ int get_compat_sigevent(struct sigevent *event, ? -EFAULT : 0; } -long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, +long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { int i, j; @@ -949,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, } return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } + +asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, + compat_ulong_t maxnode, + const compat_ulong_t __user *old_nodes, + const compat_ulong_t __user *new_nodes) +{ + unsigned long __user *old = NULL; + unsigned long __user *new = NULL; + nodemask_t tmp_mask; + unsigned long nr_bits; + unsigned long size; + + nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); + size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (old_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) + return -EFAULT; + old = compat_alloc_user_space(new_nodes ? size * 2 : size); + if (new_nodes) + new = old + size / sizeof(unsigned long); + if (copy_to_user(old, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + if (new_nodes) { + if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) + return -EFAULT; + if (new == NULL) + new = compat_alloc_user_space(size); + if (copy_to_user(new, nodes_addr(tmp_mask), size)) + return -EFAULT; + } + return sys_migrate_pages(pid, nr_bits + 1, old, new); +} #endif diff --git a/kernel/configs.c b/kernel/configs.c index f9e31974f4ad..8fa1fb28f8a7 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf, return count; } -static struct file_operations ikconfig_file_ops = { +static const struct file_operations ikconfig_file_ops = { .owner = THIS_MODULE, .read = ikconfig_read_current, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 32c96628463e..9124669f4586 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,7 +19,7 @@ static DEFINE_MUTEX(cpu_add_remove_lock); static DEFINE_MUTEX(cpu_bitmask_lock); -static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); +static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock @@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void) recursive_depth--; return; } - mutex_unlock(&cpu_bitmask_lock); recursive = NULL; + mutex_unlock(&cpu_bitmask_lock); } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); @@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); /* Need to know about CPUs going up/down? */ int __cpuinit register_cpu_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_register(&cpu_chain, nb); + int ret; + mutex_lock(&cpu_add_remove_lock); + ret = raw_notifier_chain_register(&cpu_chain, nb); + mutex_unlock(&cpu_add_remove_lock); + return ret; } #ifdef CONFIG_HOTPLUG_CPU @@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - blocking_notifier_chain_unregister(&cpu_chain, nb); + mutex_lock(&cpu_add_remove_lock); + raw_notifier_chain_unregister(&cpu_chain, nb); + mutex_unlock(&cpu_add_remove_lock); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu) if (!cpu_online(cpu)) return -EINVAL; - err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { printk("%s: attempt to take down CPU %u failed\n", @@ -144,18 +150,18 @@ static int _cpu_down(unsigned int cpu) p = __stop_machine_run(take_cpu_down, NULL, cpu); mutex_unlock(&cpu_bitmask_lock); - if (IS_ERR(p)) { + if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); - err = PTR_ERR(p); - goto out_allowed; - } - - if (cpu_online(cpu)) + if (IS_ERR(p)) { + err = PTR_ERR(p); + goto out_allowed; + } goto out_thread; + } /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) @@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu) put_cpu(); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, + if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned int cpu) if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; - ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned int cpu) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); out_notify: if (ret != 0) - blocking_notifier_call_chain(&cpu_chain, + raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); return ret; @@ -264,11 +270,7 @@ int disable_nonboot_cpus(void) goto out; } } - error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); - if (error) { - printk(KERN_ERR "Could not run on CPU%d\n", first_cpu); - goto out; - } + /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cff41511269f..2c3b4431472b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -240,7 +240,7 @@ static struct super_block *cpuset_sb; * A cpuset can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cpusets is empty. Since all * tasks in the system use _some_ cpuset, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cpuset + * least one task in the system (init), therefore, top_cpuset * always has either children cpusets and/or using tasks. So we don't * need a special hack to ensure that top_cpuset cannot be deleted. * @@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode) inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; @@ -378,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directories start off with i_nlink == 2 (for "." entry) */ - inode->i_nlink++; + inc_nlink(inode); } else { return -ENOMEM; } @@ -414,8 +413,8 @@ static struct file_system_type cpuset_fs_type = { * * * When reading/writing to a file: - * - the cpuset to use in file->f_dentry->d_parent->d_fsdata - * - the 'cftype' of the file is file->f_dentry->d_fsdata + * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { @@ -730,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* Remaining checks don't apply to root cpuset */ - if ((par = cur->parent) == NULL) + if (cur == &top_cpuset) return 0; + par = cur->parent; + /* We must be a subset of our parent cpuset */ if (!is_cpuset_subset(trial, par)) return -EACCES; @@ -913,6 +914,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) int fudge; int retval; + /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */ + if (cs == &top_cpuset) + return -EACCES; + trialcs = *cs; retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) @@ -1057,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); mutex_lock(&callback_mutex); - if (turning_on) - set_bit(bit, &cs->flags); - else - clear_bit(bit, &cs->flags); + cs->flags = trialcs.flags; mutex_unlock(&callback_mutex); if (cpu_exclusive_changed) @@ -1222,7 +1224,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) task_lock(tsk); oldcs = tsk->cpuset; - if (!oldcs) { + /* + * After getting 'oldcs' cpuset ptr, be sure still not exiting. + * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack + * then fail this attach_task(), to avoid breaking top_cpuset.count. + */ + if (tsk->flags & PF_EXITING) { task_unlock(tsk); mutex_unlock(&callback_mutex); put_task_struct(tsk); @@ -1273,18 +1280,19 @@ typedef enum { FILE_TASKLIST, } cpuset_filetype_t; -static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, +static ssize_t cpuset_common_file_write(struct file *file, + const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); - struct cftype *cft = __d_cft(file->f_dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); cpuset_filetype_t type = cft->private; char *buffer; char *pathbuf = NULL; int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ - if (nbytes > 100 + 6 * NR_CPUS) + if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) return -E2BIG; /* +1 for nul-terminator */ @@ -1359,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1409,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct cftype *cft = __d_cft(file->f_dentry); - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cftype *cft = __d_cft(file->f_path.dentry); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1468,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt loff_t *ppos) { ssize_t retval = 0; - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; @@ -1490,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) if (err) return err; - cft = __d_cft(file->f_dentry); + cft = __d_cft(file->f_path.dentry); if (!cft) return -ENODEV; if (cft->open) @@ -1503,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) static int cpuset_file_release(struct inode *inode, struct file *file) { - struct cftype *cft = __d_cft(file->f_dentry); + struct cftype *cft = __d_cft(file->f_path.dentry); if (cft->release) return cft->release(inode, file); return 0; @@ -1524,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } -static struct file_operations cpuset_file_operations = { +static const struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, .llseek = generic_file_llseek, @@ -1557,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) inode->i_fop = &simple_dir_operations; /* start off with i_nlink == 2 (for "." entry) */ - inode->i_nlink++; + inc_nlink(inode); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cpuset_file_operations; @@ -1590,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode) error = cpuset_create_file(dentry, S_IFDIR | mode); if (!error) { dentry->d_fsdata = cs; - parent->d_inode->i_nlink++; + inc_nlink(parent->d_inode); cs->dentry = dentry; } dput(dentry); @@ -1692,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { - struct cpuset *cs = __d_cs(file->f_dentry->d_parent); + struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); struct ctr_struct *ctr; pid_t *pidarray; int npids; @@ -2025,7 +2033,7 @@ int __init cpuset_init(void) } root = cpuset_mount->mnt_sb->s_root; root->d_fsdata = &top_cpuset; - root->d_inode->i_nlink++; + inc_nlink(root->d_inode); top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; number_of_cpusets = 1; @@ -2038,30 +2046,97 @@ out: } /* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * This handles CPU hotplug (cpuhp) events. If someday Memory - * Nodes can be hotplugged (dynamically changing node_online_map) - * then we should handle that too, perhaps in a similar way. + * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then the guarantee_online_cpus() + * or guarantee_online_mems() code will use that emptied cpusets + * parent online CPUs or nodes. Cpusets that were already empty of + * CPUs or nodes are left empty. + * + * This routine is intentionally inefficient in a couple of regards. + * It will check all cpusets in a subtree even if the top cpuset of + * the subtree has no offline CPUs or nodes. It checks both CPUs and + * nodes, even though the caller could have been coded to know that + * only one of CPUs or nodes needed to be checked on a given call. + * This was done to minimize text size rather than cpu cycles. + * + * Call with both manage_mutex and callback_mutex held. + * + * Recursive, on depth of cpuset subtree. */ -#ifdef CONFIG_HOTPLUG_CPU -static int cpuset_handle_cpuhp(struct notifier_block *nb, - unsigned long phase, void *cpu) +static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) +{ + struct cpuset *c; + + /* Each of our child cpusets mems must be online */ + list_for_each_entry(c, &cur->children, sibling) { + guarantee_online_cpus_mems_in_subtree(c); + if (!cpus_empty(c->cpus_allowed)) + guarantee_online_cpus(c, &c->cpus_allowed); + if (!nodes_empty(c->mems_allowed)) + guarantee_online_mems(c, &c->mems_allowed); + } +} + +/* + * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track + * cpu_online_map and node_online_map. Force the top cpuset to track + * whats online after any CPU or memory node hotplug or unplug event. + * + * To ensure that we don't remove a CPU or node from the top cpuset + * that is currently in use by a child cpuset (which would violate + * the rule that cpusets must be subsets of their parent), we first + * call the recursive routine guarantee_online_cpus_mems_in_subtree(). + * + * Since there are two callers of this routine, one for CPU hotplug + * events and one for memory node hotplug events, we could have coded + * two separate routines here. We code it as a single common routine + * in order to minimize text size. + */ + +static void common_cpu_mem_hotplug_unplug(void) { mutex_lock(&manage_mutex); mutex_lock(&callback_mutex); + guarantee_online_cpus_mems_in_subtree(&top_cpuset); top_cpuset.cpus_allowed = cpu_online_map; + top_cpuset.mems_allowed = node_online_map; mutex_unlock(&callback_mutex); mutex_unlock(&manage_mutex); +} +/* + * The top_cpuset tracks what CPUs and Memory Nodes are online, + * period. This is necessary in order to make cpusets transparent + * (of no affect) on systems that are actively using CPU hotplug + * but making no active use of cpusets. + * + * This routine ensures that top_cpuset.cpus_allowed tracks + * cpu_online_map on each CPU hotplug (cpuhp) event. + */ + +static int cpuset_handle_cpuhp(struct notifier_block *nb, + unsigned long phase, void *cpu) +{ + common_cpu_mem_hotplug_unplug(); return 0; } + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Keep top_cpuset.mems_allowed tracking node_online_map. + * Call this routine anytime after you change node_online_map. + * See also the previous routine cpuset_handle_cpuhp(). + */ + +void cpuset_track_online_nodes(void) +{ + common_cpu_mem_hotplug_unplug(); +} #endif /** @@ -2531,7 +2606,7 @@ static int cpuset_open(struct inode *inode, struct file *file) return single_open(file, proc_cpuset_show, pid); } -struct file_operations proc_cpuset_operations = { +const struct file_operations proc_cpuset_operations = { .open = cpuset_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 36752f124c6a..766d5912b26a 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -20,7 +20,7 @@ #include <linux/delayacct.h> int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ -kmem_cache_t *delayacct_cache; +struct kmem_cache *delayacct_cache; static int __init delayacct_setup_disable(char *str) { @@ -41,7 +41,7 @@ void delayacct_init(void) void __delayacct_tsk_init(struct task_struct *tsk) { - tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); if (tsk->delays) spin_lock_init(&tsk->delays->lock); } @@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end, { struct timespec ts; s64 ns; + unsigned long flags; do_posix_clock_monotonic_gettime(end); ts = timespec_sub(*end, *start); @@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end, if (ns < 0) return; - spin_lock(¤t->delays->lock); + spin_lock_irqsave(¤t->delays->lock, flags); *total += ns; (*count)++; - spin_unlock(¤t->delays->lock); + spin_unlock_irqrestore(¤t->delays->lock, flags); } void __delayacct_blkio_start(void) @@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) s64 tmp; struct timespec ts; unsigned long t1,t2,t3; + unsigned long flags; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - spin_lock(&tsk->delays->lock); + spin_lock_irqsave(&tsk->delays->lock, flags); tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; - spin_unlock(&tsk->delays->lock); + spin_unlock_irqrestore(&tsk->delays->lock, flags); done: return 0; @@ -152,11 +154,12 @@ done: __u64 __delayacct_blkio_ticks(struct task_struct *tsk) { __u64 ret; + unsigned long flags; - spin_lock(&tsk->delays->lock); + spin_lock_irqsave(&tsk->delays->lock, flags); ret = nsec_to_clock_t(tsk->delays->blkio_delay + tsk->delays->swapin_delay); - spin_unlock(&tsk->delays->lock); + spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } diff --git a/kernel/dma.c b/kernel/dma.c index aef0a45b7893..937b13ca33ba 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -62,6 +62,11 @@ static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { }; +/** + * request_dma - request and reserve a system DMA channel + * @dmanr: DMA channel number + * @device_id: reserving device ID string, used in /proc/dma + */ int request_dma(unsigned int dmanr, const char * device_id) { if (dmanr >= MAX_DMA_CHANNELS) @@ -76,7 +81,10 @@ int request_dma(unsigned int dmanr, const char * device_id) return 0; } /* request_dma */ - +/** + * free_dma - free a reserved system DMA channel + * @dmanr: DMA channel number + */ void free_dma(unsigned int dmanr) { if (dmanr >= MAX_DMA_CHANNELS) { @@ -132,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file) return single_open(file, proc_dma_show, NULL); } -static struct file_operations proc_dma_operations = { +static const struct file_operations proc_dma_operations = { .open = proc_dma_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/exit.c b/kernel/exit.c index d891883420f7..122fadb972fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,13 +13,16 @@ #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/key.h> #include <linux/security.h> #include <linux/cpu.h> #include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/binfmts.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> @@ -38,6 +41,7 @@ #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> +#include <linux/blkdev.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -45,7 +49,6 @@ #include <asm/mmu_context.h> extern void sem_exit (void); -extern struct task_struct *child_reaper; static void exit_mm(struct task_struct * tsk); @@ -125,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); + taskstats_tgid_free(sig); __cleanup_signal(sig); } } @@ -185,21 +189,18 @@ repeat: int session_of_pgrp(int pgrp) { struct task_struct *p; - int sid = -1; + int sid = 0; read_lock(&tasklist_lock); - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { - if (p->signal->session > 0) { - sid = p->signal->session; - goto out; - } - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); - p = find_task_by_pid(pgrp); - if (p) - sid = p->signal->session; -out: + + p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); + if (p == NULL) + p = find_task_by_pid(pgrp); + if (p != NULL) + sid = process_session(p); + read_unlock(&tasklist_lock); - + return sid; } @@ -219,10 +220,10 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->exit_state - || p->real_parent->pid == 1) + || is_init(p->real_parent)) continue; - if (process_group(p->real_parent) != pgrp - && p->real_parent->signal->session == p->signal->session) { + if (process_group(p->real_parent) != pgrp && + process_session(p->real_parent) == process_session(p)) { ret = 0; break; } @@ -249,17 +250,6 @@ static int has_stopped_jobs(int pgrp) do_each_task_pid(pgrp, PIDTYPE_PGID, p) { if (p->state != TASK_STOPPED) continue; - - /* If p is stopped by a debugger on a signal that won't - stop it, then don't count p as stopped. This isn't - perfect but it's a good approximation. */ - if (unlikely (p->ptrace) - && p->exit_code != SIGSTOP - && p->exit_code != SIGTSTP - && p->exit_code != SIGTTOU - && p->exit_code != SIGTTIN) - continue; - retval = 1; break; } while_each_task_pid(pgrp, PIDTYPE_PGID, p); @@ -267,7 +257,8 @@ static int has_stopped_jobs(int pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task. + * reparent_to_init - Reparent the calling kernel thread to the init task + * of the pid space that the thread belongs to. * * If a kernel thread is launched as a result of a system call, or if * it ever exits, it should generally reparent itself to init so that @@ -285,16 +276,14 @@ static void reparent_to_init(void) ptrace_unlink(current); /* Reparent to init */ remove_parent(current); - current->parent = child_reaper; - current->real_parent = child_reaper; + current->parent = child_reaper(current); + current->real_parent = child_reaper(current); add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if ((current->policy == SCHED_NORMAL || - current->policy == SCHED_BATCH) - && (task_nice(current) < 0)) + if (!has_rt_policy(current) && (task_nice(current) < 0)) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ @@ -311,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current->group_leader; - if (curr->signal->session != session) { + if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); - curr->signal->session = session; + set_signal_session(curr->signal, session); attach_pid(curr, PIDTYPE_SID, session); } if (process_group(curr) != pgrp) { @@ -323,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) } } -void set_special_pids(pid_t session, pid_t pgrp) +static void set_special_pids(pid_t session, pid_t pgrp) { write_lock_irq(&tasklist_lock); __set_special_pids(session, pgrp); @@ -393,9 +382,7 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - mutex_lock(&tty_mutex); - current->signal->tty = NULL; - mutex_unlock(&tty_mutex); + proc_clear_tty(current); /* Block and flush all signals */ sigfillset(&blocked); @@ -408,9 +395,11 @@ void daemonize(const char *name, ...) fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); - exit_namespace(current); - current->namespace = init_task.namespace; - get_namespace(current->namespace); + + exit_task_namespaces(current); + current->nsproxy = init_task.nsproxy; + get_task_namespaces(current); + exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -436,7 +425,7 @@ static void close_files(struct files_struct * files) for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= fdt->max_fdset || i >= fdt->max_fds) + if (i >= fdt->max_fds) break; set = fdt->open_fds->fds_bits[j++]; while (set) { @@ -477,16 +466,26 @@ void fastcall put_files_struct(struct files_struct *files) * you can free files immediately. */ fdt = files_fdtable(files); - if (fdt == &files->fdtab) - fdt->free_files = files; - else + if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); - free_fdtable(fdt); + call_rcu(&fdt->rcu, free_fdtable_rcu); } } EXPORT_SYMBOL(put_files_struct); +void reset_files_struct(struct task_struct *tsk, struct files_struct *files) +{ + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} +EXPORT_SYMBOL(reset_files_struct); + static inline void __exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -644,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * outside, so the child pgrp is now orphaned. */ if ((process_group(p) != process_group(father)) && - (p->signal->session == father->signal->session)) { + (process_session(p) == process_session(father))) { int pgrp = process_group(p); - if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { + if (will_become_orphaned_pgrp(pgrp, NULL) && + has_stopped_jobs(pgrp)) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } @@ -658,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * When we die, we re-parent all our children. * Try to give them to another thread in our thread * group, and if no such member exists, give it to - * the global child reaper process (ie "init") + * the child reaper process (ie "init") in our pid + * space. */ static void forget_original_parent(struct task_struct *father, struct list_head *to_release) @@ -669,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) do { reaper = next_thread(reaper); if (reaper == father) { - reaper = child_reaper; + reaper = child_reaper(father); break; } } while (reaper->exit_state); @@ -781,7 +782,7 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; if ((process_group(t) != process_group(tsk)) && - (t->signal->session == tsk->signal->session) && + (process_session(t) == process_session(tsk)) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); @@ -845,9 +846,7 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; - struct taskstats *tidstats; int group_dead; - unsigned int mycpu; profile_task_exit(tsk); @@ -857,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(tsk == child_reaper)) - panic("Attempted to kill init!"); + if (unlikely(tsk == child_reaper(tsk))) { + if (tsk->nsproxy->pid_ns != &init_pid_ns) + tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper; + else + panic("Attempted to kill init!"); + } + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { current->ptrace_message = code; @@ -885,8 +889,6 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats, &mycpu); - acct_update_integrals(tsk); if (tsk->mm) { update_hiwater_rss(tsk->mm); @@ -906,8 +908,8 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, group_dead, mycpu); - taskstats_exit_free(tidstats); + + taskstats_exit(tsk, group_dead); exit_mm(tsk); @@ -916,7 +918,6 @@ fastcall NORET_TYPE void do_exit(long code) exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); - exit_namespace(tsk); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); @@ -931,6 +932,7 @@ fastcall NORET_TYPE void do_exit(long code) tsk->exit_code = code; proc_exit_connector(tsk); exit_notify(tsk); + exit_task_namespaces(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; @@ -954,15 +956,15 @@ fastcall NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* PF_DEAD causes final put_task_struct after we schedule. */ preempt_disable(); - BUG_ON(tsk->flags & PF_DEAD); - tsk->flags |= PF_DEAD; + /* causes final put_task_struct in finish_task_switch(). */ + tsk->state = TASK_DEAD; schedule(); BUG(); /* Avoid "noreturn function does return". */ - for (;;) ; + for (;;) + cpu_relax(); /* For when BUG is null */ } EXPORT_SYMBOL_GPL(do_exit); @@ -971,7 +973,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); - + do_exit(code); } diff --git a/kernel/fork.c b/kernel/fork.c index a0dad84567c9..d16c566eb645 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -18,7 +18,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/personality.h> #include <linux/mempolicy.h> #include <linux/sem.h> @@ -27,6 +27,7 @@ #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/fs.h> +#include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -35,6 +36,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/futex.h> +#include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> @@ -42,6 +44,7 @@ #include <linux/profile.h> #include <linux/rmap.h> #include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> @@ -80,26 +83,26 @@ int nr_processes(void) #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) -static kmem_cache_t *task_struct_cachep; +static struct kmem_cache *task_struct_cachep; #endif /* SLAB cache for signal_struct structures (tsk->signal) */ -static kmem_cache_t *signal_cachep; +static struct kmem_cache *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ -kmem_cache_t *sighand_cachep; +struct kmem_cache *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ -kmem_cache_t *files_cachep; +struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ -kmem_cache_t *fs_cachep; +struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -kmem_cache_t *vm_area_cachep; +struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ -static kmem_cache_t *mm_cachep; +static struct kmem_cache *mm_cachep; void free_task(struct task_struct *tsk) { @@ -183,7 +186,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); +#ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; +#endif tsk->splice_pipe = NULL; return tsk; } @@ -233,7 +238,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -248,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) anon_vma_link(tmp); file = tmp->vm_file; if (file) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); @@ -315,7 +320,7 @@ static inline void mm_free_pgd(struct mm_struct * mm) __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) #include <linux/init_task.h> @@ -444,7 +449,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) tsk->vfork_done = NULL; complete(vfork_done); } - if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { + + /* + * If we're exiting normally, clear a user-space tid field if + * requested. We leave this alone when dying by signal, to leave + * the value intact in a core dump, and to save the unnecessary + * trouble otherwise. Userland only wants this done for a sys_exit. + */ + if (tsk->clear_child_tid + && !(tsk->flags & PF_SIGNALED) + && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; @@ -475,6 +489,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); + /* Initializing for Swap token stuff */ + mm->token_priority = 0; + mm->last_interval = 0; + if (!mm_init(mm)) goto fail_nomem; @@ -538,6 +556,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) goto fail_nomem; good_mm: + /* Initializing for Swap token stuff */ + mm->token_priority = 0; + mm->last_interval = 0; + tsk->mm = mm; tsk->active_mm = mm; return 0; @@ -592,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) static int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fdset; + int size = fdt->max_fds; int i; /* Find the last open fd */ @@ -609,7 +631,7 @@ static struct files_struct *alloc_files(void) struct files_struct *newf; struct fdtable *fdt; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) goto out; @@ -619,12 +641,10 @@ static struct files_struct *alloc_files(void) newf->next_fd = 0; fdt = &newf->fdtab; fdt->max_fds = NR_OPEN_DEFAULT; - fdt->max_fdset = EMBEDDED_FD_SET_SIZE; fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; INIT_RCU_HEAD(&fdt->rcu); - fdt->free_files = NULL; fdt->next = NULL; rcu_assign_pointer(newf->fdt, fdt); out: @@ -640,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i, expand; + int open_files, size, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -651,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); new_fdt = files_fdtable(newf); - size = old_fdt->max_fdset; open_files = count_open_files(old_fdt); - expand = 0; /* - * Check whether we need to allocate a larger fd array or fd set. - * Note: we're not a clone task, so the open count won't change. + * Check whether we need to allocate a larger fd array and fd set. + * Note: we're not a clone task, so the open count won't change. */ - if (open_files > new_fdt->max_fdset) { - new_fdt->max_fdset = 0; - expand = 1; - } if (open_files > new_fdt->max_fds) { new_fdt->max_fds = 0; - expand = 1; - } - - /* if the old fdset gets grown now, we'll only copy up to "size" fds */ - if (expand) { spin_unlock(&oldf->file_lock); spin_lock(&newf->file_lock); *errorp = expand_files(newf, open_files-1); @@ -689,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -715,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (new_fdt->max_fdset > open_files) { - int left = (new_fdt->max_fdset-open_files)/8; + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); memset(&new_fdt->open_fds->fds_bits[start], 0, left); memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } -out: return newf; out_release: - free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); - free_fdset (new_fdt->open_fds, new_fdt->max_fdset); - free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); +out: return NULL; } @@ -826,7 +834,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); - taskstats_tgid_alloc(current->signal); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); @@ -893,7 +900,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); - taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } @@ -980,6 +986,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + rt_mutex_init_task(p); + #ifdef CONFIG_TRACE_IRQFLAGS DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); @@ -1034,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ + task_io_accounting_init(p); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; @@ -1061,7 +1070,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + p->hardirqs_enabled = 1; +#else p->hardirqs_enabled = 0; +#endif p->hardirq_enable_ip = 0; p->hardirq_enable_event = 0; p->hardirq_disable_ip = _THIS_IP_; @@ -1080,8 +1093,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->lockdep_recursion = 0; #endif - rt_mutex_init_task(p); - #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif @@ -1109,11 +1120,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_signal; if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; - if ((retval = copy_namespace(clone_flags, p))) + if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* @@ -1144,7 +1155,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; /* ok, now we should be set up.. */ @@ -1167,6 +1177,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); + /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ + p->ioprio = current->ioprio; + /* * The task hasn't been attached yet, so its cpus_allowed mask will * not be changed, nor will its assigned CPU. @@ -1203,7 +1216,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; } if (clone_flags & CLONE_THREAD) { @@ -1226,11 +1239,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } } - /* - * inherit ioprio - */ - p->ioprio = current->ioprio; - if (likely(p->pid)) { add_parent(p); if (unlikely(p->ptrace & PT_PTRACED)) @@ -1239,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); - p->signal->session = current->signal->session; + set_signal_session(p->signal, process_session(current)); attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->signal->session); + attach_pid(p, PIDTYPE_SID, process_session(p)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; @@ -1256,8 +1264,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); return p; -bad_fork_cleanup_namespace: - exit_namespace(p); +bad_fork_cleanup_namespaces: + exit_task_namespaces(p); bad_fork_cleanup_keys: exit_keys(p); bad_fork_cleanup_mm: @@ -1299,7 +1307,7 @@ fork_out: return ERR_PTR(retval); } -struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs; @@ -1311,9 +1319,8 @@ struct task_struct * __devinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); - if (!task) - return ERR_PTR(-ENOMEM); - init_idle(task, cpu); + if (!IS_ERR(task)) + init_idle(task, cpu); return task; } @@ -1410,7 +1417,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) { struct sighand_struct *sighand = data; @@ -1506,18 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the namespace structure if it is being shared + * Unshare the mnt_namespace structure if it is being shared */ -static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) +static int unshare_mnt_namespace(unsigned long unshare_flags, + struct mnt_namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->namespace; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; - if ((unshare_flags & CLONE_NEWNS) && - (ns && atomic_read(&ns->count) > 1)) { + if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); + *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); if (!*new_nsp) return -ENOMEM; } @@ -1526,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new } /* - * Unsharing of sighand for tasks created with CLONE_SIGHAND is not - * supported yet + * Unsharing of sighand is not supported yet */ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) { struct sighand_struct *sigh = current->sighand; - if ((unshare_flags & CLONE_SIGHAND) && - (sigh && atomic_read(&sigh->count) > 1)) + if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) return -EINVAL; else return 0; @@ -1585,6 +1590,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } +#ifndef CONFIG_IPC_NS +static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) +{ + if (flags & CLONE_NEWIPC) + return -EINVAL; + + return 0; +} +#endif + /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* @@ -1597,25 +1612,29 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct namespace *ns, *new_ns = NULL; - struct sighand_struct *sigh, *new_sigh = NULL; + struct mnt_namespace *ns, *new_ns = NULL; + struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; + struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; + struct uts_namespace *uts, *new_uts = NULL; + struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); /* Return -EINVAL for all unsupported flags */ err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) + if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) goto bad_unshare_cleanup_ns; @@ -1625,11 +1644,30 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; + if ((err = unshare_utsname(unshare_flags, &new_uts))) + goto bad_unshare_cleanup_semundo; + if ((err = unshare_ipcs(unshare_flags, &new_ipc))) + goto bad_unshare_cleanup_uts; + + if (new_ns || new_uts || new_ipc) { + old_nsproxy = current->nsproxy; + new_nsproxy = dup_namespaces(old_nsproxy); + if (!new_nsproxy) { + err = -ENOMEM; + goto bad_unshare_cleanup_ipc; + } + } - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + if (new_fs || new_ns || new_mm || new_fd || new_ulist || + new_uts || new_ipc) { task_lock(current); + if (new_nsproxy) { + current->nsproxy = new_nsproxy; + new_nsproxy = old_nsproxy; + } + if (new_fs) { fs = current->fs; current->fs = new_fs; @@ -1637,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->namespace; - current->namespace = new_ns; + ns = current->nsproxy->mnt_ns; + current->nsproxy->mnt_ns = new_ns; new_ns = ns; } - if (new_sigh) { - sigh = current->sighand; - rcu_assign_pointer(current->sighand, new_sigh); - new_sigh = sigh; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; @@ -1663,9 +1695,33 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } + if (new_uts) { + uts = current->nsproxy->uts_ns; + current->nsproxy->uts_ns = new_uts; + new_uts = uts; + } + + if (new_ipc) { + ipc = current->nsproxy->ipc_ns; + current->nsproxy->ipc_ns = new_ipc; + new_ipc = ipc; + } + task_unlock(current); } + if (new_nsproxy) + put_nsproxy(new_nsproxy); + +bad_unshare_cleanup_ipc: + if (new_ipc) + put_ipc_ns(new_ipc); + +bad_unshare_cleanup_uts: + if (new_uts) + put_uts_ns(new_uts); + +bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); @@ -1681,7 +1737,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_ns: if (new_ns) - put_namespace(new_ns); + put_mnt_ns(new_ns); bad_unshare_cleanup_fs: if (new_fs) diff --git a/kernel/futex.c b/kernel/futex.c index 9d260e838cff..5a737de857d3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) /* * Get parameters which are the keys for a futex. * - * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, + * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * @@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) /* * Linear file mappings are also simple. */ - key->shared.inode = vma->vm_file->f_dentry->d_inode; + key->shared.inode = vma->vm_file->f_path.dentry->d_inode; key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) @@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { int ret; - inc_preempt_count(); + pagefault_disable(); ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); - dec_preempt_count(); + pagefault_enable(); return ret ? -EFAULT : 0; } @@ -324,12 +324,11 @@ static int refill_pi_state_cache(void) if (likely(current->pi_state_cache)) return 0; - pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); + pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); if (!pi_state) return -ENOMEM; - memset(pi_state, 0, sizeof(*pi_state)); INIT_LIST_HEAD(&pi_state->list); /* pi_mutex gets initialized later */ pi_state->owner = NULL; @@ -389,7 +388,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (!p) goto out_unlock; @@ -403,7 +402,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) } get_task_struct(p); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return p; } @@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q) * at the end of wake_up_all() does not prevent this store from * moving. */ - wmb(); + smp_wmb(); q->lock_ptr = NULL; } @@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!(uval & FUTEX_OWNER_DIED)) { newval = FUTEX_WAITERS | new_owner->pid; - inc_preempt_count(); + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + pagefault_enable(); if (curval == -EFAULT) return -EFAULT; if (curval != uval) @@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - inc_preempt_count(); + pagefault_disable(); oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); - dec_preempt_count(); + pagefault_enable(); if (oldval == -EFAULT) return oldval; @@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, */ newval = current->pid; - inc_preempt_count(); + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); - dec_preempt_count(); + pagefault_enable(); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, uval = curval; newval = uval | FUTEX_WAITERS; - inc_preempt_count(); + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + pagefault_enable(); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, newval = current->pid | FUTEX_OWNER_DIED | FUTEX_WAITERS; - inc_preempt_count(); + pagefault_disable(); curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + pagefault_enable(); if (unlikely(curval == -EFAULT)) goto uaddr_faulted; @@ -1390,9 +1389,9 @@ retry_locked: * anyone else up: */ if (!(uval & FUTEX_OWNER_DIED)) { - inc_preempt_count(); + pagefault_disable(); uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + pagefault_enable(); } if (unlikely(uval == -EFAULT)) @@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp, return ret; } -static struct file_operations futex_fops = { +static const struct file_operations futex_fops = { .release = futex_close, .poll = futex_poll, }; @@ -1507,6 +1506,13 @@ static int futex_fd(u32 __user *uaddr, int signal) struct futex_q *q; struct file *filp; int ret, err; + static unsigned long printk_interval; + + if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { + printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " + "will be removed from the kernel in June 2007\n", + current->comm); + } ret = -EINVAL; if (!valid_signal(signal)) @@ -1522,12 +1528,12 @@ static int futex_fd(u32 __user *uaddr, int signal) goto out; } filp->f_op = &futex_fops; - filp->f_vfsmnt = mntget(futex_mnt); - filp->f_dentry = dget(futex_mnt->mnt_root); - filp->f_mapping = filp->f_dentry->d_inode->i_mapping; + filp->f_path.mnt = mntget(futex_mnt); + filp->f_path.dentry = dget(futex_mnt->mnt_root); + filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; if (signal) { - err = f_setown(filp, current->pid, 1); + err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); if (err < 0) { goto error; } @@ -1612,10 +1618,10 @@ sys_set_robust_list(struct robust_list_head __user *head, * @len_ptr: pointer to a length field, the kernel fills in the header size */ asmlinkage long -sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, +sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, size_t __user *len_ptr) { - struct robust_list_head *head; + struct robust_list_head __user *head; unsigned long ret; if (!pid) @@ -1624,7 +1630,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, struct task_struct *p; ret = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (!p) goto err_unlock; @@ -1633,7 +1639,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (put_user(sizeof(*head), len_ptr)) @@ -1641,7 +1647,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, return put_user(head, head_ptr); err_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -1694,14 +1700,15 @@ retry: * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, - struct robust_list __user **head, int *pi) + struct robust_list __user * __user *head, + int *pi) { unsigned long uentry; - if (get_user(uentry, (unsigned long *)head)) + if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; - *entry = (void *)(uentry & ~1UL); + *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; @@ -1739,7 +1746,7 @@ void exit_robust_list(struct task_struct *curr) return; if (pending) - handle_futex_death((void *)pending + futex_offset, curr, pip); + handle_futex_death((void __user *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1747,7 +1754,7 @@ void exit_robust_list(struct task_struct *curr) * don't process it twice: */ if (entry != pending) - if (handle_futex_death((void *)entry + futex_offset, + if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; /* @@ -1850,10 +1857,16 @@ static struct file_system_type futex_fs_type = { static int __init init(void) { - unsigned int i; + int i = register_filesystem(&futex_fs_type); + + if (i) + return i; - register_filesystem(&futex_fs_type); futex_mnt = kern_mount(&futex_fs_type); + if (IS_ERR(futex_mnt)) { + unregister_filesystem(&futex_fs_type); + return PTR_ERR(futex_mnt); + } for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { INIT_LIST_HEAD(&futex_queues[i].chain); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index c5cca3f65cb7..50f24eea6cd0 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -18,7 +18,7 @@ */ static inline int fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t *head, int *pi) + compat_uptr_t __user *head, int *pi) { if (get_user(*uentry, head)) return -EFAULT; @@ -62,7 +62,7 @@ void compat_exit_robust_list(struct task_struct *curr) &head->list_op_pending, &pip)) return; if (upending) - handle_futex_death((void *)pending + futex_offset, curr, pip); + handle_futex_death((void __user *)pending + futex_offset, curr, pip); while (compat_ptr(uentry) != &head->list) { /* @@ -70,7 +70,7 @@ void compat_exit_robust_list(struct task_struct *curr) * dont process it twice: */ if (entry != pending) - if (handle_futex_death((void *)entry + futex_offset, + if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; @@ -78,7 +78,7 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch the next entry in the list: */ if (fetch_robust_entry(&uentry, &entry, - (compat_uptr_t *)&entry->next, &pi)) + (compat_uptr_t __user *)&entry->next, &pi)) return; /* * Avoid excessively long or circular lists: @@ -103,10 +103,10 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, } asmlinkage long -compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr, +compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr) { - struct compat_robust_list_head *head; + struct compat_robust_list_head __user *head; unsigned long ret; if (!pid) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 21c38a7e666b..d0ba190dfeb6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return t->task == NULL; } -static long __sched nanosleep_restart(struct restart_block *restart) +long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; struct timespec __user *rmtp; @@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart) restart->fn = do_no_restart_syscall; - hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS); - t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; + hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); + t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; if (do_nanosleep(&t, HRTIMER_ABS)) return 0; - rmtp = (struct timespec __user *) restart->arg2; + rmtp = (struct timespec __user *) restart->arg1; if (rmtp) { time = ktime_sub(t.timer.expires, t.timer.base->get_time()); if (time.tv64 <= 0) @@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) return -EFAULT; } - restart->fn = nanosleep_restart; + restart->fn = hrtimer_nanosleep_restart; /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; @@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, } restart = ¤t_thread_info()->restart_block; - restart->fn = nanosleep_restart; - restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF; - restart->arg1 = t.timer.expires.tv64 >> 32; - restart->arg2 = (unsigned long) rmtp; - restart->arg3 = (unsigned long) t.timer.base->index; + restart->fn = hrtimer_nanosleep_restart; + restart->arg0 = (unsigned long) t.timer.base->index; + restart->arg1 = (unsigned long) rmtp; + restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; + restart->arg3 = t.timer.expires.tv64 >> 32; return -ERESTART_RESTARTBLOCK; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ac1f850d4937..ebfd24a41858 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,6 +18,69 @@ #include "internals.h" /** + * dynamic_irq_init - initialize a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_init(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); + WARN_ON(1); + return; + } + + /* Ensure we don't have left over values from a previous use of this irq */ + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + desc->status = IRQ_DISABLED; + desc->chip = &no_irq_chip; + desc->handle_irq = handle_bad_irq; + desc->depth = 1; + desc->handler_data = NULL; + desc->chip_data = NULL; + desc->action = NULL; + desc->irq_count = 0; + desc->irqs_unhandled = 0; +#ifdef CONFIG_SMP + desc->affinity = CPU_MASK_ALL; +#endif + spin_unlock_irqrestore(&desc->lock, flags); +} + +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); + WARN_ON(1); + return; + } + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + if (desc->action) { + spin_unlock_irqrestore(&desc->lock, flags); + printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", + irq); + WARN_ON(1); + return; + } + desc->handle_irq = handle_bad_irq; + desc->chip = &no_irq_chip; + spin_unlock_irqrestore(&desc->lock, flags); +} + + +/** * set_irq_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure @@ -40,10 +103,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; - /* - * For compatibility only: - */ - desc->chip = chip; spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -146,7 +205,7 @@ static void default_disable(unsigned int irq) struct irq_desc *desc = irq_desc + irq; if (!(desc->status & IRQ_DELAYED_DISABLE)) - irq_desc[irq].chip->mask(irq); + desc->chip->mask(irq); } /* @@ -174,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->shutdown = chip->disable; if (!chip->name) chip->name = chip->typename; + if (!chip->end) + chip->end = dummy_irq_chip.end; } static inline void mask_ack_irq(struct irq_desc *desc, int irq) @@ -190,7 +251,6 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) * handle_simple_irq - Simple and software-decoded IRQs. * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Simple interrupts are either sent from a demultiplexing interrupt * handler or come from hardware, where no interrupt hardware control @@ -200,7 +260,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) * unmask issues if necessary. */ void fastcall -handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; irqreturn_t action_ret; @@ -220,9 +280,9 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) desc->status |= IRQ_INPROGRESS; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; @@ -234,7 +294,6 @@ out_unlock: * handle_level_irq - Level type irq handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Level type interrupts are active as long as the hardware line has * the active level. This may require to mask the interrupt and unmask @@ -242,7 +301,7 @@ out_unlock: * interrupt line is back to inactive. */ void fastcall -handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_level_irq(unsigned int irq, struct irq_desc *desc) { unsigned int cpu = smp_processor_id(); struct irqaction *action; @@ -270,9 +329,9 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; @@ -286,7 +345,6 @@ out_unlock: * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Only a single callback will be issued to the chip: an ->eoi() * call when the interrupt has been serviced. This enables support @@ -294,8 +352,7 @@ out_unlock: * details in hardware, transparently. */ void fastcall -handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, - struct pt_regs *regs) +handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { unsigned int cpu = smp_processor_id(); struct irqaction *action; @@ -323,9 +380,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; @@ -339,7 +396,6 @@ out: * handle_edge_irq - edge type IRQ handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Interrupt occures on the falling and/or rising edge of a hardware * signal. The occurence is latched into the irq controller hardware @@ -353,7 +409,7 @@ out: * loop is left. */ void fastcall -handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_edge_irq(unsigned int irq, struct irq_desc *desc) { const unsigned int cpu = smp_processor_id(); @@ -404,9 +460,9 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); @@ -421,12 +477,11 @@ out_unlock: * handle_percpu_IRQ - Per CPU local irq handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq - * @regs: pointer to a register structure * * Per CPU interrupts on SMP machines without locking requirements */ void fastcall -handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; @@ -435,9 +490,9 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) if (desc->chip->ack) desc->chip->ack(irq); - action_ret = handle_IRQ_event(irq, regs, desc->action); + action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); + note_interrupt(irq, desc, action_ret); if (desc->chip->eoi) desc->chip->eoi(irq); @@ -446,10 +501,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) #endif /* CONFIG_SMP */ void -__set_irq_handler(unsigned int irq, - void fastcall (*handle)(unsigned int, irq_desc_t *, - struct pt_regs *), - int is_chained) +__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) { struct irq_desc *desc; unsigned long flags; @@ -490,6 +543,7 @@ __set_irq_handler(unsigned int irq, desc->depth = 1; } desc->handle_irq = handle; + desc->name = name; if (handle != handle_bad_irq && is_chained) { desc->status &= ~IRQ_DISABLED; @@ -502,36 +556,16 @@ __set_irq_handler(unsigned int irq, void set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - void fastcall (*handle)(unsigned int, - struct irq_desc *, - struct pt_regs *)) + irq_flow_handler_t handle) { set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0); + __set_irq_handler(irq, handle, 0, NULL); } -/* - * Get a descriptive string for the highlevel handler, for - * /proc/interrupts output: - */ -const char * -handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, - struct pt_regs *)) +void +set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle, const char *name) { - if (handle == handle_level_irq) - return "level "; - if (handle == handle_fasteoi_irq) - return "fasteoi"; - if (handle == handle_edge_irq) - return "edge "; - if (handle == handle_simple_irq) - return "simple "; -#ifdef CONFIG_SMP - if (handle == handle_percpu_irq) - return "percpu "; -#endif - if (handle == handle_bad_irq) - return "bad "; - - return NULL; + set_irq_chip(irq, chip); + __set_irq_handler(irq, handle, 0, name); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 4c6cdbaed661..aff1f0fabb0d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -27,7 +27,7 @@ * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ void fastcall -handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); kstat_this_cpu.irqs[irq]++; @@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif @@ -115,7 +115,7 @@ struct irq_chip dummy_irq_chip = { /* * Special, empty irq handler: */ -irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) +irqreturn_t no_action(int cpl, void *dev_id) { return IRQ_NONE; } @@ -123,13 +123,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number - * @regs: pointer to a register structure * @action: the interrupt action chain for this irq * * Handles the action chain of an irq event */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, - struct irqaction *action) +irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; @@ -140,7 +138,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, local_irq_enable_in_hardirq(); do { - ret = action->handler(irq, action->dev_id, regs); + ret = action->handler(irq, action->dev_id); if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; @@ -158,7 +156,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, /** * __do_IRQ - original all in one highlevel IRQ handler * @irq: the interrupt number - * @regs: pointer to a register structure * * __do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific @@ -167,7 +164,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, * This is the original x86 implementation which is used for every * interrupt type. */ -fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) +fastcall unsigned int __do_IRQ(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; @@ -182,7 +179,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) */ if (desc->chip->ack) desc->chip->ack(irq); - action_ret = handle_IRQ_event(irq, regs, desc->action); + action_ret = handle_IRQ_event(irq, desc->action); desc->chip->end(irq); return 1; } @@ -233,11 +230,11 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, regs, action); + action_ret = handle_IRQ_event(irq, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); spin_lock(&desc->lock); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret, regs); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 92be519eff26..b385878c6e80 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) { struct irq_desc *desc = irq_desc + irq; struct irqaction *old, **p; + const char *old_name = NULL; unsigned long flags; int shared = 0; @@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) * set the trigger type must match. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + old_name = old->name; goto mismatch; + } #if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ @@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new) return 0; mismatch: - spin_unlock_irqrestore(&desc->lock, flags); if (!(new->flags & IRQF_PROBE_SHARED)) { printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); + if (old_name) + printk(KERN_ERR "current handler: %s\n", old_name); dump_stack(); } + spin_unlock_irqrestore(&desc->lock, flags); return -EBUSY; } @@ -427,8 +432,7 @@ EXPORT_SYMBOL(free_irq); * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * */ -int request_irq(unsigned int irq, - irqreturn_t (*handler)(int, void *, struct pt_regs *), +int request_irq(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; @@ -475,4 +479,3 @@ int request_irq(unsigned int irq, return retval; } EXPORT_SYMBOL(request_irq); - diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index a57ebe9fa6f6..4baa3bbcd25a 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,17 +7,17 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) unsigned long flags; spin_lock_irqsave(&desc->lock, flags); - desc->move_irq = 1; + desc->status |= IRQ_MOVE_PENDING; irq_desc[irq].pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } -void move_native_irq(int irq) +void move_masked_irq(int irq) { struct irq_desc *desc = irq_desc + irq; cpumask_t tmp; - if (likely(!desc->move_irq)) + if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; /* @@ -28,7 +28,7 @@ void move_native_irq(int irq) return; } - desc->move_irq = 0; + desc->status &= ~IRQ_MOVE_PENDING; if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) return; @@ -48,15 +48,29 @@ void move_native_irq(int irq) * when an active trigger is comming in. This could * cause some ioapics to mal-function. * Being paranoid i guess! + * + * For correct operation this depends on the caller + * masking the irqs. */ if (likely(!cpus_empty(tmp))) { - if (likely(!(desc->status & IRQ_DISABLED))) - desc->chip->disable(irq); - desc->chip->set_affinity(irq,tmp); - - if (likely(!(desc->status & IRQ_DISABLED))) - desc->chip->enable(irq); } cpus_clear(irq_desc[irq].pending_mask); } + +void move_native_irq(int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + if (likely(!(desc->status & IRQ_MOVE_PENDING))) + return; + + if (likely(!(desc->status & IRQ_DISABLED))) + desc->chip->disable(irq); + + move_masked_irq(irq); + + if (likely(!(desc->status & IRQ_DISABLED))) + desc->chip->enable(irq); +} + diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 607c7809ad01..61f5c717a8f5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -54,10 +54,11 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned int irq = (int)(long)data, full_count = count, err; cpumask_t new_value, tmp; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) + if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + CHECK_IRQ_PER_CPU(irq_desc[irq].status)) return -EIO; - err = cpumask_parse(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, new_value); if (err) return err; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 35f10f7ff94a..5bfeaed7e487 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg) clear_bit(irq, irqs_resend); desc = irq_desc + irq; local_irq_disable(); - desc->handle_irq(irq, desc, NULL); + desc->handle_irq(irq, desc); local_irq_enable(); } } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 417e98092cf2..543ea2e5ad93 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -16,7 +16,7 @@ static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. */ -static int misrouted_irq(int irq, struct pt_regs *regs) +static int misrouted_irq(int irq) { int i; int ok = 0; @@ -49,7 +49,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) while (action) { /* Only shared IRQ handlers are safe to call */ if (action->flags & IRQF_SHARED) { - if (action->handler(i, action->dev_id, regs) == + if (action->handler(i, action->dev_id) == IRQ_HANDLED) ok = 1; } @@ -70,7 +70,7 @@ static int misrouted_irq(int irq, struct pt_regs *regs) */ work = 1; spin_unlock(&desc->lock); - handle_IRQ_event(i, regs, action); + handle_IRQ_event(i, action); spin_lock(&desc->lock); desc->status &= ~IRQ_PENDING; } @@ -136,7 +136,7 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) } void note_interrupt(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret, struct pt_regs *regs) + irqreturn_t action_ret) { if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; @@ -147,7 +147,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* Don't punish working computers */ if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { - int ok = misrouted_irq(irq, regs); + int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab16a5a4cfe9..6f294ff4f9ee 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -20,6 +20,7 @@ #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ #include <linux/mm.h> +#include <linux/ctype.h> #include <asm/sections.h> @@ -30,14 +31,14 @@ #endif /* These will be re-linked against their real values during the second link stage */ -extern unsigned long kallsyms_addresses[] __attribute__((weak)); -extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); -extern u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __attribute__((weak)); +extern const unsigned long kallsyms_num_syms __attribute__((weak)); +extern const u8 kallsyms_names[] __attribute__((weak)); -extern u8 kallsyms_token_table[] __attribute__((weak)); -extern u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __attribute__((weak)); +extern const u16 kallsyms_token_index[] __attribute__((weak)); -extern unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __attribute__((weak)); static inline int is_kernel_inittext(unsigned long addr) { @@ -69,12 +70,21 @@ static inline int is_kernel(unsigned long addr) return in_gate_area_no_task(addr); } +static int is_ksym_addr(unsigned long addr) +{ + if (all_var) + return is_kernel(addr); + + return is_kernel_text(addr) || is_kernel_inittext(addr) || + is_kernel_extratext(addr); +} + /* expand a compressed symbol data into the resulting uncompressed string, given the offset to where the symbol is in the compressed stream */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) { int len, skipped_first = 0; - u8 *tptr, *data; + const u8 *tptr, *data; /* get the compressed symbol length from the first symbol byte */ data = &kallsyms_names[off]; @@ -122,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off) * kallsyms array */ static unsigned int get_symbol_offset(unsigned long pos) { - u8 *name; + const u8 *name; int i; /* use the closest marker we have. We have markers every 256 positions, @@ -154,7 +164,73 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); + +static unsigned long get_symbol_pos(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset) +{ + unsigned long symbol_start = 0, symbol_end = 0; + unsigned long i, low, high, mid; + + /* This kernel should never had been booted. */ + BUG_ON(!kallsyms_addresses); + + /* do a binary search on the sorted kallsyms_addresses array */ + low = 0; + high = kallsyms_num_syms; + + while (high - low > 1) { + mid = (low + high) / 2; + if (kallsyms_addresses[mid] <= addr) + low = mid; + else + high = mid; + } + + /* + * search for the first aliased symbol. Aliased + * symbols are symbols with the same address + */ + while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + --low; + + symbol_start = kallsyms_addresses[low]; + + /* Search for next non-aliased symbol */ + for (i = low + 1; i < kallsyms_num_syms; i++) { + if (kallsyms_addresses[i] > symbol_start) { + symbol_end = kallsyms_addresses[i]; + break; + } + } + + /* if we found no next symbol, we use the end of the section */ + if (!symbol_end) { + if (is_kernel_inittext(addr)) + symbol_end = (unsigned long)_einittext; + else if (all_var) + symbol_end = (unsigned long)_end; + else + symbol_end = (unsigned long)_etext; + } + + *symbolsize = symbol_end - symbol_start; + *offset = addr - symbol_start; + + return low; +} + +/* + * Lookup an address but don't bother to find any names. + */ +int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, + unsigned long *offset) +{ + if (is_ksym_addr(addr)) + return !!get_symbol_pos(addr, symbolsize, offset); + + return !!module_address_lookup(addr, symbolsize, offset, NULL); +} /* * Lookup an address @@ -168,57 +244,18 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { - unsigned long i, low, high, mid; const char *msym; - /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); - namebuf[KSYM_NAME_LEN] = 0; namebuf[0] = 0; - if ((all_var && is_kernel(addr)) || - (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) || - is_kernel_extratext(addr)))) { - unsigned long symbol_end = 0; - - /* do a binary search on the sorted kallsyms_addresses array */ - low = 0; - high = kallsyms_num_syms; - - while (high-low > 1) { - mid = (low + high) / 2; - if (kallsyms_addresses[mid] <= addr) low = mid; - else high = mid; - } - - /* search for the first aliased symbol. Aliased symbols are - symbols with the same address */ - while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low]) - --low; + if (is_ksym_addr(addr)) { + unsigned long pos; + pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(low), namebuf); - - /* Search for next non-aliased symbol */ - for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > kallsyms_addresses[low]) { - symbol_end = kallsyms_addresses[i]; - break; - } - } - - /* if we found no next symbol, we use the end of the section */ - if (!symbol_end) { - if (is_kernel_inittext(addr)) - symbol_end = (unsigned long)_einittext; - else - symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext; - } - - *symbolsize = symbol_end - kallsyms_addresses[low]; + kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); *modname = NULL; - *offset = addr - kallsyms_addresses[low]; return namebuf; } @@ -265,13 +302,6 @@ struct kallsym_iter char name[KSYM_NAME_LEN+1]; }; -/* Only label it "global" if it is exported. */ -static void upcase_if_global(struct kallsym_iter *iter) -{ - if (is_exported(iter->name, iter->owner)) - iter->type += 'A' - 'a'; -} - static int get_ksymbol_mod(struct kallsym_iter *iter) { iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, @@ -280,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) if (iter->owner == NULL) return 0; - upcase_if_global(iter); + /* Label it "global" if it is exported, "local" if not exported. */ + iter->type = is_exported(iter->name, iter->owner) + ? toupper(iter->type) : tolower(iter->type); + return 1; } @@ -365,7 +398,7 @@ static int s_show(struct seq_file *m, void *p) return 0; } -static struct seq_operations kallsyms_op = { +static const struct seq_operations kallsyms_op = { .start = s_start, .next = s_next, .stop = s_stop, @@ -400,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static struct file_operations kallsyms_operations = { +static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/kexec.c b/kernel/kexec.c index 50087ecf337e..2a59c8a01ae0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -20,6 +20,8 @@ #include <linux/syscalls.h> #include <linux/ioport.h> #include <linux/hardirq.h> +#include <linux/elf.h> +#include <linux/elfcore.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -40,7 +42,7 @@ struct resource crashk_res = { int kexec_should_crash(struct task_struct *p) { - if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) + if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) return 1; return 0; } @@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, /* Allocate a controlling structure */ result = -ENOMEM; - image = kmalloc(sizeof(*image), GFP_KERNEL); + image = kzalloc(sizeof(*image), GFP_KERNEL); if (!image) goto out; - memset(image, 0, sizeof(*image)); image->head = 0; image->entry = &image->head; image->last_entry = &image->head; @@ -851,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image, memset(ptr + uchunk, 0, mchunk - uchunk); } result = copy_from_user(ptr, buf, uchunk); + kexec_flush_icache_page(page); kunmap(page); if (result) { result = (result < 0) ? result : -EIO; @@ -995,7 +997,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: - xchg(&kexec_lock, 0); /* Release the mutex */ + locked = xchg(&kexec_lock, 0); /* Release the mutex */ + BUG_ON(!locked); kimage_free(image); return result; @@ -1061,10 +1064,65 @@ void crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - xchg(&kexec_lock, 0); + locked = xchg(&kexec_lock, 0); + BUG_ON(!locked); } } +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= NR_CPUS)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32*)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + final_note(buf); +} + static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 64ab045c3d9d..5d1d907378a2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo, len = min(len, fifo->size - fifo->in + fifo->out); + /* + * Ensure that we sample the fifo->out index -before- we + * start putting bytes into the kfifo. + */ + + smp_mb(); + /* first put the data starting from fifo->in to buffer end */ l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); @@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo, /* then put the rest (if any) at the beginning of the buffer */ memcpy(fifo->buffer, buffer + l, len - l); + /* + * Ensure that we add the bytes to the kfifo -before- + * we update the fifo->in index. + */ + + smp_wmb(); + fifo->in += len; return len; @@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo, len = min(len, fifo->in - fifo->out); + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. + */ + + smp_rmb(); + /* first get the data from fifo->out until the end of the buffer */ l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); @@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo, /* then get the rest (if any) from the beginning of the buffer */ memcpy(buffer + l, fifo->buffer, len - l); + /* + * Ensure that we remove the bytes from the kfifo -before- + * we update the fifo->out index. + */ + + smp_mb(); + fifo->out += len; return len; diff --git a/kernel/kmod.c b/kernel/kmod.c index 5c470c57fb57..3a7379aa31ca 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -18,8 +18,6 @@ call_usermodehelper wait flag, and remove exec_usermodehelper. Rusty Russell <rusty@rustcorp.com.au> Jan 2003 */ -#define __KERNEL_SYSCALLS__ - #include <linux/module.h> #include <linux/sched.h> #include <linux/syscalls.h> @@ -27,7 +25,7 @@ #include <linux/kmod.h> #include <linux/smp_lock.h> #include <linux/slab.h> -#include <linux/namespace.h> +#include <linux/mnt_namespace.h> #include <linux/completion.h> #include <linux/file.h> #include <linux/workqueue.h> @@ -35,6 +33,7 @@ #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/resource.h> #include <asm/uaccess.h> extern int max_threads; @@ -115,6 +114,7 @@ EXPORT_SYMBOL(request_module); #endif /* CONFIG_KMOD */ struct subprocess_info { + struct work_struct work; struct completion *complete; char *path; char **argv; @@ -122,6 +122,7 @@ struct subprocess_info { struct key *ring; int wait; int retval; + struct file *stdin; }; /* @@ -145,12 +146,30 @@ static int ____call_usermodehelper(void *data) key_put(old_session); + /* Install input pipe when needed */ + if (sub_info->stdin) { + struct files_struct *f = current->files; + struct fdtable *fdt; + /* no races because files should be private here */ + sys_close(0); + fd_install(0, sub_info->stdin); + spin_lock(&f->file_lock); + fdt = files_fdtable(f); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&f->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0}; + } + /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); retval = -EPERM; if (current->fs->root) - retval = execve(sub_info->path, sub_info->argv,sub_info->envp); + retval = kernel_execve(sub_info->path, + sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; @@ -176,6 +195,8 @@ static int wait_for_helper(void *data) if (pid < 0) { sub_info->retval = pid; } else { + int ret; + /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. @@ -185,7 +206,15 @@ static int wait_for_helper(void *data) * * Thus the __user pointer cast is valid here. */ - sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL); + sys_wait4(pid, (int __user *)&ret, 0, NULL); + + /* + * If ret is 0, either ____call_usermodehelper failed and the + * real error code is already in sub_info->retval or + * sub_info->retval is 0 anyway, so don't mess with it then. + */ + if (ret) + sub_info->retval = ret; } complete(sub_info->complete); @@ -193,9 +222,10 @@ static int wait_for_helper(void *data) } /* This is run by khelper thread */ -static void __call_usermodehelper(void *data) +static void __call_usermodehelper(struct work_struct *work) { - struct subprocess_info *sub_info = data; + struct subprocess_info *sub_info = + container_of(work, struct subprocess_info, work); pid_t pid; int wait = sub_info->wait; @@ -236,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, { DECLARE_COMPLETION_ONSTACK(done); struct subprocess_info sub_info = { + .work = __WORK_INITIALIZER(sub_info.work, + __call_usermodehelper), .complete = &done, .path = path, .argv = argv, @@ -244,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, .wait = wait, .retval = 0, }; - DECLARE_WORK(work, __call_usermodehelper, &sub_info); if (!khelper_wq) return -EBUSY; @@ -252,12 +283,51 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, if (path[0] == '\0') return 0; - queue_work(khelper_wq, &work); + queue_work(khelper_wq, &sub_info.work); wait_for_completion(&done); return sub_info.retval; } EXPORT_SYMBOL(call_usermodehelper_keys); +int call_usermodehelper_pipe(char *path, char **argv, char **envp, + struct file **filp) +{ + DECLARE_COMPLETION(done); + struct subprocess_info sub_info = { + .work = __WORK_INITIALIZER(sub_info.work, + __call_usermodehelper), + .complete = &done, + .path = path, + .argv = argv, + .envp = envp, + .retval = 0, + }; + struct file *f; + + if (!khelper_wq) + return -EBUSY; + + if (path[0] == '\0') + return 0; + + f = create_write_pipe(); + if (IS_ERR(f)) + return PTR_ERR(f); + *filp = f; + + f = create_read_pipe(f); + if (IS_ERR(f)) { + free_write_pipe(*filp); + return PTR_ERR(f); + } + sub_info.stdin = f; + + queue_work(khelper_wq, &sub_info.work); + wait_for_completion(&done); + return sub_info.retval; +} +EXPORT_SYMBOL(call_usermodehelper_pipe); + void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3f57dfdc8f92..17ec4afb0994 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -37,6 +37,8 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <linux/kallsyms.h> +#include <linux/freezer.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> @@ -45,6 +47,16 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +/* + * Some oddball architectures like 64bit powerpc have function descriptors + * so this must be overridable. + */ +#ifndef kprobe_lookup_name +#define kprobe_lookup_name(name, addr) \ + addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) +#endif + static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; @@ -72,9 +84,36 @@ struct kprobe_insn_page { kprobe_opcode_t *insns; /* Page of instruction slots */ char slot_used[INSNS_PER_PAGE]; int nused; + int ngarbage; }; static struct hlist_head kprobe_insn_pages; +static int kprobe_garbage_slots; +static int collect_garbage_slots(void); + +static int __kprobes check_safety(void) +{ + int ret = 0; +#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) + ret = freeze_processes(); + if (ret == 0) { + struct task_struct *p, *q; + do_each_thread(p, q) { + if (p != current && p->state == TASK_RUNNING && + p->pid != 0) { + printk("Check failed: %s is running\n",p->comm); + ret = -1; + goto loop_end; + } + } while_each_thread(p, q); + } +loop_end: + thaw_processes(); +#else + synchronize_sched(); +#endif + return ret; +} /** * get_insn_slot() - Find a slot on an executable page for an instruction. @@ -85,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) struct kprobe_insn_page *kip; struct hlist_node *pos; + retry: hlist_for_each(pos, &kprobe_insn_pages) { kip = hlist_entry(pos, struct kprobe_insn_page, hlist); if (kip->nused < INSNS_PER_PAGE) { @@ -101,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) } } - /* All out of space. Need to allocate a new page. Use slot 0.*/ + /* If there are any garbage slots, collect it and try again. */ + if (kprobe_garbage_slots && collect_garbage_slots() == 0) { + goto retry; + } + /* All out of space. Need to allocate a new page. Use slot 0. */ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); if (!kip) { return NULL; @@ -122,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) memset(kip->slot_used, 0, INSNS_PER_PAGE); kip->slot_used[0] = 1; kip->nused = 1; + kip->ngarbage = 0; return kip->insns; } -void __kprobes free_insn_slot(kprobe_opcode_t *slot) +/* Return 1 if all garbages are collected, otherwise 0. */ +static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) +{ + kip->slot_used[idx] = 0; + kip->nused--; + if (kip->nused == 0) { + /* + * Page is no longer in use. Free it unless + * it's the last one. We keep the last one + * so as not to have to set it up again the + * next time somebody inserts a probe. + */ + hlist_del(&kip->hlist); + if (hlist_empty(&kprobe_insn_pages)) { + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, + &kprobe_insn_pages); + } else { + module_free(NULL, kip->insns); + kfree(kip); + } + return 1; + } + return 0; +} + +static int __kprobes collect_garbage_slots(void) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos, *next; + + /* Ensure no-one is preepmted on the garbages */ + if (check_safety() != 0) + return -EAGAIN; + + hlist_for_each_safe(pos, next, &kprobe_insn_pages) { + int i; + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->ngarbage == 0) + continue; + kip->ngarbage = 0; /* we will collect all garbages */ + for (i = 0; i < INSNS_PER_PAGE; i++) { + if (kip->slot_used[i] == -1 && + collect_one_slot(kip, i)) + break; + } + } + kprobe_garbage_slots = 0; + return 0; +} + +void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -135,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; - kip->slot_used[i] = 0; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. - */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { - module_free(NULL, kip->insns); - kfree(kip); - } + if (dirty) { + kip->slot_used[i] = -1; + kip->ngarbage++; + } else { + collect_one_slot(kip, i); } - return; + break; } } + if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { + collect_garbage_slots(); + } } #endif @@ -308,7 +394,8 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri) } /* Called with kretprobe_lock held */ -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -320,7 +407,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->free_instances); } else /* Unregistering */ - kfree(ri); + hlist_add_head(&ri->hlist, head); } struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) @@ -336,18 +423,24 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) */ void __kprobes kprobe_flush_task(struct task_struct *tk) { - struct kretprobe_instance *ri; - struct hlist_head *head; + struct kretprobe_instance *ri; + struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; unsigned long flags = 0; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(tk); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri); - } + head = kretprobe_inst_table_head(tk); + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task == tk) + recycle_rp_inst(ri, &empty_rp); + } spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } } static inline void free_rp_inst(struct kretprobe *rp) @@ -447,6 +540,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, struct kprobe *old_p; struct module *probed_mod; + /* + * If we have a symbol_name argument look it up, + * and add it to the address. That way the addr + * field can either be global or relative to a symbol. + */ + if (p->symbol_name) { + if (p->addr) + return -EINVAL; + kprobe_lookup_name(p->symbol_name, p->addr); + } + + if (!p->addr) + return -EINVAL; + p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); + if ((!kernel_text_address((unsigned long) p->addr)) || in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; @@ -488,7 +596,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, (ARCH_INACTIVE_KPROBE_COUNT + 1)) register_page_fault_notifier(&kprobe_page_fault_nb); - arch_arm_kprobe(p); + arch_arm_kprobe(p); out: mutex_unlock(&kprobe_mutex); diff --git a/kernel/kthread.c b/kernel/kthread.c index 4f9c60ef95e8..1db8c72d0d38 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -31,6 +31,8 @@ struct kthread_create_info /* Result passed back to kthread_create() from keventd. */ struct task_struct *result; struct completion done; + + struct work_struct work; }; struct kthread_stop_info @@ -111,9 +113,10 @@ static int kthread(void *_create) } /* We are keventd: create a thread. */ -static void keventd_create_kthread(void *_create) +static void keventd_create_kthread(struct work_struct *work) { - struct kthread_create_info *create = _create; + struct kthread_create_info *create = + container_of(work, struct kthread_create_info, work); int pid; /* We want our own signal handler (we take no signals by default). */ @@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), ...) { struct kthread_create_info create; - DECLARE_WORK(work, keventd_create_kthread, &create); create.threadfn = threadfn; create.data = data; init_completion(&create.started); init_completion(&create.done); + INIT_WORK(&create.work, keventd_create_kthread); /* * The workqueue needs to start up first: */ if (!helper_wq) - work.func(work.data); + create.work.func(&create.work); else { - queue_work(helper_wq, &work); + queue_work(helper_wq, &create.work); wait_for_completion(&create.done); } if (!IS_ERR(create.result)) { diff --git a/kernel/latency.c b/kernel/latency.c new file mode 100644 index 000000000000..e63fcacb61a7 --- /dev/null +++ b/kernel/latency.c @@ -0,0 +1,280 @@ +/* + * latency.c: Explicit system-wide latency-expectation infrastructure + * + * The purpose of this infrastructure is to allow device drivers to set + * latency constraint they have and to collect and summarize these + * expectations globally. The cummulated result can then be used by + * power management and similar users to make decisions that have + * tradoffs with a latency component. + * + * An example user of this are the x86 C-states; each higher C state saves + * more power, but has a higher exit latency. For the idle loop power + * code to make a good decision which C-state to use, information about + * acceptable latencies is required. + * + * An example announcer of latency is an audio driver that knowns it + * will get an interrupt when the hardware has 200 usec of samples + * left in the DMA buffer; in that case the driver can set a latency + * constraint of, say, 150 usec. + * + * Multiple drivers can each announce their maximum accepted latency, + * to keep these appart, a string based identifier is used. + * + * + * (C) Copyright 2006 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/latency.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/jiffies.h> +#include <asm/atomic.h> + +struct latency_info { + struct list_head list; + int usecs; + char *identifier; +}; + +/* + * locking rule: all modifications to current_max_latency and + * latency_list need to be done while holding the latency_lock. + * latency_lock needs to be taken _irqsave. + */ +static atomic_t current_max_latency; +static DEFINE_SPINLOCK(latency_lock); + +static LIST_HEAD(latency_list); +static BLOCKING_NOTIFIER_HEAD(latency_notifier); + +/* + * This function returns the maximum latency allowed, which + * happens to be the minimum of all maximum latencies on the + * list. + */ +static int __find_max_latency(void) +{ + int min = INFINITE_LATENCY; + struct latency_info *info; + + list_for_each_entry(info, &latency_list, list) { + if (info->usecs < min) + min = info->usecs; + } + return min; +} + +/** + * set_acceptable_latency - sets the maximum latency acceptable + * @identifier: string that identifies this driver + * @usecs: maximum acceptable latency for this driver + * + * This function informs the kernel that this device(driver) + * can accept at most usecs latency. This setting is used for + * power management and similar tradeoffs. + * + * This function sleeps and can only be called from process + * context. + * Calling this function with an existing identifier is valid + * and will cause the existing latency setting to be changed. + */ +void set_acceptable_latency(char *identifier, int usecs) +{ + struct latency_info *info, *iter; + unsigned long flags; + int found_old = 0; + + info = kzalloc(sizeof(struct latency_info), GFP_KERNEL); + if (!info) + return; + info->usecs = usecs; + info->identifier = kstrdup(identifier, GFP_KERNEL); + if (!info->identifier) + goto free_info; + + spin_lock_irqsave(&latency_lock, flags); + list_for_each_entry(iter, &latency_list, list) { + if (strcmp(iter->identifier, identifier)==0) { + found_old = 1; + iter->usecs = usecs; + break; + } + } + if (!found_old) + list_add(&info->list, &latency_list); + + if (usecs < atomic_read(¤t_max_latency)) + atomic_set(¤t_max_latency, usecs); + + spin_unlock_irqrestore(&latency_lock, flags); + + blocking_notifier_call_chain(&latency_notifier, + atomic_read(¤t_max_latency), NULL); + + /* + * if we inserted the new one, we're done; otherwise there was + * an existing one so we need to free the redundant data + */ + if (!found_old) + return; + + kfree(info->identifier); +free_info: + kfree(info); +} +EXPORT_SYMBOL_GPL(set_acceptable_latency); + +/** + * modify_acceptable_latency - changes the maximum latency acceptable + * @identifier: string that identifies this driver + * @usecs: maximum acceptable latency for this driver + * + * This function informs the kernel that this device(driver) + * can accept at most usecs latency. This setting is used for + * power management and similar tradeoffs. + * + * This function does not sleep and can be called in any context. + * Trying to use a non-existing identifier silently gets ignored. + * + * Due to the atomic nature of this function, the modified latency + * value will only be used for future decisions; past decisions + * can still lead to longer latencies in the near future. + */ +void modify_acceptable_latency(char *identifier, int usecs) +{ + struct latency_info *iter; + unsigned long flags; + + spin_lock_irqsave(&latency_lock, flags); + list_for_each_entry(iter, &latency_list, list) { + if (strcmp(iter->identifier, identifier) == 0) { + iter->usecs = usecs; + break; + } + } + if (usecs < atomic_read(¤t_max_latency)) + atomic_set(¤t_max_latency, usecs); + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL_GPL(modify_acceptable_latency); + +/** + * remove_acceptable_latency - removes the maximum latency acceptable + * @identifier: string that identifies this driver + * + * This function removes a previously set maximum latency setting + * for the driver and frees up any resources associated with the + * bookkeeping needed for this. + * + * This function does not sleep and can be called in any context. + * Trying to use a non-existing identifier silently gets ignored. + */ +void remove_acceptable_latency(char *identifier) +{ + unsigned long flags; + int newmax = 0; + struct latency_info *iter, *temp; + + spin_lock_irqsave(&latency_lock, flags); + + list_for_each_entry_safe(iter, temp, &latency_list, list) { + if (strcmp(iter->identifier, identifier) == 0) { + list_del(&iter->list); + newmax = iter->usecs; + kfree(iter->identifier); + kfree(iter); + break; + } + } + + /* If we just deleted the system wide value, we need to + * recalculate with a full search + */ + if (newmax == atomic_read(¤t_max_latency)) { + newmax = __find_max_latency(); + atomic_set(¤t_max_latency, newmax); + } + spin_unlock_irqrestore(&latency_lock, flags); +} +EXPORT_SYMBOL_GPL(remove_acceptable_latency); + +/** + * system_latency_constraint - queries the system wide latency maximum + * + * This function returns the system wide maximum latency in + * microseconds. + * + * This function does not sleep and can be called in any context. + */ +int system_latency_constraint(void) +{ + return atomic_read(¤t_max_latency); +} +EXPORT_SYMBOL_GPL(system_latency_constraint); + +/** + * synchronize_acceptable_latency - recalculates all latency decisions + * + * This function will cause a callback to various kernel pieces that + * will make those pieces rethink their latency decisions. This implies + * that if there are overlong latencies in hardware state already, those + * latencies get taken right now. When this call completes no overlong + * latency decisions should be active anymore. + * + * Typical usecase of this is after a modify_acceptable_latency() call, + * which in itself is non-blocking and non-synchronizing. + * + * This function blocks and should not be called with locks held. + */ + +void synchronize_acceptable_latency(void) +{ + blocking_notifier_call_chain(&latency_notifier, + atomic_read(¤t_max_latency), NULL); +} +EXPORT_SYMBOL_GPL(synchronize_acceptable_latency); + +/* + * Latency notifier: this notifier gets called when a non-atomic new + * latency value gets set. The expectation nof the caller of the + * non-atomic set is that when the call returns, future latencies + * are within bounds, so the functions on the notifier list are + * expected to take the overlong latencies immediately, inside the + * callback, and not make a overlong latency decision anymore. + * + * The callback gets called when the new latency value is made + * active so system_latency_constraint() returns the new latency. + */ +int register_latency_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_register(&latency_notifier, nb); +} +EXPORT_SYMBOL_GPL(register_latency_notifier); + +int unregister_latency_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_unregister(&latency_notifier, nb); +} +EXPORT_SYMBOL_GPL(unregister_latency_notifier); + +static __init int latency_init(void) +{ + atomic_set(¤t_max_latency, INFINITE_LATENCY); + /* + * we don't want by default to have longer latencies than 2 ticks, + * since that would cause lost ticks + */ + set_acceptable_latency("kernel", 2*1000000/HZ); + return 0; +} + +module_init(latency_init); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c088e5542e84..b02032476dc2 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -36,6 +36,7 @@ #include <linux/stacktrace.h> #include <linux/debug_locks.h> #include <linux/irqflags.h> +#include <linux/utsname.h> #include <asm/sections.h> @@ -121,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE]; * unique. */ #define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \ + (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ + ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ (key2)) void lockdep_off(void) @@ -139,13 +140,6 @@ void lockdep_on(void) EXPORT_SYMBOL(lockdep_on); -int lockdep_internal(void) -{ - return current->lockdep_recursion != 0; -} - -EXPORT_SYMBOL(lockdep_internal); - /* * Debugging switches: */ @@ -227,17 +221,15 @@ static int save_trace(struct stack_trace *trace) trace->skip = 3; trace->all_contexts = 0; - /* Make sure to not recurse in case the the unwinder needs to tak -e locks. */ - lockdep_off(); save_stack_trace(trace, NULL); - lockdep_on(); trace->max_entries = trace->nr_entries; nr_stack_trace_entries += trace->nr_entries; - if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) + if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) { + __raw_spin_unlock(&hash_lock); return 0; + } if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { __raw_spin_unlock(&hash_lock); @@ -356,7 +348,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 static void print_lock_name(struct lock_class *class) { - char str[128], c1, c2, c3, c4; + char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; const char *name; get_usage_chars(class, &c1, &c2, &c3, &c4); @@ -378,7 +370,7 @@ static void print_lock_name(struct lock_class *class) static void print_lockdep_cache(struct lockdep_map *lock) { const char *name; - char str[128]; + char str[KSYM_NAME_LEN + 1]; name = lock->name; if (!name) @@ -448,7 +440,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth) print_lock_class_header(class, depth); list_for_each_entry(entry, &class->locks_after, entry) { - DEBUG_LOCKS_WARN_ON(!entry->class); + if (DEBUG_LOCKS_WARN_ON(!entry->class)) + return; + print_lock_dependencies(entry->class, depth + 1); printk("%*s ... acquired at:\n",depth,""); @@ -473,7 +467,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 0; entry->class = this; - save_trace(&entry->trace); + if (!save_trace(&entry->trace)) + return 0; /* * Since we never remove from the dependency list, the list can @@ -515,6 +510,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) return 0; } +static void print_kernel_version(void) +{ + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +} + /* * When a circular dependency is detected, print the * header first: @@ -531,6 +533,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) printk("\n=======================================================\n"); printk( "[ INFO: possible circular locking dependency detected ]\n"); + print_kernel_version(); printk( "-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, curr->pid); @@ -553,8 +556,12 @@ static noinline int print_circular_bug_tail(void) if (debug_locks_silent) return 0; + /* hash_lock unlocked by the header */ + __raw_spin_lock(&hash_lock); this.class = check_source->class; - save_trace(&this.trace); + if (!save_trace(&this.trace)) + return 0; + __raw_spin_unlock(&hash_lock); print_circular_bug_entry(&this, 0); printk("\nother info that might help us debug this:\n\n"); @@ -566,6 +573,8 @@ static noinline int print_circular_bug_tail(void) return 0; } +#define RECURSION_LIMIT 40 + static int noinline print_infinite_recursion_bug(void) { __raw_spin_unlock(&hash_lock); @@ -586,7 +595,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) debug_atomic_inc(&nr_cyclic_check_recursions); if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); /* * Check this lock's dependency list: @@ -636,7 +645,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_forwards_checks); @@ -675,7 +684,7 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) if (depth > max_recursion_depth) max_recursion_depth = depth; - if (depth >= 20) + if (depth >= RECURSION_LIMIT) return print_infinite_recursion_bug(); debug_atomic_inc(&nr_find_usage_backwards_checks); @@ -712,6 +721,7 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\n======================================================\n"); printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); + print_kernel_version(); printk( "------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, curr->pid, @@ -793,6 +803,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, printk("\n=============================================\n"); printk( "[ INFO: possible recursive locking detected ]\n"); + print_kernel_version(); printk( "---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, curr->pid); @@ -953,14 +964,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, &prev->class->locks_after, next->acquire_ip); if (!ret) return 0; - /* - * Return value of 2 signals 'dependency already added', - * in that case we dont have to add the backlink either. - */ - if (ret == 2) - return 2; + ret = add_lock_to_list(next->class, prev->class, &next->class->locks_before, next->acquire_ip); + if (!ret) + return 0; /* * Debugging printouts: @@ -1012,7 +1020,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2) { - check_prev_add(curr, hlock, next); + if (!check_prev_add(curr, hlock, next)) + return 0; /* * Stop after the first non-trylock entry, * as non-trylock entries have added their @@ -1068,7 +1077,8 @@ static int static_obj(void *obj) */ for_each_possible_cpu(i) { start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); + end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM + + per_cpu_offset(i); if ((addr >= start) && (addr < end)) return 1; @@ -1103,8 +1113,6 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } -extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); - /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1142,8 +1150,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * (or spin_lock_init()) call - which acts as the key. For static * locks we use the lock object itself as the key. */ - if (sizeof(struct lock_class_key) > sizeof(struct lock_class)) - __error_too_big_MAX_LOCKDEP_SUBCLASSES(); + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); key = lock->key->subkeys + subclass; @@ -1166,11 +1173,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * itself, so actual lookup of the hash should be once per lock object. */ static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass) +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; + unsigned long flags; class = look_up_lock_class(lock, subclass); if (likely(class)) @@ -1192,6 +1200,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) key = lock->key->subkeys + subclass; hash_head = classhashentry(key); + raw_local_irq_save(flags); __raw_spin_lock(&hash_lock); /* * We have to do the hash-walk again, to avoid races @@ -1206,6 +1215,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); debug_locks_off(); printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); printk("turning off the locking correctness validator.\n"); @@ -1228,17 +1238,20 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) if (verbose(class)) { __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) printk("#%d", class->name_version); printk("\n"); dump_stack(); + raw_local_irq_save(flags); __raw_spin_lock(&hash_lock); } out_unlock_set: __raw_spin_unlock(&hash_lock); + raw_local_irq_restore(flags); - if (!subclass) + if (!subclass || force) lock->class_cache = class; DEBUG_LOCKS_WARN_ON(class->subclass != subclass); @@ -1375,6 +1388,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, printk("\n=========================================================\n"); printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); + print_kernel_version(); printk( "---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, curr->pid); @@ -1469,6 +1483,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, printk("\n=================================\n"); printk( "[ INFO: inconsistent lock state ]\n"); + print_kernel_version(); printk( "---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", @@ -1715,6 +1730,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, debug_atomic_dec(&nr_unused_locks); break; default: + __raw_spin_unlock(&hash_lock); debug_locks_off(); WARN_ON(1); return 0; @@ -1924,7 +1940,7 @@ void trace_softirqs_off(unsigned long ip) * Initialize a lock instance's lock-class mapping info: */ void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, int subclass) { if (unlikely(!debug_locks)) return; @@ -1944,6 +1960,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; lock->key = key; lock->class_cache = NULL; + if (subclass) + register_lock_class(lock, subclass, 1); } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -1982,7 +2000,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * Not cached yet or subclass? */ if (unlikely(!class)) { - class = register_lock_class(lock, subclass); + class = register_lock_class(lock, subclass, 0); if (!class) return 0; } @@ -2630,6 +2648,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); static void print_held_locks_bug(struct task_struct *curr) { diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index eab043c83bb2..8ce09bc4613d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -20,7 +20,7 @@ #define MAX_LOCKDEP_KEYS_BITS 11 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) -#define MAX_LOCKDEP_CHAINS_BITS 13 +#define MAX_LOCKDEP_CHAINS_BITS 14 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) /* diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index f6e72eaab3fa..b554b40a4aa6 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations lockdep_ops = { +static const struct seq_operations lockdep_ops = { .start = l_start, .next = l_next, .stop = l_stop, @@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file) return res; } -static struct file_operations proc_lockdep_operations = { +static const struct file_operations proc_lockdep_operations = { .open = lockdep_open, .read = seq_read, .llseek = seq_lseek, @@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file) return single_open(file, lockdep_stats_show, NULL); } -static struct file_operations proc_lockdep_stats_operations = { +static const struct file_operations proc_lockdep_stats_operations = { .open = lockdep_stats_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/module.c b/kernel/module.c index b7fe6e840963..d9eae45d0145 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -34,10 +34,10 @@ #include <linux/err.h> #include <linux/vermagic.h> #include <linux/notifier.h> +#include <linux/sched.h> #include <linux/stop_machine.h> #include <linux/device.h> #include <linux/string.h> -#include <linux/sched.h> #include <linux/mutex.h> #include <linux/unwind.h> #include <asm/uaccess.h> @@ -87,6 +87,12 @@ static inline int strong_try_module_get(struct module *mod) return try_module_get(mod); } +static inline void add_taint_module(struct module *mod, unsigned flag) +{ + add_taint(flag); + mod->taints |= flag; +} + /* A thread that wants to hold a reference to a module only while it * is running can call ths to safely exit. * nfsd and lockd use this. @@ -784,6 +790,19 @@ static struct module_attribute refcnt = { .show = show_refcnt, }; +void module_put(struct module *module) +{ + if (module) { + unsigned int cpu = get_cpu(); + local_dec(&module->ref[cpu].count); + /* Maybe they're waiting for us to drop reference? */ + if (unlikely(!module_is_live(module))) + wake_up_process(module->waiter); + put_cpu(); + } +} +EXPORT_SYMBOL(module_put); + #else /* !CONFIG_MODULE_UNLOAD */ static void print_unload_info(struct seq_file *m, struct module *mod) { @@ -847,11 +866,10 @@ static int check_version(Elf_Shdr *sechdrs, return 0; } /* Not in module's version table. OK, but that taints the kernel. */ - if (!(tainted & TAINT_FORCED_MODULE)) { + if (!(tainted & TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); - add_taint(TAINT_FORCED_MODULE); - } + add_taint_module(mod, TAINT_FORCED_MODULE); return 1; } @@ -909,7 +927,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, unsigned long ret; const unsigned long *crc; - ret = __find_symbol(name, &owner, &crc, mod->license_gplok); + ret = __find_symbol(name, &owner, &crc, + !(mod->taints & TAINT_PROPRIETARY_MODULE)); if (ret) { /* use_module can fail due to OOM, or module unloading */ if (!check_version(sechdrs, versindex, name, mod, crc) || @@ -933,6 +952,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr, return sprintf(buf, "0x%lx\n", sattr->address); } +static void free_sect_attrs(struct module_sect_attrs *sect_attrs) +{ + int section; + + for (section = 0; section < sect_attrs->nsections; section++) + kfree(sect_attrs->attrs[section].name); + kfree(sect_attrs); +} + static void add_sect_attrs(struct module *mod, unsigned int nsect, char *secstrings, Elf_Shdr *sechdrs) { @@ -949,21 +977,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, + nloaded * sizeof(sect_attrs->attrs[0]), sizeof(sect_attrs->grp.attrs[0])); size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); - if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) + sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); + if (sect_attrs == NULL) return; /* Setup section attributes. */ sect_attrs->grp.name = "sections"; sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; for (i = 0; i < nsect; i++) { if (! (sechdrs[i].sh_flags & SHF_ALLOC)) continue; sattr->address = sechdrs[i].sh_addr; - strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, - MODULE_SECT_NAME_LEN); + sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, + GFP_KERNEL); + if (sattr->name == NULL) + goto out; + sect_attrs->nsections++; sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; @@ -979,7 +1012,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, mod->sect_attrs = sect_attrs; return; out: - kfree(sect_attrs); + free_sect_attrs(sect_attrs); } static void remove_sect_attrs(struct module *mod) @@ -989,13 +1022,13 @@ static void remove_sect_attrs(struct module *mod) &mod->sect_attrs->grp); /* We are positive that no one is using any sect attrs * at this point. Deallocate immediately. */ - kfree(mod->sect_attrs); + free_sect_attrs(mod->sect_attrs); mod->sect_attrs = NULL; } } - #else + static inline void add_sect_attrs(struct module *mod, unsigned int nsect, char *sectstrings, Elf_Shdr *sechdrs) { @@ -1066,22 +1099,35 @@ static int mod_sysfs_setup(struct module *mod, goto out; kobj_set_kset_s(&mod->mkobj, module_subsys); mod->mkobj.mod = mod; - err = kobject_register(&mod->mkobj.kobj); + + /* delay uevent until full sysfs population */ + kobject_init(&mod->mkobj.kobj); + err = kobject_add(&mod->mkobj.kobj); if (err) goto out; + mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); + if (!mod->drivers_dir) + goto out_unreg; + err = module_param_sysfs_setup(mod, kparam, num_params); if (err) - goto out_unreg; + goto out_unreg_drivers; err = module_add_modinfo_attrs(mod); if (err) - goto out_unreg; + goto out_unreg_param; + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; +out_unreg_drivers: + kobject_unregister(mod->drivers_dir); +out_unreg_param: + module_param_sysfs_remove(mod); out_unreg: - kobject_unregister(&mod->mkobj.kobj); + kobject_del(&mod->mkobj.kobj); + kobject_put(&mod->mkobj.kobj); out: return err; } @@ -1090,6 +1136,7 @@ static void mod_kobject_remove(struct module *mod) { module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); + kobject_unregister(mod->drivers_dir); kobject_unregister(&mod->mkobj.kobj); } @@ -1320,11 +1367,11 @@ static void set_license(struct module *mod, const char *license) if (!license) license = "unspecified"; - mod->license_gplok = license_is_gpl_compatible(license); - if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { - printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", - mod->name, license); - add_taint(TAINT_PROPRIETARY_MODULE); + if (!license_is_gpl_compatible(license)) { + if (!(tainted & TAINT_PROPRIETARY_MODULE)) + printk(KERN_WARNING "%s: module license '%s' taints " + "kernel.\n", mod->name, license); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); } } @@ -1603,7 +1650,7 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - add_taint(TAINT_FORCED_MODULE); + add_taint_module(mod, TAINT_FORCED_MODULE); printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1700,7 +1747,7 @@ static struct module *load_module(void __user *umod, if (strcmp(mod->name, "ndiswrapper") == 0) add_taint(TAINT_PROPRIETARY_MODULE); if (strcmp(mod->name, "driverloader") == 0) - add_taint(TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); @@ -1745,7 +1792,7 @@ static struct module *load_module(void __user *umod, (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); - add_taint(TAINT_FORCED_MODULE); + add_taint_module(mod, TAINT_FORCED_MODULE); } #endif @@ -2018,7 +2065,8 @@ const char *module_address_lookup(unsigned long addr, list_for_each_entry(mod, &modules, list) { if (within(addr, mod->module_init, mod->init_size) || within(addr, mod->module_core, mod->core_size)) { - *modname = mod->name; + if (modname) + *modname = mod->name; return get_ksymbol(mod, addr, size, offset); } } @@ -2109,9 +2157,33 @@ static void m_stop(struct seq_file *m, void *p) mutex_unlock(&module_mutex); } +static char *taint_flags(unsigned int taints, char *buf) +{ + int bx = 0; + + if (taints) { + buf[bx++] = '('; + if (taints & TAINT_PROPRIETARY_MODULE) + buf[bx++] = 'P'; + if (taints & TAINT_FORCED_MODULE) + buf[bx++] = 'F'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + buf[bx++] = ')'; + } + buf[bx] = '\0'; + + return buf; +} + static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); + char buf[8]; + seq_printf(m, "%s %lu", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); @@ -2124,6 +2196,10 @@ static int m_show(struct seq_file *m, void *p) /* Used by oprofile and other similar tools. */ seq_printf(m, " 0x%p", mod->module_core); + /* Taints info */ + if (mod->taints) + seq_printf(m, " %s", taint_flags(mod->taints, buf)); + seq_printf(m, "\n"); return 0; } @@ -2133,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p) Where refcount is a number or -, and deps is a comma-separated list of depends or -. */ -struct seq_operations modules_op = { +const struct seq_operations modules_op = { .start = m_start, .next = m_next, .stop = m_stop, @@ -2216,20 +2292,24 @@ struct module *module_text_address(unsigned long addr) void print_modules(void) { struct module *mod; + char buf[8]; printk("Modules linked in:"); list_for_each_entry(mod, &modules, list) - printk(" %s", mod->name); + printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); printk("\n"); } void module_add_driver(struct module *mod, struct device_driver *drv) { + int no_warn; + if (!mod || !drv) return; - /* Don't check return code; this call is idempotent */ - sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); + /* Don't check return codes; these calls are idempotent */ + no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); + no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name); } EXPORT_SYMBOL(module_add_driver); @@ -2238,6 +2318,8 @@ void module_remove_driver(struct device_driver *drv) if (!drv) return; sysfs_remove_link(&drv->kobj, "module"); + if (drv->owner && drv->owner->drivers_dir) + sysfs_remove_link(drv->owner->drivers_dir, drv->name); } EXPORT_SYMBOL(module_remove_driver); diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index e3203c654dda..841539d72c55 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, void debug_mutex_unlock(struct mutex *lock) { + if (unlikely(!debug_locks)) + return; + DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); DEBUG_LOCKS_WARN_ON(lock->magic != lock); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); @@ -91,7 +94,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key); + lockdep_init_map(&lock->dep_map, name, key, 0); #endif lock->owner = NULL; lock->magic = lock; diff --git a/kernel/mutex.c b/kernel/mutex.c index 8c71cf72a497..e7cbbb82765b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_nested); + +int __sched +mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); +} + +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); #endif /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c new file mode 100644 index 000000000000..e2ce748e96af --- /dev/null +++ b/kernel/nsproxy.c @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Author: Serge Hallyn <serue@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * Jun 2006 - namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/nsproxy.h> +#include <linux/init_task.h> +#include <linux/mnt_namespace.h> +#include <linux/utsname.h> +#include <linux/pid_namespace.h> + +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); + +static inline void get_nsproxy(struct nsproxy *ns) +{ + atomic_inc(&ns->count); +} + +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + +/* + * creates a copy of "orig" with refcount 1. + * This does not grab references to the contained namespaces, + * so that needs to be done by dup_namespaces. + */ +static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns; + + ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); + if (ns) { + atomic_set(&ns->count, 1); + ns->id = -1; + } + return ns; +} + +/* + * copies the nsproxy, setting refcount to 1, and grabbing a + * reference to all contained namespaces. Called from + * sys_unshare() + */ +struct nsproxy *dup_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns = clone_namespaces(orig); + + if (ns) { + if (ns->mnt_ns) + get_mnt_ns(ns->mnt_ns); + if (ns->uts_ns) + get_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + get_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + get_pid_ns(ns->pid_ns); + } + + return ns; +} + +/* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. + */ +int copy_namespaces(int flags, struct task_struct *tsk) +{ + struct nsproxy *old_ns = tsk->nsproxy; + struct nsproxy *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_nsproxy(old_ns); + + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + + new_ns = clone_namespaces(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->nsproxy = new_ns; + + err = copy_mnt_ns(flags, tsk); + if (err) + goto out_ns; + + err = copy_utsname(flags, tsk); + if (err) + goto out_uts; + + err = copy_ipcs(flags, tsk); + if (err) + goto out_ipc; + + err = copy_pid_ns(flags, tsk); + if (err) + goto out_pid; + +out: + put_nsproxy(old_ns); + return err; + +out_pid: + if (new_ns->ipc_ns) + put_ipc_ns(new_ns->ipc_ns); +out_ipc: + if (new_ns->uts_ns) + put_uts_ns(new_ns->uts_ns); +out_uts: + if (new_ns->mnt_ns) + put_mnt_ns(new_ns->mnt_ns); +out_ns: + tsk->nsproxy = old_ns; + kfree(new_ns); + goto out; +} + +void free_nsproxy(struct nsproxy *ns) +{ + if (ns->mnt_ns) + put_mnt_ns(ns->mnt_ns); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + put_pid_ns(ns->pid_ns); + kfree(ns); +} diff --git a/kernel/panic.c b/kernel/panic.c index 6ceb664fb52a..525e365f7239 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -21,7 +21,6 @@ #include <linux/debug_locks.h> int panic_on_oops; -int panic_on_unrecovered_nmi; int tainted; static int pause_on_oops; static int pause_on_oops_flag; diff --git a/kernel/params.c b/kernel/params.c index 91aea7aa532e..f406655d6653 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name, unsigned int name_skip) { struct module_kobject *mk; + int ret; mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); @@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name, mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); kobject_set_name(&mk->kobj, name); - kobject_register(&mk->kobj); + ret = kobject_register(&mk->kobj); + BUG_ON(ret < 0); /* no need to keep the kobject if no parameter is exported */ if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { @@ -684,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL); */ static int __init param_sysfs_init(void) { - subsystem_register(&module_subsys); + int ret; + + ret = subsystem_register(&module_subsys); + if (ret < 0) { + printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", + __FILE__, __LINE__, ret); + return ret; + } param_sysfs_builtin(); return 0; } -__initcall(param_sysfs_init); +subsys_initcall(param_sysfs_init); EXPORT_SYMBOL(param_set_byte); EXPORT_SYMBOL(param_get_byte); diff --git a/kernel/pid.c b/kernel/pid.c index 93e212f20671..2efe9d8d367b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -26,24 +26,29 @@ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/hash.h> +#include <linux/pid_namespace.h> #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; static int pidhash_shift; -static kmem_cache_t *pid_cachep; +static struct kmem_cache *pid_cachep; int pid_max = PID_MAX_DEFAULT; -int last_pid; #define RESERVED_PIDS 300 int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) + +static inline int mk_pid(struct pid_namespace *pid_ns, + struct pidmap *map, int off) +{ + return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; +} + #define find_next_offset(map, off) \ find_next_zero_bit((map)->page, BITS_PER_PAGE, off) @@ -53,13 +58,16 @@ int pid_max_max = PID_MAX_LIMIT; * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -typedef struct pidmap { - atomic_t nr_free; - void *page; -} pidmap_t; - -static pidmap_t pidmap_array[PIDMAP_ENTRIES] = - { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; +struct pid_namespace init_pid_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .pidmap = { + [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } + }, + .last_pid = 0, + .child_reaper = &init_task +}; /* * Note: disable interrupts while the pidmap_lock is held as an @@ -74,40 +82,41 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] = * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(int pid) +static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) { - pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); } -static int alloc_pidmap(void) +static int alloc_pidmap(struct pid_namespace *pid_ns) { - int i, offset, max_scan, pid, last = last_pid; - pidmap_t *map; + int i, offset, max_scan, pid, last = pid_ns->last_pid; + struct pidmap *map; pid = last + 1; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; - map = &pidmap_array[pid/BITS_PER_PAGE]; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* * Free the page if someone raced with us * installing it: */ spin_lock_irq(&pidmap_lock); if (map->page) - free_page(page); + kfree(page); else - map->page = (void *)page; + map->page = page; spin_unlock_irq(&pidmap_lock); if (unlikely(!map->page)) break; @@ -116,11 +125,11 @@ static int alloc_pidmap(void) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - last_pid = pid; + pid_ns->last_pid = pid; return pid; } offset = find_next_offset(map, offset); - pid = mk_pid(map, offset); + pid = mk_pid(pid_ns, map, offset); /* * find_next_offset() found a bit, the pid from it * is in-bounds, and if we fell back to the last @@ -131,16 +140,34 @@ static int alloc_pidmap(void) (i != max_scan || pid < last || !((last+1) & BITS_PER_PAGE_MASK))); } - if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) { + if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; offset = 0; } else { - map = &pidmap_array[0]; + map = &pid_ns->pidmap[0]; offset = RESERVED_PIDS; if (unlikely(last == offset)) break; } - pid = mk_pid(map, offset); + pid = mk_pid(pid_ns, map, offset); + } + return -1; +} + +static int next_pidmap(struct pid_namespace *pid_ns, int last) +{ + int offset; + struct pidmap *map, *end; + + offset = (last + 1) & BITS_PER_PAGE_MASK; + map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; + end = &pid_ns->pidmap[PIDMAP_ENTRIES]; + for (; map < end; map++, offset = 0) { + if (unlikely(!map->page)) + continue; + offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); + if (offset < BITS_PER_PAGE) + return mk_pid(pid_ns, map, offset); } return -1; } @@ -153,6 +180,7 @@ fastcall void put_pid(struct pid *pid) atomic_dec_and_test(&pid->count)) kmem_cache_free(pid_cachep, pid); } +EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { @@ -169,7 +197,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(pid->nr); + free_pidmap(current->nsproxy->pid_ns, pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -183,7 +211,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(); + nr = alloc_pidmap(current->nsproxy->pid_ns); if (nr < 0) goto out_free; @@ -217,15 +245,13 @@ struct pid * fastcall find_pid(int nr) } return NULL; } +EXPORT_SYMBOL_GPL(find_pid); int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) { struct pid_link *link; struct pid *pid; - WARN_ON(!task->pid); /* to be removed soon */ - WARN_ON(!nr); /* to be removed soon */ - link = &task->pids[type]; link->pid = pid = find_pid(nr); hlist_add_head_rcu(&link->node, &pid->tasks[type]); @@ -252,6 +278,15 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type) free_pid(pid); } +/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ +void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, + enum pid_type type) +{ + new->pids[type].pid = old->pids[type].pid; + hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); + old->pids[type].pid = NULL; +} + struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; @@ -274,6 +309,15 @@ struct task_struct *find_task_by_pid_type(int type, int nr) EXPORT_SYMBOL(find_task_by_pid_type); +struct pid *get_task_pid(struct task_struct *task, enum pid_type type) +{ + struct pid *pid; + rcu_read_lock(); + pid = get_pid(task->pids[type].pid); + rcu_read_unlock(); + return pid; +} + struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; @@ -297,6 +341,46 @@ struct pid *find_get_pid(pid_t nr) } /* + * Used by proc to find the first pid that is greater then or equal to nr. + * + * If there is a pid at nr this function is exactly the same as find_pid. + */ +struct pid *find_ge_pid(int nr) +{ + struct pid *pid; + + do { + pid = find_pid(nr); + if (pid) + break; + nr = next_pidmap(current->nsproxy->pid_ns, nr); + } while (nr > 0); + + return pid; +} +EXPORT_SYMBOL_GPL(find_get_pid); + +int copy_pid_ns(int flags, struct task_struct *tsk) +{ + struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_pid_ns(old_ns); + return err; +} + +void free_pid_ns(struct kref *kref) +{ + struct pid_namespace *ns; + + ns = container_of(kref, struct pid_namespace, kref); + kfree(ns); +} + +/* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. @@ -323,10 +407,10 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, pidmap_array->page); - atomic_dec(&pidmap_array->nr_free); + set_bit(0, init_pid_ns.pidmap[0].page); + atomic_dec(&init_pid_ns.pidmap[0].nr_free); pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index d38d9ec3276c..7c3e1e6dfb5b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -88,6 +88,19 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, } /* + * Divide and limit the result to res >= 1 + * + * This is necessary to prevent signal delivery starvation, when the result of + * the division would be rounded down to 0. + */ +static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) +{ + cputime_t res = cputime_div(time, div); + + return max_t(cputime_t, res, 1); +} + +/* * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ @@ -483,8 +496,8 @@ static void process_timer_rebalance(struct task_struct *p, BUG(); break; case CPUCLOCK_PROF: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(prof_ticks(t), left); @@ -498,8 +511,8 @@ static void process_timer_rebalance(struct task_struct *p, } while (t != p); break; case CPUCLOCK_VIRT: - left = cputime_div(cputime_sub(expires.cpu, val.cpu), - nthreads); + left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (likely(!(t->flags & PF_EXITING))) { ticks = cputime_add(virt_ticks(t), left); @@ -515,6 +528,7 @@ static void process_timer_rebalance(struct task_struct *p, case CPUCLOCK_SCHED: nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); + nsleft = max_t(unsigned long long, nsleft, 1); do { if (likely(!(t->flags & PF_EXITING))) { ns = t->sched_time + nsleft; @@ -1159,12 +1173,13 @@ static void check_process_timers(struct task_struct *tsk, prof_left = cputime_sub(prof_expires, utime); prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div(prof_left, nthreads); + prof_left = cputime_div_non_zero(prof_left, nthreads); virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div(virt_left, nthreads); + virt_left = cputime_div_non_zero(virt_left, nthreads); if (sched_expires) { sched_left = sched_expires - sched_time; do_div(sched_left, nthreads); + sched_left = max_t(unsigned long long, sched_left, 1); } else { sched_left = 0; } @@ -1393,25 +1408,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } } -static long posix_cpu_clock_nanosleep_restart(struct restart_block *); - -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static int do_cpu_nanosleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct itimerspec *it) { - struct restart_block *restart_block = - ¤t_thread_info()->restart_block; struct k_itimer timer; int error; /* - * Diagnose required errors first. - */ - if (CPUCLOCK_PERTHREAD(which_clock) && - (CPUCLOCK_PID(which_clock) == 0 || - CPUCLOCK_PID(which_clock) == current->pid)) - return -EINVAL; - - /* * Set up a temporary timer and then wait for it to go off. */ memset(&timer, 0, sizeof timer); @@ -1422,11 +1425,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, timer.it_process = current; if (!error) { static struct itimerspec zero_it; - struct itimerspec it = { .it_value = *rqtp, - .it_interval = {} }; + + memset(it, 0, sizeof *it); + it->it_value = *rqtp; spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, &it, NULL); + error = posix_cpu_timer_set(&timer, flags, it, NULL); if (error) { spin_unlock_irq(&timer.it_lock); return error; @@ -1454,49 +1458,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - posix_cpu_timer_set(&timer, 0, &zero_it, &it); + posix_cpu_timer_set(&timer, 0, &zero_it, it); spin_unlock_irq(&timer.it_lock); - if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { /* * It actually did fire already. */ return 0; } + error = -ERESTART_RESTARTBLOCK; + } + + return error; +} + +int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) +{ + struct restart_block *restart_block = + ¤t_thread_info()->restart_block; + struct itimerspec it; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == current->pid)) + return -EINVAL; + + error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + + if (flags & TIMER_ABSTIME) + return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && !(flags & TIMER_ABSTIME) && - copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_clock_nanosleep_restart; - /* Caller already set restart_block->arg1 */ + restart_block->fn = posix_cpu_nsleep_restart; restart_block->arg0 = which_clock; restart_block->arg1 = (unsigned long) rmtp; restart_block->arg2 = rqtp->tv_sec; restart_block->arg3 = rqtp->tv_nsec; - - error = -ERESTART_RESTARTBLOCK; } - return error; } -static long -posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) +long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->arg0; struct timespec __user *rmtp; struct timespec t; + struct itimerspec it; + int error; rmtp = (struct timespec __user *) restart_block->arg1; t.tv_sec = restart_block->arg2; t.tv_nsec = restart_block->arg3; restart_block->fn = do_no_restart_syscall; - return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); + error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + /* + * Report back to the user the time still remaining. + */ + if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->fn = posix_cpu_nsleep_restart; + restart_block->arg0 = which_clock; + restart_block->arg1 = (unsigned long) rmtp; + restart_block->arg2 = t.tv_sec; + restart_block->arg3 = t.tv_nsec; + } + return error; + } @@ -1524,6 +1568,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags, { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); } +static long process_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} static int thread_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { @@ -1544,6 +1592,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags, { return -EINVAL; } +static long thread_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} static __init int init_posix_cpu_timers(void) { @@ -1553,6 +1605,7 @@ static __init int init_posix_cpu_timers(void) .clock_set = do_posix_clock_nosettime, .timer_create = process_cpu_timer_create, .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { .clock_getres = thread_cpu_clock_getres, @@ -1560,6 +1613,7 @@ static __init int init_posix_cpu_timers(void) .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, .nsleep = thread_cpu_nsleep, + .nsleep_restart = thread_cpu_nsleep_restart, }; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ac6dc8744429..5fe87de10ff0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1,5 +1,5 @@ /* - * linux/kernel/posix_timers.c + * linux/kernel/posix-timers.c * * * 2002-10-15 Posix Clocks & timers @@ -70,7 +70,7 @@ /* * Lets keep our timers in a slab cache :-) */ -static kmem_cache_t *posix_timers_cache; +static struct kmem_cache *posix_timers_cache; static struct idr posix_timers_id; static DEFINE_SPINLOCK(idr_lock); @@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags, return CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t, rmtp)); } + +/* + * nanosleep_restart for monotonic and realtime clocks + */ +static int common_nsleep_restart(struct restart_block *restart_block) +{ + return hrtimer_nanosleep_restart(restart_block); +} + +/* + * This will restart clock_nanosleep. This is required only by + * compat_clock_nanosleep_restart for now. + */ +long +clock_nanosleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->arg0; + + return CLOCK_DISPATCH(which_clock, nsleep_restart, + (restart_block)); +} diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 825068ca3479..710ed084e7c5 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED config SOFTWARE_SUSPEND bool "Software Suspend" - depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) + depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need ACPI or APM. diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d72234942798..0b00f56c2ad0 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -18,7 +18,9 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/pm.h> +#include <linux/console.h> #include <linux/cpu.h> +#include <linux/freezer.h> #include "power.h" @@ -26,6 +28,23 @@ static int noresume = 0; char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; +sector_t swsusp_resume_block; + +/** + * platform_prepare - prepare the machine for hibernation using the + * platform driver if so configured and return an error code if it fails + */ + +static inline int platform_prepare(void) +{ + int error = 0; + + if (pm_disk_mode == PM_DISK_PLATFORM) { + if (pm_ops && pm_ops->prepare) + error = pm_ops->prepare(PM_SUSPEND_DISK); + } + return error; +} /** * power_down - Shut machine down for hibernate. @@ -39,12 +58,10 @@ dev_t swsusp_resume_device; static void power_down(suspend_disk_method_t mode) { - int error = 0; - switch(mode) { case PM_DISK_PLATFORM: kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = pm_ops->enter(PM_SUSPEND_DISK); + pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: kernel_power_off(); @@ -70,7 +87,7 @@ static inline void platform_finish(void) static int prepare_processes(void) { - int error; + int error = 0; pm_prepare_console(); @@ -83,12 +100,24 @@ static int prepare_processes(void) goto thaw; } + if (pm_disk_mode == PM_DISK_TESTPROC) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto thaw; + } + + error = platform_prepare(); + if (error) + goto thaw; + /* Free memory before shutting down devices. */ if (!(error = swsusp_shrink_memory())) return 0; -thaw: + + platform_finish(); + thaw: thaw_processes(); -enable_cpus: + enable_cpus: enable_nonboot_cpus(); pm_restore_console(); return error; @@ -119,11 +148,21 @@ int pm_suspend_disk(void) if (error) return error; + if (pm_disk_mode == PM_DISK_TESTPROC) + return 0; + + suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { + resume_console(); printk("Some devices failed to suspend\n"); - unprepare_processes(); - return error; + goto Thaw; + } + + if (pm_disk_mode == PM_DISK_TEST) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto Done; } pr_debug("PM: snapshotting memory.\n"); @@ -133,21 +172,24 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); + resume_console(); pr_debug("PM: writing image.\n"); error = swsusp_write(); if (!error) power_down(pm_disk_mode); else { swsusp_free(); - unprepare_processes(); - return error; + goto Thaw; } - } else + } else { pr_debug("PM: Image restored successfully.\n"); + } swsusp_free(); Done: device_resume(); + resume_console(); + Thaw: unprepare_processes(); return error; } @@ -169,10 +211,10 @@ static int software_resume(void) { int error; - down(&pm_sem); + mutex_lock(&pm_mutex); if (!swsusp_resume_device) { if (!strlen(resume_file)) { - up(&pm_sem); + mutex_unlock(&pm_mutex); return -ENOENT; } swsusp_resume_device = name_to_dev_t(resume_file); @@ -187,7 +229,7 @@ static int software_resume(void) * FIXME: If noresume is specified, we need to find the partition * and reset it back to normal swap space. */ - up(&pm_sem); + mutex_unlock(&pm_mutex); return 0; } @@ -212,7 +254,9 @@ static int software_resume(void) pr_debug("PM: Preparing devices for restore.\n"); + suspend_console(); if ((error = device_suspend(PMSG_PRETHAW))) { + resume_console(); printk("Some devices failed to suspend\n"); swsusp_free(); goto Thaw; @@ -224,11 +268,12 @@ static int software_resume(void) swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); device_resume(); + resume_console(); Thaw: unprepare_processes(); Done: /* For success case, the suspend path will release the lock */ - up(&pm_sem); + mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return 0; } @@ -241,6 +286,8 @@ static const char * const pm_disk_modes[] = { [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", [PM_DISK_REBOOT] = "reboot", + [PM_DISK_TEST] = "test", + [PM_DISK_TESTPROC] = "testproc", }; /** @@ -287,7 +334,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) p = memchr(buf, '\n', n); len = p ? p - buf : n; - down(&pm_sem); + mutex_lock(&pm_mutex); for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { if (!strncmp(buf, pm_disk_modes[i], len)) { mode = i; @@ -295,21 +342,23 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) } } if (mode) { - if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) + if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT || + mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) { pm_disk_mode = mode; - else { + } else { if (pm_ops && pm_ops->enter && (mode == pm_ops->pm_disk_mode)) pm_disk_mode = mode; else error = -EINVAL; } - } else + } else { error = -EINVAL; + } pr_debug("PM: suspend-to-disk mode set to '%s'\n", pm_disk_modes[mode]); - up(&pm_sem); + mutex_unlock(&pm_mutex); return error ? error : n; } @@ -334,14 +383,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) if (maj != MAJOR(res) || min != MINOR(res)) goto out; - down(&pm_sem); + mutex_lock(&pm_mutex); swsusp_resume_device = res; - up(&pm_sem); + mutex_unlock(&pm_mutex); printk("Attempting manual resume\n"); noresume = 0; software_resume(); ret = n; -out: + out: return ret; } @@ -396,6 +445,19 @@ static int __init resume_setup(char *str) return 1; } +static int __init resume_offset_setup(char *str) +{ + unsigned long long offset; + + if (noresume) + return 1; + + if (sscanf(str, "%llu", &offset) == 1) + swsusp_resume_block = offset; + + return 1; +} + static int __init noresume_setup(char *str) { noresume = 1; @@ -403,4 +465,5 @@ static int __init noresume_setup(char *str) } __setup("noresume", noresume_setup); +__setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 873228c71dab..500eb87f643d 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,6 +8,7 @@ * */ +#include <linux/module.h> #include <linux/suspend.h> #include <linux/kobject.h> #include <linux/string.h> @@ -18,13 +19,14 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/resume-trace.h> +#include <linux/freezer.h> #include "power.h" /*This is just an arbitrary number */ #define FREE_PAGE_NUMBER (100) -DECLARE_MUTEX(pm_sem); +DEFINE_MUTEX(pm_mutex); struct pm_ops *pm_ops; suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; @@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; void pm_set_ops(struct pm_ops * ops) { - down(&pm_sem); + mutex_lock(&pm_mutex); pm_ops = ops; - up(&pm_sem); + mutex_unlock(&pm_mutex); } @@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state) if (!valid_state(state)) return -ENODEV; - if (down_trylock(&pm_sem)) + if (!mutex_trylock(&pm_mutex)) return -EBUSY; if (state == PM_SUSPEND_DISK) { @@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state) pr_debug("PM: Finishing wakeup.\n"); suspend_finish(state); Unlock: - up(&pm_sem); + mutex_unlock(&pm_mutex); return error; } @@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state) return -EINVAL; } - +EXPORT_SYMBOL(pm_suspend); decl_subsys(power,NULL,NULL); diff --git a/kernel/power/power.h b/kernel/power/power.h index bfe999f7b272..eb461b816bf4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void) return -EPERM; } #endif -extern struct semaphore pm_sem; + +extern struct mutex pm_mutex; + #define power_attr(_name) \ static struct subsys_attribute _name##_attr = { \ .attr = { \ @@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end; extern unsigned long image_size; extern int in_suspend; extern dev_t swsusp_resume_device; +extern sector_t swsusp_resume_block; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); @@ -102,8 +105,18 @@ struct snapshot_handle { extern unsigned int snapshot_additional_pages(struct zone *zone); extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); -extern void snapshot_free_unused_memory(struct snapshot_handle *handle); + +/* + * This structure is used to pass the values needed for the identification + * of the resume swap area from a user space to the kernel via the + * SNAPSHOT_SET_SWAP_AREA ioctl + */ +struct resume_swap_area { + loff_t offset; + u_int32_t dev; +} __attribute__((packed)); #define SNAPSHOT_IOC_MAGIC '3' #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) @@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle); #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) -#define SNAPSHOT_IOC_MAXNR 11 +#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) +#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \ + struct resume_swap_area) +#define SNAPSHOT_IOC_MAXNR 13 + +#define PMOPS_PREPARE 1 +#define PMOPS_ENTER 2 +#define PMOPS_FINISH 3 /** * The bitmap is used for tracing allocated swap pages @@ -141,7 +161,7 @@ struct bitmap_page { extern void free_bitmap(struct bitmap_page *bitmap); extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); -extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); +extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap); extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); extern int swsusp_check(void); @@ -153,3 +173,7 @@ extern int swsusp_read(void); extern int swsusp_write(void); extern void swsusp_close(void); extern int suspend_enter(suspend_state_t state); + +struct timeval; +extern void swsusp_show_speed(struct timeval *, struct timeval *, + unsigned int, char *); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7a4144ba3afd..678ec736076b 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -16,15 +16,14 @@ * callback we use. */ -static void do_poweroff(void *dummy) +static void do_poweroff(struct work_struct *dummy) { kernel_power_off(); } -static DECLARE_WORK(poweroff_work, do_poweroff, NULL); +static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key, struct pt_regs *pt_regs, - struct tty_struct *tty) +static void handle_poweroff(int key, struct tty_struct *tty) { schedule_work(&poweroff_work); } diff --git a/kernel/power/process.c b/kernel/power/process.c index 72e72d2c61e6..99eeb119b06d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -13,12 +13,15 @@ #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/freezer.h> /* * Timeout for stopping processes */ #define TIMEOUT (20 * HZ) +#define FREEZER_KERNEL_THREADS 0 +#define FREEZER_USER_SPACE 1 static inline int freezeable(struct task_struct * p) { @@ -39,7 +42,6 @@ void refrigerator(void) long save; save = current->state; pr_debug("%s entered refrigerator\n", current->comm); - printk("="); frozen_process(current); spin_lock_irq(¤t->sighand->siglock); @@ -79,96 +81,136 @@ static void cancel_freezing(struct task_struct *p) } } -/* 0 = success, else # of processes that we failed to stop */ -int freeze_processes(void) +static inline int is_user_space(struct task_struct *p) +{ + return p->mm && !(p->flags & PF_BORROWED_MM); +} + +static unsigned int try_to_freeze_tasks(int freeze_user_space) { - int todo, nr_user, user_frozen; - unsigned long start_time; struct task_struct *g, *p; + unsigned long end_time; + unsigned int todo; - printk( "Stopping tasks: " ); - start_time = jiffies; - user_frozen = 0; + end_time = jiffies + TIMEOUT; do { - nr_user = todo = 0; + todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { if (!freezeable(p)) continue; + if (frozen(p)) continue; - if (p->state == TASK_TRACED && frozen(p->parent)) { + + if (p->state == TASK_TRACED && + (frozen(p->parent) || + p->parent->state == TASK_STOPPED)) { cancel_freezing(p); continue; } - if (p->mm && !(p->flags & PF_BORROWED_MM)) { - /* The task is a user-space one. - * Freeze it unless there's a vfork completion - * pending + if (is_user_space(p)) { + if (!freeze_user_space) + continue; + + /* Freeze the task unless there is a vfork + * completion pending */ if (!p->vfork_done) freeze_process(p); - nr_user++; } else { - /* Freeze only if the user space is frozen */ - if (user_frozen) - freeze_process(p); - todo++; + if (freeze_user_space) + continue; + + freeze_process(p); } + todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); - todo += nr_user; - if (!user_frozen && !nr_user) { - sys_sync(); - start_time = jiffies; - } - user_frozen = !nr_user; yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, start_time + TIMEOUT)) + if (todo && time_after(jiffies, end_time)) break; - } while(todo); + } while (todo); - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. - */ if (todo) { - printk( "\n" ); - printk(KERN_ERR " stopping tasks timed out " - "after %d seconds (%d tasks remaining):\n", - TIMEOUT / HZ, todo); + /* This does not unfreeze processes that are already frozen + * (we have slightly ugly calling convention in that respect, + * and caller must call thaw_processes() if something fails), + * but it cleans up leftover PF_FREEZE requests. + */ + printk("\n"); + printk(KERN_ERR "Stopping %s timed out after %d seconds " + "(%d tasks refusing to freeze):\n", + freeze_user_space ? "user space processes" : + "kernel threads", + TIMEOUT / HZ, todo); read_lock(&tasklist_lock); do_each_thread(g, p) { + if (is_user_space(p) == !freeze_user_space) + continue; + if (freezeable(p) && !frozen(p)) - printk(KERN_ERR " %s\n", p->comm); + printk(KERN_ERR " %s\n", p->comm); + cancel_freezing(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); - return todo; } - printk( "|\n" ); + return todo; +} + +/** + * freeze_processes - tell processes to enter the refrigerator + * + * Returns 0 on success, or the number of processes that didn't freeze, + * although they were told to. + */ +int freeze_processes(void) +{ + unsigned int nr_unfrozen; + + printk("Stopping tasks ... "); + nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); + if (nr_unfrozen) + return nr_unfrozen; + + sys_sync(); + nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); + if (nr_unfrozen) + return nr_unfrozen; + + printk("done.\n"); BUG_ON(in_atomic()); return 0; } -void thaw_processes(void) +static void thaw_tasks(int thaw_user_space) { struct task_struct *g, *p; - printk( "Restarting tasks..." ); read_lock(&tasklist_lock); do_each_thread(g, p) { if (!freezeable(p)) continue; + + if (is_user_space(p) == !thaw_user_space) + continue; + if (!thaw_process(p)) - printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); + printk(KERN_WARNING " Strange, %s not stopped\n", + p->comm ); } while_each_thread(g, p); - read_unlock(&tasklist_lock); +} + +void thaw_processes(void) +{ + printk("Restarting tasks ... "); + thaw_tasks(FREEZER_KERNEL_THREADS); + thaw_tasks(FREEZER_USER_SPACE); schedule(); - printk( " done\n" ); + printk("done.\n"); } EXPORT_SYMBOL(refrigerator); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1b84313cbab5..c024606221c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,15 +1,15 @@ /* * linux/kernel/power/snapshot.c * - * This file provide system snapshot/restore functionality. + * This file provides system snapshot/restore functionality for swsusp. * * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * - * This file is released under the GPLv2, and is based on swsusp.c. + * This file is released under the GPLv2. * */ - #include <linux/version.h> #include <linux/module.h> #include <linux/mm.h> @@ -34,137 +34,24 @@ #include "power.h" -/* List of PBEs used for creating and restoring the suspend image */ +/* List of PBEs needed for restoring the pages that were allocated before + * the suspend and included in the suspend image, but have also been + * allocated by the "resume" kernel, so their contents cannot be written + * directly to their "original" page frames. + */ struct pbe *restore_pblist; -static unsigned int nr_copy_pages; -static unsigned int nr_meta_pages; +/* Pointer to an auxiliary buffer (1 page) */ static void *buffer; -#ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - unsigned int n = 0; - - for_each_zone (zone) - if (is_highmem(zone)) { - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { - struct page *page; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - if (PageReserved(page)) - continue; - if (PageNosaveFree(page)) - continue; - n++; - } - } - return n; -} - -struct highmem_page { - char *data; - struct page *page; - struct highmem_page *next; -}; - -static struct highmem_page *highmem_copy; - -static int save_highmem_zone(struct zone *zone) -{ - unsigned long zone_pfn; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - struct page *page; - struct highmem_page *save; - void *kaddr; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - - if (!(pfn%10000)) - printk("."); - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - /* - * This condition results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. This should be - * corrected eventually when the cases giving rise to this - * are better understood. - */ - if (PageReserved(page)) - continue; - BUG_ON(PageNosave(page)); - if (PageNosaveFree(page)) - continue; - save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); - if (!save) - return -ENOMEM; - save->next = highmem_copy; - save->page = page; - save->data = (void *) get_zeroed_page(GFP_ATOMIC); - if (!save->data) { - kfree(save); - return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); - memcpy(save->data, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - highmem_copy = save; - } - return 0; -} - -int save_highmem(void) -{ - struct zone *zone; - int res = 0; - - pr_debug("swsusp: Saving Highmem"); - drain_local_pages(); - for_each_zone (zone) { - if (is_highmem(zone)) - res = save_highmem_zone(zone); - if (res) - return res; - } - printk("\n"); - return 0; -} - -int restore_highmem(void) -{ - printk("swsusp: Restoring Highmem\n"); - while (highmem_copy) { - struct highmem_page *save = highmem_copy; - void *kaddr; - highmem_copy = save->next; - - kaddr = kmap_atomic(save->page, KM_USER0); - memcpy(kaddr, save->data, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - free_page((long) save->data); - kfree(save); - } - return 0; -} -#else -static inline unsigned int count_highmem_pages(void) {return 0;} -static inline int save_highmem(void) {return 0;} -static inline int restore_highmem(void) {return 0;} -#endif - /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * used before suspend. + * used before suspend. The unsafe pages have PageNosaveFree set + * and we count them using unsafe_pages. * - * The unsafe pages are marked with the PG_nosave_free flag - * and we count them using unsafe_pages + * Each allocated image page is marked as PageNosave and PageNosaveFree + * so that swsusp_free() can release it. */ #define PG_ANY 0 @@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;} static unsigned int allocated_unsafe_pages; -static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) +static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; @@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) unsigned long get_safe_page(gfp_t gfp_mask) { - return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); + return (unsigned long)get_image_page(gfp_mask, PG_SAFE); +} + +static struct page *alloc_image_page(gfp_t gfp_mask) +{ + struct page *page; + + page = alloc_page(gfp_mask); + if (page) { + SetPageNosave(page); + SetPageNosaveFree(page); + } + return page; } /** * free_image_page - free page represented by @addr, allocated with - * alloc_image_page (page flags set by it must be cleared) + * get_image_page (page flags set by it must be cleared) */ static inline void free_image_page(void *addr, int clear_nosave_free) { - ClearPageNosave(virt_to_page(addr)); + struct page *page; + + BUG_ON(!virt_addr_valid(addr)); + + page = virt_to_page(addr); + + ClearPageNosave(page); if (clear_nosave_free) - ClearPageNosaveFree(virt_to_page(addr)); - free_page((unsigned long)addr); + ClearPageNosaveFree(page); + + __free_page(page); } /* struct linked_page is used to build chains of pages */ @@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { struct linked_page *lp; - lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); + lp = get_image_page(ca->gfp_mask, ca->safe_needed); if (!lp) return NULL; @@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) /* Compute the number of zones */ nr = 0; - for_each_zone (zone) - if (populated_zone(zone) && !is_highmem(zone)) + for_each_zone(zone) + if (populated_zone(zone)) nr++; /* Allocate the list of zones bitmap objects */ @@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } /* Initialize the zone bitmap objects */ - for_each_zone (zone) { + for_each_zone(zone) { unsigned long pfn; - if (!populated_zone(zone) || is_highmem(zone)) + if (!populated_zone(zone)) continue; zone_bm->start_pfn = zone->zone_start_pfn; @@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) while (bb) { unsigned long *ptr; - ptr = alloc_image_page(gfp_mask, safe_needed); + ptr = get_image_page(gfp_mask, safe_needed); bb->data = ptr; if (!ptr) goto Free; @@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) memory_bm_position_reset(bm); return 0; -Free: + Free: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); return -ENOMEM; @@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) memory_bm_position_reset(bm); return BM_END_OF_MAP; -Return_pfn: + Return_pfn: bm->cur.chunk = chunk; bm->cur.bit = bit; return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; @@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone) res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); - return res; + return 2 * res; +} + +#ifdef CONFIG_HIGHMEM +/** + * count_free_highmem_pages - compute the total number of free highmem + * pages, system-wide. + */ + +static unsigned int count_free_highmem_pages(void) +{ + struct zone *zone; + unsigned int cnt = 0; + + for_each_zone(zone) + if (populated_zone(zone) && is_highmem(zone)) + cnt += zone->free_pages; + + return cnt; +} + +/** + * saveable_highmem_page - Determine whether a highmem page should be + * included in the suspend image. + * + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't a part of a free chunk of pages. + */ + +static struct page *saveable_highmem_page(unsigned long pfn) +{ + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + + BUG_ON(!PageHighMem(page)); + + if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page)) + return NULL; + + return page; } /** + * count_highmem_pages - compute the total number of saveable highmem + * pages. + */ + +unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned int n = 0; + + for_each_zone(zone) { + unsigned long pfn, max_zone_pfn; + + if (!is_highmem(zone)) + continue; + + mark_free_pages(zone); + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (saveable_highmem_page(pfn)) + n++; + } + return n; +} +#else +static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline unsigned int count_highmem_pages(void) { return 0; } +#endif /* CONFIG_HIGHMEM */ + +/** * pfn_is_nosave - check if given pfn is in the 'nosave' section */ @@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn) } /** - * saveable - Determine whether a page should be cloned or not. - * @pfn: The page + * saveable - Determine whether a non-highmem page should be included in + * the suspend image. * - * We save a page if it isn't Nosave, and is not in the range of pages - * statically defined as 'unsaveable', and it - * isn't a part of a free chunk of pages. + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't a part of + * a free chunk of pages. */ static struct page *saveable_page(unsigned long pfn) @@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn) page = pfn_to_page(pfn); - if (PageNosave(page)) + BUG_ON(PageHighMem(page)); + + if (PageNosave(page) || PageNosaveFree(page)) return NULL; + if (PageReserved(page) && pfn_is_nosave(pfn)) return NULL; - if (PageNosaveFree(page)) - return NULL; return page; } +/** + * count_data_pages - compute the total number of saveable non-highmem + * pages. + */ + unsigned int count_data_pages(void) { struct zone *zone; unsigned long pfn, max_zone_pfn; unsigned int n = 0; - for_each_zone (zone) { + for_each_zone(zone) { if (is_highmem(zone)) continue; + mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - n += !!saveable_page(pfn); + if(saveable_page(pfn)) + n++; } return n; } -static inline void copy_data_page(long *dst, long *src) +/* This is needed, because copy_page and memcpy are not usable for copying + * task structs. + */ +static inline void do_copy_page(long *dst, long *src) { int n; - /* copy_page and memcpy are not usable for copying task structs. */ for (n = PAGE_SIZE / sizeof(long); n; n--) *dst++ = *src++; } +#ifdef CONFIG_HIGHMEM +static inline struct page * +page_is_saveable(struct zone *zone, unsigned long pfn) +{ + return is_highmem(zone) ? + saveable_highmem_page(pfn) : saveable_page(pfn); +} + +static inline void +copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + struct page *s_page, *d_page; + void *src, *dst; + + s_page = pfn_to_page(src_pfn); + d_page = pfn_to_page(dst_pfn); + if (PageHighMem(s_page)) { + src = kmap_atomic(s_page, KM_USER0); + dst = kmap_atomic(d_page, KM_USER1); + do_copy_page(dst, src); + kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst, KM_USER1); + } else { + src = page_address(s_page); + if (PageHighMem(d_page)) { + /* Page pointed to by src may contain some kernel + * data modified by kmap_atomic() + */ + do_copy_page(buffer, src); + dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); + memcpy(dst, buffer, PAGE_SIZE); + kunmap_atomic(dst, KM_USER0); + } else { + dst = page_address(d_page); + do_copy_page(dst, src); + } + } +} +#else +#define page_is_saveable(zone, pfn) saveable_page(pfn) + +static inline void +copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + do_copy_page(page_address(pfn_to_page(dst_pfn)), + page_address(pfn_to_page(src_pfn))); +} +#endif /* CONFIG_HIGHMEM */ + static void copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) { struct zone *zone; unsigned long pfn; - for_each_zone (zone) { + for_each_zone(zone) { unsigned long max_zone_pfn; - if (is_highmem(zone)) - continue; - mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_page(pfn)) + if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); do { pfn = memory_bm_next_pfn(orig_bm); - if (likely(pfn != BM_END_OF_MAP)) { - struct page *page; - void *src; - - page = pfn_to_page(pfn); - src = page_address(page); - page = pfn_to_page(memory_bm_next_pfn(copy_bm)); - copy_data_page(page_address(page), src); - } + if (likely(pfn != BM_END_OF_MAP)) + copy_data_page(memory_bm_next_pfn(copy_bm), pfn); } while (pfn != BM_END_OF_MAP); } +/* Total number of image pages */ +static unsigned int nr_copy_pages; +/* Number of pages needed for saving the original pfns of the image pages */ +static unsigned int nr_meta_pages; + /** * swsusp_free - free pages allocated for the suspend. * @@ -792,7 +824,7 @@ void swsusp_free(void) if (PageNosave(page) && PageNosaveFree(page)) { ClearPageNosave(page); ClearPageNosaveFree(page); - free_page((long) page_address(page)); + __free_page(page); } } } @@ -802,34 +834,108 @@ void swsusp_free(void) buffer = NULL; } +#ifdef CONFIG_HIGHMEM +/** + * count_pages_for_highmem - compute the number of non-highmem pages + * that will be necessary for creating copies of highmem pages. + */ + +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) +{ + unsigned int free_highmem = count_free_highmem_pages(); + + if (free_highmem >= nr_highmem) + nr_highmem = 0; + else + nr_highmem -= free_highmem; + + return nr_highmem; +} +#else +static unsigned int +count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +#endif /* CONFIG_HIGHMEM */ /** - * enough_free_mem - Make sure we enough free memory to snapshot. - * - * Returns TRUE or FALSE after checking the number of available - * free pages. + * enough_free_mem - Make sure we have enough free memory for the + * snapshot image. */ -static int enough_free_mem(unsigned int nr_pages) +static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; unsigned int free = 0, meta = 0; - for_each_zone (zone) - if (!is_highmem(zone)) { + for_each_zone(zone) { + meta += snapshot_additional_pages(zone); + if (!is_highmem(zone)) free += zone->free_pages; - meta += snapshot_additional_pages(zone); - } + } - pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", + nr_pages += count_pages_for_highmem(nr_highmem); + pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n", nr_pages, PAGES_FOR_IO, meta, free); return free > nr_pages + PAGES_FOR_IO + meta; } +#ifdef CONFIG_HIGHMEM +/** + * get_highmem_buffer - if there are some highmem pages in the suspend + * image, we may need the buffer to copy them and/or load their data. + */ + +static inline int get_highmem_buffer(int safe_needed) +{ + buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + return buffer ? 0 : -ENOMEM; +} + +/** + * alloc_highmem_image_pages - allocate some highmem pages for the image. + * Try to allocate as many pages as needed, but if the number of free + * highmem pages is lesser than that, allocate them all. + */ + +static inline unsigned int +alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +{ + unsigned int to_alloc = count_free_highmem_pages(); + + if (to_alloc > nr_highmem) + to_alloc = nr_highmem; + + nr_highmem -= to_alloc; + while (to_alloc-- > 0) { + struct page *page; + + page = alloc_image_page(__GFP_HIGHMEM); + memory_bm_set_bit(bm, page_to_pfn(page)); + } + return nr_highmem; +} +#else +static inline int get_highmem_buffer(int safe_needed) { return 0; } + +static inline unsigned int +alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +#endif /* CONFIG_HIGHMEM */ + +/** + * swsusp_alloc - allocate memory for the suspend image + * + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system. If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones. + * + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works. + */ + static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, - unsigned int nr_pages) + unsigned int nr_pages, unsigned int nr_highmem) { int error; @@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, if (error) goto Free; + if (nr_highmem > 0) { + error = get_highmem_buffer(PG_ANY); + if (error) + goto Free; + + nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); + } while (nr_pages-- > 0) { - struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); + struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + if (!page) goto Free; - SetPageNosave(page); - SetPageNosaveFree(page); memory_bm_set_bit(copy_bm, page_to_pfn(page)); } return 0; -Free: + Free: swsusp_free(); return -ENOMEM; } -/* Memory bitmap used for marking saveable pages */ +/* Memory bitmap used for marking saveable pages (during suspend) or the + * suspend image pages (during resume) + */ static struct memory_bitmap orig_bm; -/* Memory bitmap used for marking allocated pages that will contain the copies - * of saveable pages +/* Memory bitmap used on suspend for marking allocated pages that will contain + * the copies of saveable pages. During resume it is initially used for + * marking the suspend image pages, but then its set bits are duplicated in + * @orig_bm and it is released. Next, on systems with high memory, it may be + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. */ static struct memory_bitmap copy_bm; asmlinkage int swsusp_save(void) { - unsigned int nr_pages; + unsigned int nr_pages, nr_highmem; - pr_debug("swsusp: critical section: \n"); + printk("swsusp: critical section: \n"); drain_local_pages(); nr_pages = count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_pages); + nr_highmem = count_highmem_pages(); + printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem); - if (!enough_free_mem(nr_pages)) { + if (!enough_free_mem(nr_pages, nr_highmem)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; } - if (swsusp_alloc(&orig_bm, ©_bm, nr_pages)) + if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { + printk(KERN_ERR "swsusp: Memory allocation failed\n"); return -ENOMEM; + } /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. @@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void) * touch swap space! Except we must write out our image of course. */ + nr_pages += nr_highmem; nr_copy_pages = nr_pages; - nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + return 0; } @@ -906,7 +1029,7 @@ static void init_header(struct swsusp_info *info) memset(info, 0, sizeof(struct swsusp_info)); info->version_code = LINUX_VERSION_CODE; info->num_physpages = num_physpages; - memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); info->cpus = num_online_cpus(); info->image_pages = nr_copy_pages; info->pages = nr_copy_pages + nr_meta_pages + 1; @@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!buffer) { /* This makes the buffer be freed by swsusp_free() */ - buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); + buffer = get_image_page(GFP_ATOMIC, PG_ANY); if (!buffer) return -ENOMEM; } @@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) memset(buffer, 0, PAGE_SIZE); pack_pfns(buffer, &orig_bm); } else { - unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page; - handle->buffer = page_address(pfn_to_page(pfn)); + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). + */ + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buffer, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); + } } handle->prev = handle->cur; } @@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) unsigned long pfn, max_zone_pfn; /* Clear page flags */ - for_each_zone (zone) { + for_each_zone(zone) { max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) @@ -1050,13 +1187,13 @@ static inline int check_header(struct swsusp_info *info) reason = "kernel version"; if (info->num_physpages != num_physpages) reason = "memory size"; - if (strcmp(info->uts.sysname,system_utsname.sysname)) + if (strcmp(info->uts.sysname,init_utsname()->sysname)) reason = "system type"; - if (strcmp(info->uts.release,system_utsname.release)) + if (strcmp(info->uts.release,init_utsname()->release)) reason = "kernel release"; - if (strcmp(info->uts.version,system_utsname.version)) + if (strcmp(info->uts.version,init_utsname()->version)) reason = "version"; - if (strcmp(info->uts.machine,system_utsname.machine)) + if (strcmp(info->uts.machine,init_utsname()->machine)) reason = "machine"; if (reason) { printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); @@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) } } +/* List of "safe" pages that may be used to store data loaded from the suspend + * image + */ +static struct linked_page *safe_pages_list; + +#ifdef CONFIG_HIGHMEM +/* struct highmem_pbe is used for creating the list of highmem pages that + * should be restored atomically during the resume from disk, because the page + * frames they have occupied before the suspend are in use. + */ +struct highmem_pbe { + struct page *copy_page; /* data is here now */ + struct page *orig_page; /* data was here before the suspend */ + struct highmem_pbe *next; +}; + +/* List of highmem PBEs needed for restoring the highmem pages that were + * allocated before the suspend and included in the suspend image, but have + * also been allocated by the "resume" kernel, so their contents cannot be + * written directly to their "original" page frames. + */ +static struct highmem_pbe *highmem_pblist; + +/** + * count_highmem_image_pages - compute the number of highmem pages in the + * suspend image. The bits in the memory bitmap @bm that correspond to the + * image pages are assumed to be set. + */ + +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) +{ + unsigned long pfn; + unsigned int cnt = 0; + + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (PageHighMem(pfn_to_page(pfn))) + cnt++; + + pfn = memory_bm_next_pfn(bm); + } + return cnt; +} + +/** + * prepare_highmem_image - try to allocate as many highmem pages as + * there are highmem image pages (@nr_highmem_p points to the variable + * containing the number of highmem image pages). The pages that are + * "safe" (ie. will not be overwritten when the suspend image is + * restored) have the corresponding bits set in @bm (it must be + * unitialized). + * + * NOTE: This function should not be called if there are no highmem + * image pages. + */ + +static unsigned int safe_highmem_pages; + +static struct memory_bitmap *safe_highmem_bm; + +static int +prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +{ + unsigned int to_alloc; + + if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE)) + return -ENOMEM; + + if (get_highmem_buffer(PG_SAFE)) + return -ENOMEM; + + to_alloc = count_free_highmem_pages(); + if (to_alloc > *nr_highmem_p) + to_alloc = *nr_highmem_p; + else + *nr_highmem_p = to_alloc; + + safe_highmem_pages = 0; + while (to_alloc-- > 0) { + struct page *page; + + page = alloc_page(__GFP_HIGHMEM); + if (!PageNosaveFree(page)) { + /* The page is "safe", set its bit the bitmap */ + memory_bm_set_bit(bm, page_to_pfn(page)); + safe_highmem_pages++; + } + /* Mark the page as allocated */ + SetPageNosave(page); + SetPageNosaveFree(page); + } + memory_bm_position_reset(bm); + safe_highmem_bm = bm; + return 0; +} + +/** + * get_highmem_page_buffer - for given highmem image page find the buffer + * that suspend_write_next() should set for its caller to write to. + * + * If the page is to be saved to its "original" page frame or a copy of + * the page is to be made in the highmem, @buffer is returned. Otherwise, + * the copy of the page is to be made in normal memory, so the address of + * the copy is returned. + * + * If @buffer is returned, the caller of suspend_write_next() will write + * the page's contents to @buffer, so they will have to be copied to the + * right location on the next call to suspend_write_next() and it is done + * with the help of copy_last_highmem_page(). For this purpose, if + * @buffer is returned, @last_highmem page is set to the page to which + * the data will have to be copied from @buffer. + */ + +static struct page *last_highmem_page; + +static void * +get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +{ + struct highmem_pbe *pbe; + void *kaddr; + + if (PageNosave(page) && PageNosaveFree(page)) { + /* We have allocated the "original" page frame and we can + * use it directly to store the loaded page. + */ + last_highmem_page = page; + return buffer; + } + /* The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the loaded page. + */ + pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); + if (!pbe) { + swsusp_free(); + return NULL; + } + pbe->orig_page = page; + if (safe_highmem_pages > 0) { + struct page *tmp; + + /* Copy of the page will be stored in high memory */ + kaddr = buffer; + tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); + safe_highmem_pages--; + last_highmem_page = tmp; + pbe->copy_page = tmp; + } else { + /* Copy of the page will be stored in normal memory */ + kaddr = safe_pages_list; + safe_pages_list = safe_pages_list->next; + pbe->copy_page = virt_to_page(kaddr); + } + pbe->next = highmem_pblist; + highmem_pblist = pbe; + return kaddr; +} + +/** + * copy_last_highmem_page - copy the contents of a highmem image from + * @buffer, where the caller of snapshot_write_next() has place them, + * to the right location represented by @last_highmem_page . + */ + +static void copy_last_highmem_page(void) +{ + if (last_highmem_page) { + void *dst; + + dst = kmap_atomic(last_highmem_page, KM_USER0); + memcpy(dst, buffer, PAGE_SIZE); + kunmap_atomic(dst, KM_USER0); + last_highmem_page = NULL; + } +} + +static inline int last_highmem_page_copied(void) +{ + return !last_highmem_page; +} + +static inline void free_highmem_data(void) +{ + if (safe_highmem_bm) + memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR); + + if (buffer) + free_image_page(buffer, PG_UNSAFE_CLEAR); +} +#else +static inline int get_safe_write_buffer(void) { return 0; } + +static unsigned int +count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } + +static inline int +prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +{ + return 0; +} + +static inline void * +get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +{ + return NULL; +} + +static inline void copy_last_highmem_page(void) {} +static inline int last_highmem_page_copied(void) { return 1; } +static inline void free_highmem_data(void) {} +#endif /* CONFIG_HIGHMEM */ + /** * prepare_image - use the memory bitmap @bm to mark the pages that will * be overwritten in the process of restoring the system memory state @@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) * The idea is to allocate a new memory bitmap first and then allocate * as many pages as needed for the image data, but not to assign these * pages to specific tasks initially. Instead, we just mark them as - * allocated and create a list of "safe" pages that will be used later. + * allocated and create a lists of "safe" pages that will be used + * later. On systems with high memory a list of "safe" highmem pages is + * also created. */ #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) -static struct linked_page *safe_pages_list; - static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) { - unsigned int nr_pages; + unsigned int nr_pages, nr_highmem; struct linked_page *sp_list, *lp; int error; + /* If there is no highmem, the buffer will not be necessary */ + free_image_page(buffer, PG_UNSAFE_CLEAR); + buffer = NULL; + + nr_highmem = count_highmem_image_pages(bm); error = mark_unsafe_pages(bm); if (error) goto Free; @@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) duplicate_memory_bitmap(new_bm, bm); memory_bm_free(bm, PG_UNSAFE_KEEP); + if (nr_highmem > 0) { + error = prepare_highmem_image(bm, &nr_highmem); + if (error) + goto Free; + } /* Reserve some safe pages for potential later use. * * NOTE: This way we make sure there will be enough safe pages for the @@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) */ sp_list = NULL; /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ - nr_pages = nr_copy_pages - allocated_unsafe_pages; + nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { - lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); + lp = get_image_page(GFP_ATOMIC, PG_SAFE); if (!lp) { error = -ENOMEM; goto Free; @@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } /* Preallocate memory for the image */ safe_pages_list = NULL; - nr_pages = nr_copy_pages - allocated_unsafe_pages; + nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); if (!lp) { @@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } return 0; -Free: + Free: swsusp_free(); return error; } @@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) struct pbe *pbe; struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + if (PageHighMem(page)) + return get_highmem_page_buffer(page, ca); + if (PageNosave(page) && PageNosaveFree(page)) /* We have allocated the "original" page frame and we can * use it directly to store the loaded page. @@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) swsusp_free(); return NULL; } - pbe->orig_address = (unsigned long)page_address(page); - pbe->address = (unsigned long)safe_pages_list; + pbe->orig_address = page_address(page); + pbe->address = safe_pages_list; safe_pages_list = safe_pages_list->next; pbe->next = restore_pblist; restore_pblist = pbe; - return (void *)pbe->address; + return pbe->address; } /** @@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) return 0; - if (!buffer) { - /* This makes the buffer be freed by swsusp_free() */ - buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); + if (handle->offset == 0) { + if (!buffer) + /* This makes the buffer be freed by swsusp_free() */ + buffer = get_image_page(GFP_ATOMIC, PG_ANY); + if (!buffer) return -ENOMEM; - } - if (!handle->offset) + handle->buffer = buffer; + } handle->sync_read = 1; if (handle->prev < handle->cur) { if (handle->prev == 0) { @@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return -ENOMEM; } } else { + copy_last_highmem_page(); handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; + if (handle->buffer != buffer) + handle->sync_read = 0; } handle->prev = handle->cur; } @@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return count; } +/** + * snapshot_write_finalize - must be called after the last call to + * snapshot_write_next() in case the last page in the image happens + * to be a highmem page and its contents should be stored in the + * highmem. Additionally, it releases the memory that will not be + * used any more. + */ + +void snapshot_write_finalize(struct snapshot_handle *handle) +{ + copy_last_highmem_page(); + /* Free only if we have loaded the image entirely */ + if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { + memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); + free_highmem_data(); + } +} + int snapshot_image_loaded(struct snapshot_handle *handle) { - return !(!nr_copy_pages || + return !(!nr_copy_pages || !last_highmem_page_copied() || handle->cur <= nr_meta_pages + nr_copy_pages); } -void snapshot_free_unused_memory(struct snapshot_handle *handle) +#ifdef CONFIG_HIGHMEM +/* Assumes that @buf is ready and points to a "safe" page */ +static inline void +swap_two_pages_data(struct page *p1, struct page *p2, void *buf) { - /* Free only if we have loaded the image entirely */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) - memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); + void *kaddr1, *kaddr2; + + kaddr1 = kmap_atomic(p1, KM_USER0); + kaddr2 = kmap_atomic(p2, KM_USER1); + memcpy(buf, kaddr1, PAGE_SIZE); + memcpy(kaddr1, kaddr2, PAGE_SIZE); + memcpy(kaddr2, buf, PAGE_SIZE); + kunmap_atomic(kaddr1, KM_USER0); + kunmap_atomic(kaddr2, KM_USER1); +} + +/** + * restore_highmem - for each highmem page that was allocated before + * the suspend and included in the suspend image, and also has been + * allocated by the "resume" kernel swap its current (ie. "before + * resume") contents with the previous (ie. "before suspend") one. + * + * If the resume eventually fails, we can call this function once + * again and restore the "before resume" highmem state. + */ + +int restore_highmem(void) +{ + struct highmem_pbe *pbe = highmem_pblist; + void *buf; + + if (!pbe) + return 0; + + buf = get_image_page(GFP_ATOMIC, PG_SAFE); + if (!buf) + return -ENOMEM; + + while (pbe) { + swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf); + pbe = pbe->next; + } + free_image_page(buf, PG_UNSAFE_CLEAR); + return 0; } +#endif /* CONFIG_HIGHMEM */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9b2ee5344dee..f133d4a6d817 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -34,34 +34,123 @@ extern char resume_file[]; #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; - swp_entry_t image; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; + sector_t image; char orig_sig[10]; char sig[10]; } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; /* - * Saving part... + * General things */ static unsigned short root_swap = 0xffff; +static struct block_device *resume_bdev; + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * @bio_chain: list of pending biod (for async reading) + * + * Straight from the textbook - allocate and initialize the bio. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait. + */ +static int submit(int rw, pgoff_t page_off, struct page *page, + struct bio **bio_chain) +{ + struct bio *bio; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + if (!bio) + return -ENOMEM; + bio->bi_sector = page_off * (PAGE_SIZE >> 9); + bio->bi_bdev = resume_bdev; + bio->bi_end_io = end_swap_bio_read; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); + bio_put(bio); + return -EFAULT; + } + + lock_page(page); + bio_get(bio); -static int mark_swapfiles(swp_entry_t start) + if (bio_chain == NULL) { + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + if (rw == READ) + get_page(page); /* These pages are freed later */ + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + } + return 0; +} + +static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(READ, page_off, virt_to_page(addr), bio_chain); +} + +static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(WRITE, page_off, virt_to_page(addr), bio_chain); +} + +static int wait_on_bio_chain(struct bio **bio_chain) +{ + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + if (bio == NULL) + return 0; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} + +/* + * Saving part + */ + +static int mark_swapfiles(sector_t start) { int error; - rw_swap_page_sync(READ, swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header), NULL); + bio_read_page(swsusp_resume_block, &swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); swsusp_header.image = start; - error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header), - NULL); + error = bio_write_page(swsusp_resume_block, + &swsusp_header, NULL); } else { - pr_debug("swsusp: Partition is not swap space.\n"); + printk(KERN_ERR "swsusp: Swap header not found!\n"); error = -ENODEV; } return error; @@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start) static int swsusp_swap_check(void) /* This is called before saving image */ { - int res = swap_type_of(swsusp_resume_device); + int res; + + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + if (res < 0) + return res; + + root_swap = res; + resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE); + if (IS_ERR(resume_bdev)) + return PTR_ERR(resume_bdev); + + res = set_blocksize(resume_bdev, PAGE_SIZE); + if (res < 0) + blkdev_put(resume_bdev); - if (res >= 0) { - root_swap = res; - return 0; - } return res; } @@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */ * @bio_chain: Link the next write BIO here */ -static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) +static int write_page(void *buf, sector_t offset, struct bio **bio_chain) { - swp_entry_t entry; - int error = -ENOSPC; - - if (offset) { - struct page *page = virt_to_page(buf); - - if (bio_chain) { - /* - * Whether or not we successfully allocated a copy page, - * we take a ref on the page here. It gets undone in - * wait_on_bio_chain(). - */ - struct page *page_copy; - page_copy = alloc_page(GFP_ATOMIC); - if (page_copy == NULL) { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - get_page(page); - } else { - memcpy(page_address(page_copy), - page_address(page), PAGE_SIZE); - page = page_copy; - } + void *src; + + if (!offset) + return -ENOSPC; + + if (bio_chain) { + src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (src) { + memcpy(src, buf, PAGE_SIZE); + } else { + WARN_ON_ONCE(1); + bio_chain = NULL; /* Go synchronous */ + src = buf; } - entry = swp_entry(root_swap, offset); - error = rw_swap_page_sync(WRITE, entry, page, bio_chain); + } else { + src = buf; } - return error; + return bio_write_page(offset, src, bio_chain); } /* @@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) * at a time. */ -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) struct swap_map_page { - unsigned long entries[MAP_PAGE_ENTRIES]; - unsigned long next_swap; + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; }; /** @@ -151,7 +239,7 @@ struct swap_map_page { struct swap_map_handle { struct swap_map_page *cur; - unsigned long cur_swap; + sector_t cur_swap; struct bitmap_page *bitmap; unsigned int k; }; @@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle) handle->bitmap = NULL; } -static void show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); -} - static int get_swap_writer(struct swap_map_handle *handle) { handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); @@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle) release_swap_writer(handle); return -ENOMEM; } - handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); + handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap); if (!handle->cur_swap) { release_swap_writer(handle); return -ENOSPC; @@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle) return 0; } -static int wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} - static int swap_write_page(struct swap_map_handle *handle, void *buf, struct bio **bio_chain) { int error = 0; - unsigned long offset; + sector_t offset; if (!handle->cur) return -EINVAL; - offset = alloc_swap_page(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap, handle->bitmap); error = write_page(buf, offset, bio_chain); if (error) return error; @@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = wait_on_bio_chain(bio_chain); if (error) goto out; - offset = alloc_swap_page(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap, handle->bitmap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; @@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, handle->cur_swap = offset; handle->k = 0; } -out: + out: return error; } @@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle, error = err2; if (!error) printk("\b\b\b\bdone\n"); - show_speed(&start, &stop, nr_to_write, "Wrote"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); return error; } @@ -350,99 +390,50 @@ int swsusp_write(void) struct swsusp_info *header; int error; - if ((error = swsusp_swap_check())) { + error = swsusp_swap_check(); + if (error) { printk(KERN_ERR "swsusp: Cannot find swap device, try " "swapon -a.\n"); return error; } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot, PAGE_SIZE); - if (error < PAGE_SIZE) - return error < 0 ? error : -EFAULT; + if (error < PAGE_SIZE) { + if (error >= 0) + error = -EFAULT; + + goto out; + } header = (struct swsusp_info *)data_of(snapshot); if (!enough_swap(header->pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); - return -ENOSPC; + error = -ENOSPC; + goto out; } error = get_swap_writer(&handle); if (!error) { - unsigned long start = handle.cur_swap; + sector_t start = handle.cur_swap; + error = swap_write_page(&handle, header, NULL); if (!error) error = save_image(&handle, &snapshot, header->pages - 1); + if (!error) { flush_swap_writer(&handle); printk("S"); - error = mark_swapfiles(swp_entry(root_swap, start)); + error = mark_swapfiles(start); printk("|\n"); } } if (error) free_all_swap_pages(root_swap, handle.bitmap); release_swap_writer(&handle); + out: + swsusp_close(); return error; } -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - struct bio *bio; - - bio = bio_alloc(GFP_ATOMIC, 1); - if (!bio) - return -ENOMEM; - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - get_page(page); - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - } - return 0; -} - -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, page_off, virt_to_page(addr), bio_chain); -} - -static int bio_write_page(pgoff_t page_off, void *addr) -{ - return submit(WRITE, page_off, virt_to_page(addr), NULL); -} - /** * The following functions allow us to read data using a swap map * in a file-alike way @@ -455,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle) handle->cur = NULL; } -static int get_swap_reader(struct swap_map_handle *handle, - swp_entry_t start) +static int get_swap_reader(struct swap_map_handle *handle, sector_t start) { int error; - if (!swp_offset(start)) + if (!start) return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + + handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); if (!handle->cur) return -ENOMEM; - error = bio_read_page(swp_offset(start), handle->cur, NULL); + + error = bio_read_page(start, handle->cur, NULL); if (error) { release_swap_reader(handle); return error; @@ -477,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle, static int swap_read_page(struct swap_map_handle *handle, void *buf, struct bio **bio_chain) { - unsigned long offset; + sector_t offset; int error; if (!handle->cur) @@ -546,11 +538,11 @@ static int load_image(struct swap_map_handle *handle, error = err2; if (!error) { printk("\b\b\b\bdone\n"); - snapshot_free_unused_memory(snapshot); + snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; } - show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return error; } @@ -599,12 +591,16 @@ int swsusp_check(void) if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header, NULL))) + error = bio_read_page(swsusp_resume_block, + &swsusp_header, NULL); + if (error) return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(0, &swsusp_header); + error = bio_write_page(swsusp_resume_block, + &swsusp_header, NULL); } else { return -EINVAL; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 0b66659dc516..31aa0390c777 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -49,6 +49,7 @@ #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/highmem.h> +#include <linux/time.h> #include "power.h" @@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0; #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void); -int save_highmem(void); int restore_highmem(void); #else -static inline int save_highmem(void) { return 0; } static inline int restore_highmem(void) { return 0; } static inline unsigned int count_highmem_pages(void) { return 0; } #endif @@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) return 0; } -unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) +sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) { unsigned long offset; offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { - if (bitmap_set(bitmap, offset)) { + if (bitmap_set(bitmap, offset)) swap_free(swp_entry(swap, offset)); - offset = 0; - } + else + return swapdev_block(swap, offset); } - return offset; + return 0; } void free_all_swap_pages(int swap, struct bitmap_page *bitmap) @@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) } /** + * swsusp_show_speed - print the time elapsed between two events represented by + * @start and @stop + * + * @nr_pages - number of pages processed between @start and @stop + * @msg - introductory message to print + */ + +void swsusp_show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + s64 elapsed_centisecs64; + int centisecs; + int k; + int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + +/** * swsusp_shrink_memory - Try to free as much memory as needed * * ... but do not OOM-kill anyone @@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp) int swsusp_shrink_memory(void) { - long size, tmp; + long tmp; struct zone *zone; unsigned long pages = 0; unsigned int i = 0; char *p = "-\\|/"; + struct timeval start, stop; printk("Shrinking memory... "); + do_gettimeofday(&start); do { - size = 2 * count_highmem_pages(); - size += size / 50 + count_data_pages() + PAGES_FOR_IO; + long size, highmem_size; + + highmem_size = count_highmem_pages(); + size = count_data_pages() + PAGES_FOR_IO; tmp = size; + size += highmem_size; for_each_zone (zone) - if (!is_highmem(zone) && populated_zone(zone)) { - tmp -= zone->free_pages; - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - tmp += snapshot_additional_pages(zone); + if (populated_zone(zone)) { + if (is_highmem(zone)) { + highmem_size -= zone->free_pages; + } else { + tmp -= zone->free_pages; + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + tmp += snapshot_additional_pages(zone); + } } + + if (highmem_size < 0) + highmem_size = 0; + + tmp += highmem_size; if (tmp > 0) { tmp = __shrink_memory(tmp); if (!tmp) @@ -212,7 +253,9 @@ int swsusp_shrink_memory(void) } printk("\b%c", p[i++%4]); } while (tmp > 0); + do_gettimeofday(&stop); printk("\bdone (%lu pages freed)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Freed"); return 0; } @@ -223,6 +266,7 @@ int swsusp_suspend(void) if ((error = arch_prepare_suspend())) return error; + local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* device_power_down() now. @@ -235,23 +279,16 @@ int swsusp_suspend(void) goto Enable_irqs; } - if ((error = save_highmem())) { - printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); - goto Restore_highmem; - } - save_processor_state(); if ((error = swsusp_arch_suspend())) printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); -Restore_highmem: - restore_highmem(); /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ device_power_up(); -Enable_irqs: + Enable_irqs: local_irq_enable(); return error; } @@ -268,18 +305,23 @@ int swsusp_resume(void) printk(KERN_ERR "Some devices failed to power down, very bad\n"); /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); - error = swsusp_arch_resume(); - /* Code below is only ever reached in case of failure. Otherwise - * execution continues at place where swsusp_arch_suspend was called - */ - BUG_ON(!error); + error = restore_highmem(); + if (!error) { + error = swsusp_arch_resume(); + /* The code below is only ever reached in case of a failure. + * Otherwise execution continues at place where + * swsusp_arch_suspend() was called + */ + BUG_ON(!error); + /* This call to restore_highmem() undos the previous one */ + restore_highmem(); + } /* The only reason why swsusp_arch_resume() can fail is memory being * very tight, so we have to free it as soon as we can to avoid * subsequent failures */ swsusp_free(); restore_processor_state(); - restore_highmem(); touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 72825c853cd7..89443b85163b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -11,6 +11,7 @@ #include <linux/suspend.h> #include <linux/syscalls.h> +#include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> #include <linux/miscdevice.h> @@ -19,7 +20,9 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/fs.h> +#include <linux/console.h> #include <linux/cpu.h> +#include <linux/freezer.h> #include <asm/uaccess.h> @@ -53,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp) filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { - data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; + data->swap = swsusp_resume_device ? + swap_type_of(swsusp_resume_device, 0) : -1; data->mode = O_RDONLY; } else { data->swap = -1; @@ -75,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) free_all_swap_pages(data->swap, data->bitmap); free_bitmap(data->bitmap); if (data->frozen) { - down(&pm_sem); + mutex_lock(&pm_mutex); thaw_processes(); enable_nonboot_cpus(); - up(&pm_sem); + mutex_unlock(&pm_mutex); } atomic_inc(&device_available); return 0; @@ -123,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, { int error = 0; struct snapshot_data *data; - loff_t offset, avail; + loff_t avail; + sector_t offset; if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; @@ -139,17 +144,17 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_FREEZE: if (data->frozen) break; - down(&pm_sem); + mutex_lock(&pm_mutex); error = disable_nonboot_cpus(); if (!error) { error = freeze_processes(); if (error) { thaw_processes(); + enable_nonboot_cpus(); error = -EBUSY; } } - enable_nonboot_cpus(); - up(&pm_sem); + mutex_unlock(&pm_mutex); if (!error) data->frozen = 1; break; @@ -157,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, case SNAPSHOT_UNFREEZE: if (!data->frozen) break; - down(&pm_sem); + mutex_lock(&pm_mutex); thaw_processes(); enable_nonboot_cpus(); - up(&pm_sem); + mutex_unlock(&pm_mutex); data->frozen = 0; break; @@ -169,18 +174,20 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - down(&pm_sem); + mutex_lock(&pm_mutex); /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (!error) { + suspend_console(); error = device_suspend(PMSG_FREEZE); if (!error) { in_suspend = 1; error = swsusp_suspend(); device_resume(); } + resume_console(); } - up(&pm_sem); + mutex_unlock(&pm_mutex); if (!error) error = put_user(in_suspend, (unsigned int __user *)arg); if (!error) @@ -188,21 +195,23 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_ATOMIC_RESTORE: + snapshot_write_finalize(&data->handle); if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; break; } - snapshot_free_unused_memory(&data->handle); - down(&pm_sem); + mutex_lock(&pm_mutex); pm_prepare_console(); + suspend_console(); error = device_suspend(PMSG_PRETHAW); if (!error) { error = swsusp_resume(); device_resume(); } + resume_console(); pm_restore_console(); - up(&pm_sem); + mutex_unlock(&pm_mutex); break; case SNAPSHOT_FREE: @@ -233,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; } } - offset = alloc_swap_page(data->swap, data->bitmap); + offset = alloc_swapdev_block(data->swap, data->bitmap); if (offset) { offset <<= PAGE_SHIFT; - error = put_user(offset, (loff_t __user *)arg); + error = put_user(offset, (sector_t __user *)arg); } else { error = -ENOSPC; } @@ -259,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, * so we need to recode them */ if (old_decode_dev(arg)) { - data->swap = swap_type_of(old_decode_dev(arg)); + data->swap = swap_type_of(old_decode_dev(arg), 0); if (data->swap < 0) error = -ENODEV; } else { @@ -277,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; } - if (down_trylock(&pm_sem)) { + if (!mutex_trylock(&pm_mutex)) { error = -EBUSY; break; } @@ -289,6 +298,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, } /* Put devices to sleep */ + suspend_console(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "Failed to suspend some devices.\n"); @@ -299,12 +309,70 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, /* Wake up devices */ device_resume(); } - + resume_console(); if (pm_ops->finish) pm_ops->finish(PM_SUSPEND_MEM); -OutS3: - up(&pm_sem); + OutS3: + mutex_unlock(&pm_mutex); + break; + + case SNAPSHOT_PMOPS: + switch (arg) { + + case PMOPS_PREPARE: + if (pm_ops->prepare) { + error = pm_ops->prepare(PM_SUSPEND_DISK); + } + break; + + case PMOPS_ENTER: + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); + error = pm_ops->enter(PM_SUSPEND_DISK); + break; + + case PMOPS_FINISH: + if (pm_ops && pm_ops->finish) { + pm_ops->finish(PM_SUSPEND_DISK); + } + break; + + default: + printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); + error = -EINVAL; + + } + break; + + case SNAPSHOT_SET_SWAP_AREA: + if (data->bitmap) { + error = -EPERM; + } else { + struct resume_swap_area swap_area; + dev_t swdev; + + error = copy_from_user(&swap_area, (void __user *)arg, + sizeof(struct resume_swap_area)); + if (error) { + error = -EFAULT; + break; + } + + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + swdev = old_decode_dev(swap_area.dev); + if (swdev) { + offset = swap_area.offset; + data->swap = swap_type_of(swdev, offset); + if (data->swap < 0) + error = -ENODEV; + } else { + data->swap = -1; + error = -EINVAL; + } + } break; default: @@ -315,7 +383,7 @@ OutS3: return error; } -static struct file_operations snapshot_fops = { +static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, .read = snapshot_read, diff --git a/kernel/printk.c b/kernel/printk.c index 771f5e861bcd..185bb45eacf7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/syscalls.h> +#include <linux/jiffies.h> #include <asm/uaccess.h> @@ -52,8 +53,6 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; -EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ - /* * Low lever drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -334,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end) } } +static int __read_mostly ignore_loglevel; + +int __init ignore_loglevel_setup(char *str) +{ + ignore_loglevel = 1; + printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + + return 1; +} + +__setup("ignore_loglevel", ignore_loglevel_setup); + /* * Write out chars from start to end - 1 inclusive */ static void _call_console_drivers(unsigned long start, unsigned long end, int msg_log_level) { - if (msg_log_level < console_loglevel && + if ((msg_log_level < console_loglevel || ignore_loglevel) && console_drivers && start != end) { if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { /* wrapped write */ @@ -630,12 +641,7 @@ EXPORT_SYMBOL(vprintk); asmlinkage long sys_syslog(int type, char __user *buf, int len) { - return 0; -} - -int do_syslog(int type, char __user *buf, int len) -{ - return 0; + return -ENOSYS; } static void call_console_drivers(unsigned long start, unsigned long end) @@ -776,7 +782,6 @@ int is_console_locked(void) { return console_locked; } -EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ /** * release_console_sem - unlock the console system @@ -820,15 +825,8 @@ void release_console_sem(void) console_locked = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); - if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { - /* - * If we printk from within the lock dependency code, - * from within the scheduler code, then do not lock - * up due to self-recursion: - */ - if (!lockdep_internal()) - wake_up_interruptible(&log_wait); - } + if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) + wake_up_interruptible(&log_wait); } EXPORT_SYMBOL(release_console_sem); @@ -1108,3 +1106,23 @@ int printk_ratelimit(void) printk_ratelimit_burst); } EXPORT_SYMBOL(printk_ratelimit); + +/** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state + * @interval_msecs: minimum interval between prints + * + * printk_timed_ratelimit() returns true if more than @interval_msecs + * milliseconds have elapsed since the last time printk_timed_ratelimit() + * returned true. + */ +bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msecs) +{ + if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { + *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); + return true; + } + return false; +} +EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/profile.c b/kernel/profile.c index fb660c7d35ba..fb5e03d57e9d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -25,6 +25,7 @@ #include <linux/mutex.h> #include <asm/sections.h> #include <asm/semaphore.h> +#include <asm/irq_regs.h> struct profile_hit { u32 pc, hits; @@ -39,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly; static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; -static int prof_on __read_mostly; +int prof_on __read_mostly; static cpumask_t prof_cpu_mask = CPU_MASK_ALL; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); @@ -50,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex); static int __init profile_setup(char * str) { static char __initdata schedstr[] = "schedule"; + static char __initdata sleepstr[] = "sleep"; int par; - if (!strncmp(str, schedstr, strlen(schedstr))) { + if (!strncmp(str, sleepstr, strlen(sleepstr))) { + prof_on = SLEEP_PROFILING; + if (str[strlen(sleepstr)] == ',') + str += strlen(sleepstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel sleep profiling enabled (shift: %ld)\n", + prof_shift); + } else if (!strncmp(str, sleepstr, strlen(sleepstr))) { prof_on = SCHED_PROFILING; if (str[strlen(schedstr)] == ',') str += strlen(schedstr) + 1; @@ -203,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister); * positions to which hits are accounted during short intervals (e.g. * several seconds) is usually very small. Exclusion from buffer * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING profile_hit() may be called from process context). + * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from + * process context). * The hash function is meant to be lightweight as opposed to strong, * and was vaguely inspired by ppc64 firmware-supported inverted * pagetable hash functions, but uses a full hashtable full of finite @@ -256,7 +268,7 @@ static void profile_discard_flip_buffers(void) mutex_unlock(&profile_flip_mutex); } -void profile_hit(int type, void *__pc) +void profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; @@ -273,21 +285,31 @@ void profile_hit(int type, void *__pc) put_cpu(); return; } + /* + * We buffer the global profiler buffer into a per-CPU + * queue and thus reduce the number of global (and possibly + * NUMA-alien) accesses. The write-queue is self-coalescing: + */ local_irq_save(flags); do { for (j = 0; j < PROFILE_GRPSZ; ++j) { if (hits[i + j].pc == pc) { - hits[i + j].hits++; + hits[i + j].hits += nr_hits; goto out; } else if (!hits[i + j].hits) { hits[i + j].pc = pc; - hits[i + j].hits = 1; + hits[i + j].hits = nr_hits; goto out; } } i = (i + secondary) & (NR_PROFILE_HIT - 1); } while (i != primary); - atomic_inc(&prof_buffer[pc]); + + /* + * Add the current hit(s) and flush the write-queue out + * to the global buffer: + */ + atomic_add(nr_hits, &prof_buffer[pc]); for (i = 0; i < NR_PROFILE_HIT; ++i) { atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); hits[i].pc = hits[i].hits = 0; @@ -297,7 +319,6 @@ out: put_cpu(); } -#ifdef CONFIG_HOTPLUG_CPU static int __devinit profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { @@ -350,24 +371,26 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, } return NOTIFY_OK; } -#endif /* CONFIG_HOTPLUG_CPU */ #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) +#define profile_cpu_callback NULL -void profile_hit(int type, void *__pc) +void profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; if (prof_on != type || !prof_buffer) return; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); + atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); } #endif /* !CONFIG_SMP */ -void profile_tick(int type, struct pt_regs *regs) +void profile_tick(int type) { + struct pt_regs *regs = get_irq_regs(); + if (type == CPU_PROFILING && timer_hook) timer_hook(regs); if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) @@ -396,7 +419,7 @@ static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffe unsigned long full_count = count, err; cpumask_t new_value; - err = cpumask_parse(buffer, count, new_value); + err = cpumask_parse_user(buffer, count, new_value); if (err) return err; @@ -439,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) read = 0; while (p < sizeof(unsigned int) && count > 0) { - put_user(*((char *)(&sample_step)+p),buf); + if (put_user(*((char *)(&sample_step)+p),buf)) + return -EFAULT; buf++; p++; count--; read++; } pnt = (char *)prof_buffer + p - sizeof(atomic_t); @@ -477,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, return count; } -static struct file_operations proc_profile_operations = { +static const struct file_operations proc_profile_operations = { .read = read_profile, .write = write_profile, }; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9a111f70145c..4d50e06fd745 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data) return 0; } -/* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages - */ - -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) -{ - struct mm_struct *mm; - struct vm_area_struct *vma; - struct page *page; - void *old_buf = buf; - - mm = get_task_mm(tsk); - if (!mm) - return 0; - - down_read(&mm->mmap_sem); - /* ignore errors, just check how much was sucessfully transfered */ - while (len) { - int bytes, ret, offset; - void *maddr; - - ret = get_user_pages(tsk, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); - } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); - } - kunmap(page); - page_cache_release(page); - len -= bytes; - buf += bytes; - addr += bytes; - } - up_read(&mm->mmap_sem); - mmput(mm); - - return buf - old_buf; -} - int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; @@ -494,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) child = find_task_by_pid(pid); if (child) get_task_struct(child); + read_unlock(&tasklist_lock); if (!child) return ERR_PTR(-ESRCH); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 523e46483b99..3554b76da84c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -71,9 +71,6 @@ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int blimit = 10; static int qhimark = 10000; static int qlowmark = 100; -#ifdef CONFIG_SMP -static int rsinterval = 1000; -#endif static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -86,8 +83,8 @@ static void force_quiescent_state(struct rcu_data *rdp, int cpu; cpumask_t cpumask; set_need_resched(); - if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { - rdp->last_rs_qlen = rdp->qlen; + if (unlikely(!rcp->signaled)) { + rcp->signaled = 1; /* * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. @@ -238,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp) list = rdp->donelist; while (list) { - next = rdp->donelist = list->next; + next = list->next; + prefetch(next); list->func(list); list = next; if (++count >= rdp->blimit) break; } + rdp->donelist = list; local_irq_disable(); rdp->qlen -= count; @@ -301,6 +300,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) smp_mb(); cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); + rcp->signaled = 0; } } @@ -628,9 +628,6 @@ void synchronize_rcu(void) module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -#ifdef CONFIG_SMP -module_param(rsinterval, int, 0); -#endif EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); EXPORT_SYMBOL_GPL(call_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 4d1c3d247127..c52f981ea008 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -15,9 +15,10 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright (C) IBM Corporation, 2005 + * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Josh Triplett <josh@freedesktop.org> * * See also: Documentation/RCU/torture.txt */ @@ -44,19 +45,25 @@ #include <linux/delay.h> #include <linux/byteorder/swabb.h> #include <linux/stat.h> +#include <linux/srcu.h> MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " + "Josh Triplett <josh@freedesktop.org>"); -static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ +static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ +static int nfakewriters = 4; /* # fake writer threads */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ -static char *torture_type = "rcu"; /* What to torture. */ +static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +module_param(nfakewriters, int, 0); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); module_param(stat_interval, int, 0); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); module_param(verbose, bool, 0); @@ -66,7 +73,7 @@ MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); module_param(torture_type, charp, 0); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ @@ -80,6 +87,7 @@ static char printk_buf[4096]; static int nrealreaders; static struct task_struct *writer_task; +static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; @@ -104,11 +112,12 @@ static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -atomic_t n_rcu_torture_alloc; -atomic_t n_rcu_torture_alloc_fail; -atomic_t n_rcu_torture_free; -atomic_t n_rcu_torture_mberror; -atomic_t n_rcu_torture_error; +static atomic_t n_rcu_torture_alloc; +static atomic_t n_rcu_torture_alloc_fail; +static atomic_t n_rcu_torture_free; +static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_error; +static struct list_head rcu_torture_removed; /* * Allocate an element from the rcu_tortures pool. @@ -145,7 +154,7 @@ rcu_torture_free(struct rcu_torture *p) struct rcu_random_state { unsigned long rrs_state; - unsigned long rrs_count; + long rrs_count; }; #define RCU_RANDOM_MULT 39916801 /* prime */ @@ -158,7 +167,7 @@ struct rcu_random_state { * Crude but fast random-number generator. Uses a linear congruential * generator, with occasional help from get_random_bytes(). */ -static long +static unsigned long rcu_random(struct rcu_random_state *rrsp) { long refresh; @@ -180,9 +189,11 @@ struct rcu_torture_ops { void (*init)(void); void (*cleanup)(void); int (*readlock)(void); + void (*readdelay)(struct rcu_random_state *rrsp); void (*readunlock)(int idx); int (*completed)(void); void (*deferredfree)(struct rcu_torture *p); + void (*sync)(void); int (*stats)(char *page); char *name; }; @@ -192,13 +203,25 @@ static struct rcu_torture_ops *cur_ops = NULL; * Definitions for rcu torture testing. */ -static int rcu_torture_read_lock(void) +static int rcu_torture_read_lock(void) __acquires(RCU) { rcu_read_lock(); return 0; } -static void rcu_torture_read_unlock(int idx) +static void rcu_read_delay(struct rcu_random_state *rrsp) +{ + long delay; + const long longdelay = 200; + + /* We want there to be long-running readers, but not all the time. */ + + delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); + if (!delay) + udelay(longdelay); +} + +static void rcu_torture_read_unlock(int idx) __releases(RCU) { rcu_read_unlock(); } @@ -239,24 +262,65 @@ static struct rcu_torture_ops rcu_ops = { .init = NULL, .cleanup = NULL, .readlock = rcu_torture_read_lock, + .readdelay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, + .sync = synchronize_rcu, .stats = NULL, .name = "rcu" }; +static void rcu_sync_torture_deferred_free(struct rcu_torture *p) +{ + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + + cur_ops->sync(); + list_add(&p->rtort_free, &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } +} + +static void rcu_sync_torture_init(void) +{ + INIT_LIST_HEAD(&rcu_torture_removed); +} + +static struct rcu_torture_ops rcu_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .readdelay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu, + .stats = NULL, + .name = "rcu_sync" +}; + /* * Definitions for rcu_bh torture testing. */ -static int rcu_bh_torture_read_lock(void) +static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) { rcu_read_lock_bh(); return 0; } -static void rcu_bh_torture_read_unlock(int idx) +static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) { rcu_read_unlock_bh(); } @@ -271,19 +335,176 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); } +struct rcu_bh_torture_synchronize { + struct rcu_head head; + struct completion completion; +}; + +static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) +{ + struct rcu_bh_torture_synchronize *rcu; + + rcu = container_of(head, struct rcu_bh_torture_synchronize, head); + complete(&rcu->completion); +} + +static void rcu_bh_torture_synchronize(void) +{ + struct rcu_bh_torture_synchronize rcu; + + init_completion(&rcu.completion); + call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); + wait_for_completion(&rcu.completion); +} + static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, .cleanup = NULL, .readlock = rcu_bh_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, .stats = NULL, .name = "rcu_bh" }; +static struct rcu_torture_ops rcu_bh_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .stats = NULL, + .name = "rcu_bh_sync" +}; + +/* + * Definitions for srcu torture testing. + */ + +static struct srcu_struct srcu_ctl; + +static void srcu_torture_init(void) +{ + init_srcu_struct(&srcu_ctl); + rcu_sync_torture_init(); +} + +static void srcu_torture_cleanup(void) +{ + synchronize_srcu(&srcu_ctl); + cleanup_srcu_struct(&srcu_ctl); +} + +static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) +{ + return srcu_read_lock(&srcu_ctl); +} + +static void srcu_read_delay(struct rcu_random_state *rrsp) +{ + long delay; + const long uspertick = 1000000 / HZ; + const long longdelay = 10; + + /* We want there to be long-running readers, but not all the time. */ + + delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); + if (!delay) + schedule_timeout_interruptible(longdelay); +} + +static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) +{ + srcu_read_unlock(&srcu_ctl, idx); +} + +static int srcu_torture_completed(void) +{ + return srcu_batches_completed(&srcu_ctl); +} + +static void srcu_torture_synchronize(void) +{ + synchronize_srcu(&srcu_ctl); +} + +static int srcu_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + int idx = srcu_ctl.completed & 0x1; + + cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); + for_each_possible_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +static struct rcu_torture_ops srcu_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .readdelay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .stats = srcu_torture_stats, + .name = "srcu" +}; + +/* + * Definitions for sched torture testing. + */ + +static int sched_torture_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_torture_read_unlock(int idx) +{ + preempt_enable(); +} + +static int sched_torture_completed(void) +{ + return 0; +} + +static void sched_torture_synchronize(void) +{ + synchronize_sched(); +} + +static struct rcu_torture_ops sched_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .stats = NULL, + .name = "sched" +}; + static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_bh_ops, NULL }; + { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, + &sched_ops, NULL }; /* * RCU torture writer kthread. Repeatedly substitutes a new structure @@ -330,6 +551,30 @@ rcu_torture_writer(void *arg) } /* + * RCU torture fake writer kthread. Repeatedly calls sync, with a random + * delay between calls. + */ +static int +rcu_torture_fakewriter(void *arg) +{ + DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); + udelay(rcu_random(&rand) & 0x3ff); + cur_ops->sync(); + } while (!kthread_should_stop() && !fullstop); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The * counter in the element should never be greater than 1, otherwise, the @@ -359,7 +604,7 @@ rcu_torture_reader(void *arg) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - udelay(rcu_random(&rand) & 0x7f); + cur_ops->readdelay(&rand); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -483,7 +728,7 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. */ -void rcu_torture_shuffle_tasks(void) +static void rcu_torture_shuffle_tasks(void) { cpumask_t tmp_mask = CPU_MASK_ALL; int i; @@ -507,6 +752,12 @@ void rcu_torture_shuffle_tasks(void) set_cpus_allowed(reader_tasks[i], tmp_mask); } + if (fakewriter_tasks != NULL) { + for (i = 0; i < nfakewriters; i++) + if (fakewriter_tasks[i]) + set_cpus_allowed(fakewriter_tasks[i], tmp_mask); + } + if (writer_task) set_cpus_allowed(writer_task, tmp_mask); @@ -540,11 +791,12 @@ rcu_torture_shuffle(void *arg) static inline void rcu_torture_print_module_parms(char *tag) { - printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " + printk(KERN_ALERT "%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval = %d\n", - torture_type, tag, nrealreaders, stat_interval, verbose, - test_no_idle_hz, shuffle_interval); + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval); } static void @@ -579,6 +831,19 @@ rcu_torture_cleanup(void) } rcu_torture_current = NULL; + if (fakewriter_tasks != NULL) { + for (i = 0; i < nfakewriters; i++) { + if (fakewriter_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_fakewriter task"); + kthread_stop(fakewriter_tasks[i]); + } + fakewriter_tasks[i] = NULL; + } + kfree(fakewriter_tasks); + fakewriter_tasks = NULL; + } + if (stats_task != NULL) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); @@ -666,7 +931,25 @@ rcu_torture_init(void) writer_task = NULL; goto unwind; } - reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), + fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nfakewriters; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + "rcu_torture_fakewriter"); + if (IS_ERR(fakewriter_tasks[i])) { + firsterr = PTR_ERR(fakewriter_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + fakewriter_tasks[i] = NULL; + goto unwind; + } + } + reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_PRINTK_ERRSTRING("out of memory"); diff --git a/kernel/relay.c b/kernel/relay.c index 33345e73485c..818e514729cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) * @buf: the buffer struct * @size: total size of the buffer * - * Returns a pointer to the resulting buffer, NULL if unsuccessful. The + * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The * passed in size will get page aligned, if it isn't already. */ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) @@ -132,10 +132,9 @@ depopulate: /** * relay_create_buf - allocate and initialize a channel buffer - * @alloc_size: size of the buffer to allocate - * @n_subbufs: number of sub-buffers in the channel + * @chan: the relay channel * - * Returns channel buffer if successful, NULL otherwise + * Returns channel buffer if successful, %NULL otherwise. */ struct rchan_buf *relay_create_buf(struct rchan *chan) { @@ -163,6 +162,7 @@ free_buf: /** * relay_destroy_channel - free the channel struct + * @kref: target kernel reference that contains the relay channel * * Should only be called from kref_put(). */ @@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf) /** * relay_remove_buf - remove a channel buffer + * @kref: target kernel reference that contains the relay buffer * * Removes the file from the fileystem, which also frees the * rchan_buf_struct and the channel buffer. Should only be called from @@ -307,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = { * reason waking is deferred is that calling directly from write * causes problems if you're writing from say the scheduler. */ -static void wakeup_readers(void *private) +static void wakeup_readers(struct work_struct *work) { - struct rchan_buf *buf = private; + struct rchan_buf *buf = + container_of(work, struct rchan_buf, wake_readers.work); wake_up_interruptible(&buf->read_wait); } @@ -327,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) if (init) { init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); - INIT_WORK(&buf->wake_readers, NULL, NULL); + INIT_DELAYED_WORK(&buf->wake_readers, NULL); } else { cancel_delayed_work(&buf->wake_readers); flush_scheduled_work(); @@ -374,7 +376,7 @@ void relay_reset(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_reset); -/** +/* * relay_open_buf - create a new relay channel buffer * * Internal - used by relay_open(). @@ -448,12 +450,12 @@ static inline void setup_callbacks(struct rchan *chan, /** * relay_open - create a new relay channel * @base_filename: base name of files to create - * @parent: dentry of parent directory, NULL for root directory + * @parent: dentry of parent directory, %NULL for root directory * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions * - * Returns channel pointer if successful, NULL otherwise. + * Returns channel pointer if successful, %NULL otherwise. * * Creates a channel buffer for each cpu using the sizes and * attributes specified. The created channel buffer files @@ -548,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->padding[old_subbuf]; smp_mb(); if (waitqueue_active(&buf->read_wait)) { - PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); + PREPARE_DELAYED_WORK(&buf->wake_readers, + wakeup_readers); schedule_delayed_work(&buf->wake_readers, 1); } } @@ -585,7 +588,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf); * subbufs_consumed should be the number of sub-buffers newly consumed, * not the total consumed. * - * NOTE: kernel clients don't need to call this function if the channel + * NOTE: Kernel clients don't need to call this function if the channel * mode is 'overwrite'. */ void relay_subbufs_consumed(struct rchan *chan, @@ -641,7 +644,7 @@ EXPORT_SYMBOL_GPL(relay_close); * relay_flush - close the channel * @chan: the channel * - * Flushes all channel buffers i.e. forces buffer switch. + * Flushes all channel buffers, i.e. forces buffer switch. */ void relay_flush(struct rchan *chan) { @@ -669,7 +672,7 @@ EXPORT_SYMBOL_GPL(relay_flush); */ static int relay_file_open(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = inode->u.generic_ip; + struct rchan_buf *buf = inode->i_private; kref_get(&buf->kref); filp->private_data = buf; @@ -729,7 +732,7 @@ static int relay_file_release(struct inode *inode, struct file *filp) return 0; } -/** +/* * relay_file_read_consume - update the consumed count for the buffer */ static void relay_file_read_consume(struct rchan_buf *buf, @@ -756,7 +759,7 @@ static void relay_file_read_consume(struct rchan_buf *buf, } } -/** +/* * relay_file_read_avail - boolean, are there unconsumed bytes available? */ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) @@ -793,6 +796,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) /** * relay_file_read_subbuf_avail - return bytes available in sub-buffer + * @read_pos: file read position + * @buf: relay channel buffer */ static size_t relay_file_read_subbuf_avail(size_t read_pos, struct rchan_buf *buf) @@ -818,6 +823,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, /** * relay_file_read_start_pos - find the first available byte to read + * @read_pos: file read position + * @buf: relay channel buffer * * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise @@ -844,6 +851,9 @@ static size_t relay_file_read_start_pos(size_t read_pos, /** * relay_file_read_end_pos - return the new read position + * @read_pos: file read position + * @buf: relay channel buffer + * @count: number of bytes to be read */ static size_t relay_file_read_end_pos(struct rchan_buf *buf, size_t read_pos, @@ -865,7 +875,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, return end_pos; } -/** +/* * subbuf_read_actor - read up to one subbuf's worth of data */ static int subbuf_read_actor(size_t read_start, @@ -879,7 +889,7 @@ static int subbuf_read_actor(size_t read_start, from = buf->start + read_start; ret = avail; - if (copy_to_user(desc->arg.data, from, avail)) { + if (copy_to_user(desc->arg.buf, from, avail)) { desc->error = -EFAULT; ret = 0; } @@ -890,7 +900,7 @@ static int subbuf_read_actor(size_t read_start, return ret; } -/** +/* * subbuf_send_actor - send up to one subbuf's worth of data */ static int subbuf_send_actor(size_t read_start, @@ -933,30 +943,23 @@ typedef int (*subbuf_actor_t) (size_t read_start, read_descriptor_t *desc, read_actor_t actor); -/** +/* * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries */ static inline ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, - size_t count, subbuf_actor_t subbuf_actor, read_actor_t actor, - void *target) + read_descriptor_t *desc) { struct rchan_buf *buf = filp->private_data; size_t read_start, avail; - read_descriptor_t desc; int ret; - if (!count) + if (!desc->count) return 0; - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - - mutex_lock(&filp->f_dentry->d_inode->i_mutex); + mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -966,19 +969,19 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, if (!avail) break; - avail = min(desc.count, avail); - ret = subbuf_actor(read_start, buf, avail, &desc, actor); - if (desc.error < 0) + avail = min(desc->count, avail); + ret = subbuf_actor(read_start, buf, avail, desc, actor); + if (desc->error < 0) break; if (ret) { relay_file_read_consume(buf, read_start, ret); *ppos = relay_file_read_end_pos(buf, read_start, ret); } - } while (desc.count && ret); - mutex_unlock(&filp->f_dentry->d_inode->i_mutex); + } while (desc->count && ret); + mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); - return desc.written; + return desc->written; } static ssize_t relay_file_read(struct file *filp, @@ -986,8 +989,13 @@ static ssize_t relay_file_read(struct file *filp, size_t count, loff_t *ppos) { - return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor, - NULL, buffer); + read_descriptor_t desc; + desc.written = 0; + desc.count = count; + desc.arg.buf = buffer; + desc.error = 0; + return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, + NULL, &desc); } static ssize_t relay_file_sendfile(struct file *filp, @@ -996,11 +1004,16 @@ static ssize_t relay_file_sendfile(struct file *filp, read_actor_t actor, void *target) { - return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor, - actor, target); + read_descriptor_t desc; + desc.written = 0; + desc.count = count; + desc.arg.data = target; + desc.error = 0; + return relay_file_read_subbufs(filp, ppos, subbuf_send_actor, + actor, &desc); } -struct file_operations relay_file_operations = { +const struct file_operations relay_file_operations = { .open = relay_file_open, .poll = relay_file_poll, .mmap = relay_file_mmap, diff --git a/kernel/resource.c b/kernel/resource.c index 46286434af80..7b9a497419d9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations resource_op = { +static const struct seq_operations resource_op = { .start = r_start, .next = r_next, .stop = r_stop, @@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file) return res; } -static struct file_operations proc_ioports_operations = { +static const struct file_operations proc_ioports_operations = { .open = ioports_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; -static struct file_operations proc_iomem_operations = { +static const struct file_operations proc_iomem_operations = { .open = iomem_open, .read = seq_read, .llseek = seq_lseek, @@ -193,6 +193,13 @@ static int __release_resource(struct resource *old) return -EINVAL; } +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ int request_resource(struct resource *root, struct resource *new) { struct resource *conflict; @@ -205,6 +212,15 @@ int request_resource(struct resource *root, struct resource *new) EXPORT_SYMBOL(request_resource); +/** + * ____request_resource - reserve a resource, with resource conflict returned + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns: + * On success, NULL is returned. + * On error, a pointer to the conflicting resource is returned. + */ struct resource *____request_resource(struct resource *root, struct resource *new) { struct resource *conflict; @@ -217,6 +233,10 @@ struct resource *____request_resource(struct resource *root, struct resource *ne EXPORT_SYMBOL(____request_resource); +/** + * release_resource - release a previously reserved resource + * @old: resource pointer + */ int release_resource(struct resource *old) { int retval; @@ -315,8 +335,16 @@ static int find_resource(struct resource *root, struct resource *new, return -EBUSY; } -/* - * Allocate empty slot in the resource tree given range and alignment. +/** + * allocate_resource - allocate empty slot in the resource tree given range & alignment + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * @size: requested resource region size + * @min: minimum size to allocate + * @max: maximum size to allocate + * @align: alignment requested, in bytes + * @alignf: alignment function, optional, called if not NULL + * @alignf_data: arbitrary data to pass to the @alignf function */ int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, @@ -344,12 +372,11 @@ EXPORT_SYMBOL(allocate_resource); * * Returns 0 on success, -EBUSY if the resource can't be inserted. * - * This function is equivalent of request_resource when no conflict + * This function is equivalent to request_resource when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become childs of - * the new resource. Otherwise the new resource becomes the child of - * the conflicting resource + * resource is inserted and the conflicting resources become children of + * the new resource. */ int insert_resource(struct resource *parent, struct resource *new) { @@ -357,20 +384,21 @@ int insert_resource(struct resource *parent, struct resource *new) struct resource *first, *next; write_lock(&resource_lock); - begin: - result = 0; - first = __request_resource(parent, new); - if (!first) - goto out; - result = -EBUSY; - if (first == parent) - goto out; + for (;; parent = first) { + result = 0; + first = __request_resource(parent, new); + if (!first) + goto out; - /* Resource fully contained by the clashing resource? Recurse into it */ - if (first->start <= new->start && first->end >= new->end) { - parent = first; - goto begin; + result = -EBUSY; + if (first == parent) + goto out; + + if ((first->start > new->start) || (first->end < new->end)) + break; + if ((first->start == new->start) && (first->end == new->end)) + break; } for (next = first; ; next = next->sibling) { @@ -407,10 +435,15 @@ int insert_resource(struct resource *parent, struct resource *new) return result; } -/* +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * * Given an existing resource, change its start and size to match the - * arguments. Returns -EBUSY if it can't fit. Existing children of - * the resource are assumed to be immutable. + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. */ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { @@ -456,11 +489,19 @@ EXPORT_SYMBOL(adjust_resource); * Note how this, unlike the above, knows about * the IO flag meanings (busy etc). * - * Request-region creates a new busy region. + * request_region creates a new busy region. * - * Check-region returns non-zero if the area is already busy + * check_region returns non-zero if the area is already busy. * - * Release-region releases a matching busy region. + * release_region releases a matching busy region. + */ + +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, @@ -497,9 +538,23 @@ struct resource * __request_region(struct resource *parent, } return res; } - EXPORT_SYMBOL(__request_region); +/** + * __check_region - check if a resource region is busy or free + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * Returns 0 if the region is free at the moment it is checked, + * returns %-EBUSY if the region is busy. + * + * NOTE: + * This function is deprecated because its use is racy. + * Even if it returns 0, a subsequent call to request_region() + * may fail because another driver etc. just allocated the region. + * Do NOT use it. It will be removed from the kernel. + */ int __check_region(struct resource *parent, resource_size_t start, resource_size_t n) { @@ -513,9 +568,16 @@ int __check_region(struct resource *parent, resource_size_t start, kfree(res); return 0; } - EXPORT_SYMBOL(__check_region); +/** + * __release_region - release a previously reserved resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * + * The described resource region must match a currently busy region. + */ void __release_region(struct resource *parent, resource_size_t start, resource_size_t n) { @@ -553,7 +615,6 @@ void __release_region(struct resource *parent, resource_size_t start, "<%016llx-%016llx>\n", (unsigned long long)start, (unsigned long long)end); } - EXPORT_SYMBOL(__release_region); /* diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 0c1faa950af7..da8d6bf46457 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -16,7 +16,6 @@ * * See rt.c in preempt-rt for proper credits and further information */ -#include <linux/config.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/module.h> diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 948bd8f643e2..015fc633c96c 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -6,7 +6,6 @@ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * */ -#include <linux/config.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/sched.h> @@ -14,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/sysdev.h> #include <linux/timer.h> +#include <linux/freezer.h> #include "rtmutex.h" diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 3e13a1e5856f..4ab17da46fd8 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Grab the next task */ task = rt_mutex_owner(lock); + get_task_struct(task); spin_lock_irqsave(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { @@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } - get_task_struct(task); spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); @@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; unsigned long flags; - int boost = 0, res; + int chain_walk = 0, res; spin_lock_irqsave(¤t->pi_lock, flags); __rt_mutex_adjust_prio(current); @@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } - spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { - spin_lock_irqsave(&owner->pi_lock, flags); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } + if (owner->pi_blocked_on) + chain_walk = 1; spin_unlock_irqrestore(&owner->pi_lock, flags); } - if (!boost) + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + chain_walk = 1; + + if (!chain_walk) return 0; + /* + * The owner can't disappear while holding a lock, + * so the owner struct is protected by wait_lock. + * Gets dropped in rt_mutex_adjust_prio_chain()! + */ + get_task_struct(owner); + spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, @@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock, int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); unsigned long flags; - int boost = 0; + int chain_walk = 0; spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); @@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) { - boost = 1; - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - } + if (owner->pi_blocked_on) + chain_walk = 1; + spin_unlock_irqrestore(&owner->pi_lock, flags); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - if (!boost) + if (!chain_walk) return; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + spin_unlock(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); @@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task) return; } - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); spin_unlock_irqrestore(&task->pi_lock, flags); + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } diff --git a/kernel/sched.c b/kernel/sched.c index 5c848fd4e461..8a0afb97af71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -34,7 +34,7 @@ #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> -#include <linux/suspend.h> +#include <linux/freezer.h> #include <linux/vmalloc.h> #include <linux/blkdev.h> #include <linux/delay.h> @@ -49,7 +49,7 @@ #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/times.h> -#include <linux/acct.h> +#include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> #include <asm/tlb.h> @@ -160,15 +160,6 @@ #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - */ - #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -180,6 +171,15 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + static inline unsigned int task_timeslice(struct task_struct *p) { return static_prio_timeslice(p->static_prio); @@ -225,8 +225,10 @@ struct rq { unsigned long nr_uninterruptible; unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; + unsigned long next_balance; struct mm_struct *prev_mm; struct prio_array *active, *expired, arrays[2]; int best_expired_prio; @@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 12 +#define SCHEDSTAT_VERSION 14 static int show_schedstat(struct seq_file *seq, void *v) { @@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", sd->lb_cnt[itype], sd->lb_balanced[itype], sd->lb_failed[itype], @@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); } preempt_enable(); #endif @@ -505,7 +510,7 @@ static int schedstat_open(struct inode *inode, struct file *file) return res; } -struct file_operations proc_schedstat_operations = { +const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, @@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) #endif /* - * rq_lock - lock a given runqueue and disable interrupts. + * this_rq_lock - lock this runqueue and disable interrupts. */ static inline struct rq *this_rq_lock(void) __acquires(rq->lock) @@ -938,18 +943,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) { unsigned long long now; + if (rt_task(p)) + goto out; + now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ struct rq *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; } #endif - if (!rt_task(p)) - p->prio = recalc_task_prio(p, now); + /* + * Sleep time is in units of nanosecs, so shift by 20 to get a + * milliseconds-range estimation of the amount of time that the task + * spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + if (p->state == TASK_UNINTERRUPTIBLE) + profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), + (now - p->timestamp) >> 20); + } + + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -974,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) } } p->timestamp = now; - +out: __activate_task(p, rq); } @@ -1232,7 +1250,7 @@ nextgroup: } /* - * find_idlest_queue - find the idlest runqueue among the cpus in group. + * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) @@ -1286,21 +1304,29 @@ static int sched_balance_self(int cpu, int flag) while (sd) { cpumask_t span; struct sched_group *group; - int new_cpu; - int weight; + int new_cpu, weight; + + if (!(sd->flags & flag)) { + sd = sd->child; + continue; + } span = sd->span; group = find_idlest_group(sd, t, cpu); - if (!group) - goto nextlevel; + if (!group) { + sd = sd->child; + continue; + } new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) - goto nextlevel; + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } - /* Now try balancing at a lower domain level */ + /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; -nextlevel: sd = NULL; weight = cpus_weight(span); for_each_domain(cpu, tmp) { @@ -1431,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; - unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + unsigned long tl_per_task; + + tl_per_task = cpu_avg_load_per_task(this_cpu); /* * If sync wakeup then subtract the (maximum possible) @@ -1669,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. */ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1755,27 +1783,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; - unsigned long prev_task_flags; + long prev_state; rq->prev_mm = NULL; /* * A task struct has one reference for the use as "current". - * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and - * calls schedule one last time. The schedule call will never return, - * and the scheduled task must drop that reference. - * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. + * The test for TASK_DEAD must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. * Manfred Spraul <manfred@colorfullife.com> */ - prev_task_flags = prev->flags; + prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); if (mm) mmdrop(mm); - if (unlikely(prev_task_flags & PF_DEAD)) { + if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -1814,14 +1842,14 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; - if (unlikely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; @@ -1933,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { + BUG_ON(!irqs_disabled()); if (rq1 == rq2) { spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ @@ -1972,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -2042,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. @@ -2079,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->most_recent_timestamp, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif return 1; + } - if (task_hot(p, rq->timestamp_last_tick, sd)) + if (task_hot(p, rq->most_recent_timestamp, sd)) return 0; return 1; } @@ -2180,11 +2219,6 @@ skip_queue: goto skip_bitmap; } -#ifdef CONFIG_SCHEDSTATS - if (task_hot(tmp, busiest->timestamp_last_tick, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -2222,7 +2256,7 @@ out: static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum idle_type idle, int *sd_idle, - cpumask_t *cpus) + cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2251,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long load, group_capacity; int local_group; int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); + if (local_group) + balance_cpu = first_cpu(group->cpumask); + /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -2270,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, *sd_idle = 0; /* Bias balancing toward cpus of our domain */ - if (local_group) + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + load = target_load(i, load_idx); - else + } else load = source_load(i, load_idx); avg_load += load; @@ -2280,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load += rq->raw_weighted_load; } + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. + */ + if (local_group && balance_cpu != this_cpu && balance) { + *balance = 0; + goto ret; + } + total_load += avg_load; total_pwr += group->cpu_power; @@ -2439,18 +2492,21 @@ small_imbalance: pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + busiest->cpu_power; if (max_load > tmp) pwr_move += busiest->cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load*busiest->cpu_power < - busiest_load_per_task*SCHED_LOAD_SCALE) - tmp = max_load*busiest->cpu_power/this->cpu_power; + if (max_load * busiest->cpu_power < + busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = max_load * busiest->cpu_power / this->cpu_power; else - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + this->cpu_power; + pwr_move += this->cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2471,8 +2527,8 @@ out_balanced: *imbalance = min_load_per_task; return group_min; } -ret: #endif +ret: *imbalance = 0; return NULL; } @@ -2521,27 +2577,37 @@ static inline unsigned long minus_1_or_zero(unsigned long n) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. - * - * Called with this_rq unlocked. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, + int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + unsigned long flags; + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus); + &cpus, balance); + + if (*balance == 0) + goto out_balanced; + if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2565,11 +2631,13 @@ redo: * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ + local_irq_save(flags); double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { @@ -2586,13 +2654,13 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock(&busiest->lock); + spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; } @@ -2602,7 +2670,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -2630,7 +2698,7 @@ redo: } if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return nr_moved; @@ -2646,7 +2714,7 @@ out_one_pinned: sd->balance_interval *= 2; if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return 0; } @@ -2668,13 +2736,20 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int sd_idle = 0; cpumask_t cpus = CPU_MASK_ALL; - if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ + if (sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, - &sd_idle, &cpus); + &sd_idle, &cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2709,7 +2784,8 @@ redo: if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; } else sd->nr_balance_failed = 0; @@ -2719,7 +2795,7 @@ redo: out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; sd->nr_balance_failed = 0; @@ -2733,14 +2809,28 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ - if (load_balance_newidle(this_cpu, this_rq, sd)) + pulled_task = load_balance_newidle(this_cpu, + this_rq, sd); + if (time_after(next_balance, + sd->last_balance + sd->balance_interval)) + next_balance = sd->last_balance + + sd->balance_interval; + if (pulled_task) break; } } + if (!pulled_task) + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; } /* @@ -2793,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } -/* - * rebalance_tick will get called every timer tick, on every CPU. - * - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ - -/* Don't have all balancing operations going off at once: */ -static inline unsigned long cpu_offset(int cpu) -{ - return jiffies + cpu * HZ / NR_CPUS; -} - -static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +static void update_load(struct rq *this_rq) { - unsigned long this_load, interval, j = cpu_offset(this_cpu); - struct sched_domain *sd; + unsigned long this_load; int i, scale; this_load = this_rq->raw_weighted_load; @@ -2832,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; } +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ +static DEFINE_SPINLOCK(balancing); + +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(), balance = 1; + struct rq *this_rq = cpu_rq(this_cpu); + unsigned long interval; + struct sched_domain *sd; + /* + * We are idle if there are no processes running. This + * is valid even if we are the idle process (SMT). + */ + enum idle_type idle = !this_rq->nr_running ? + SCHED_IDLE : NOT_IDLE; + /* Earliest time when we have to call run_rebalance_domains again */ + unsigned long next_balance = jiffies + 60*HZ; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2846,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) if (unlikely(!interval)) interval = 1; - if (j - sd->last_balance >= interval) { - if (load_balance(this_cpu, this_rq, sd, idle)) { + if (sd->flags & SD_SERIALIZE) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2855,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) */ idle = NOT_IDLE; } - sd->last_balance += interval; + sd->last_balance = jiffies; } + if (sd->flags & SD_SERIALIZE) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; } + this_rq->next_balance = next_balance; } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) -{ -} static inline void idle_balance(int cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(struct rq *rq) +static inline void wake_priority_sleeper(struct rq *rq) { - int ret = 0; - #ifdef CONFIG_SCHED_SMT + if (!rq->nr_running) + return; + spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. */ - if (rq->nr_running) { + if (rq->nr_running) resched_task(rq->idle); - ret = 1; - } spin_unlock(&rq->lock); #endif - return ret; } DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -2901,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; } /* @@ -2914,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p) unsigned long flags; local_irq_save(flags); - ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); - ns = p->sched_time + sched_clock() - ns; + ns = p->sched_time + sched_clock() - p->last_ran; local_irq_restore(flags); return ns; @@ -3015,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. - */ -void scheduler_tick(void) +static void task_running_tick(struct rq *rq, struct task_struct *p) { - unsigned long long now = sched_clock(); - struct task_struct *p = current; - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - update_cpu_clock(p, rq, now); - - rq->timestamp_last_tick = now; - - if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); - return; - } - - /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { + /* Task has expired but was not scheduled yet */ set_tsk_need_resched(p); - goto out; + return; } spin_lock(&rq->lock); /* @@ -3111,8 +3201,34 @@ void scheduler_tick(void) } out_unlock: spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + unsigned long long now = sched_clock(); + struct task_struct *p = current; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + update_cpu_clock(p, rq, now); + + if (p == rq->idle) + /* Task on the idle queue */ + wake_priority_sleeper(rq); + else + task_running_tick(rq, p); +#ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); +#endif } #ifdef CONFIG_SCHED_SMT @@ -3258,7 +3374,8 @@ void fastcall add_preempt_count(int val) /* * Spinlock count overflowing soon? */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); } EXPORT_SYMBOL(add_preempt_count); @@ -3311,6 +3428,7 @@ asmlinkage void __sched schedule(void) printk(KERN_ERR "BUG: scheduling while atomic: " "%s/0x%08x/%d\n", current->comm, preempt_count(), current->pid); + debug_show_held_locks(current); dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -3348,9 +3466,6 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); - if (unlikely(prev->flags & PF_DEAD)) - prev->state = EXIT_DEAD; - switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; @@ -3472,7 +3587,7 @@ asmlinkage void __sched preempt_schedule(void) * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (unlikely(ti->preempt_count || irqs_disabled())) + if (likely(ti->preempt_count || irqs_disabled())) return; need_resched: @@ -4080,6 +4195,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. + * + * NOTE: the task may be already dead */ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) @@ -4107,28 +4224,32 @@ recheck: (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) - != (param->sched_priority == 0)) + if (is_rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - /* - * can't change policy, except between SCHED_NORMAL - * and SCHED_BATCH: - */ - if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && - (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && - !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) - return -EPERM; - /* can't increase priority */ - if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && - param->sched_priority > p->rt_priority && - param->sched_priority > - p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) - return -EPERM; + if (is_rt_policy(policy)) { + unsigned long rlim_rtprio; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + unlock_task_sighand(p, &flags); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } + /* can't change other user's priorities */ if ((current->euid != p->euid) && (current->euid != p->uid)) @@ -4193,14 +4314,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - read_lock_irq(&tasklist_lock); + + rcu_read_lock(); + retval = -ESRCH; p = find_process_by_pid(pid); - if (!p) { - read_unlock_irq(&tasklist_lock); - return -ESRCH; - } - retval = sched_setscheduler(p, policy, &lparam); - read_unlock_irq(&tasklist_lock); + if (p != NULL) + retval = sched_setscheduler(p, policy, &lparam); + rcu_read_unlock(); return retval; } @@ -4382,7 +4502,10 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); #endif long sched_getaffinity(pid_t pid, cpumask_t *mask) @@ -4777,18 +4900,18 @@ static void show_task(struct task_struct *p) show_stack(p, NULL); } -void show_state(void) +void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; #if (BITS_PER_LONG == 32) printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); + " free sibling\n"); + printk(" task PC stack pid father child younger older\n"); #else printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); + " free sibling\n"); + printk(" task PC stack pid father child younger older\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -4797,11 +4920,16 @@ void show_state(void) * console might take alot of time: */ touch_nmi_watchdog(); - show_task(p); + if (p->state & state_filter) + show_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); - debug_show_all_locks(); + /* + * Only show locks if all tasks are dumped: + */ + if (state_filter == -1) + debug_show_all_locks(); } /** @@ -4812,7 +4940,7 @@ void show_state(void) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __devinit init_idle(struct task_struct *idle, int cpu) +void __cpuinit init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -4946,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) @@ -5023,7 +5151,10 @@ wait_to_die: } #ifdef CONFIG_HOTPLUG_CPU -/* Figure out where task on dead CPU should go, use force if neccessary. */ +/* + * Figure out where task on dead CPU should go, use force if neccessary. + * NOTE: interrupts should be disabled by the caller + */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { unsigned long flags; @@ -5143,6 +5274,7 @@ void idle_task_exit(void) mmdrop(mm); } +/* called under rq->lock with disabled interrupts */ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) { struct rq *rq = cpu_rq(dead_cpu); @@ -5151,7 +5283,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->flags & PF_DEAD); + BUG_ON(p->state == TASK_DEAD); get_task_struct(p); @@ -5159,10 +5291,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * Drop lock around migration; if someone else moves it, * that's OK. No task can be added to this CPU, so iteration is * fine. + * NOTE: interrupts should be left disabled --dev@ */ - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + spin_lock(&rq->lock); put_task_struct(p); } @@ -5272,9 +5405,11 @@ static struct notifier_block __cpuinitdata migration_notifier = { int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); + int err; /* Start one for the boot CPU: */ - migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); @@ -5313,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); break; } printk("span %s\n", str); if (!cpu_isset(cpu, sd->span)) - printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); if (!cpu_isset(cpu, group->cpumask)) - printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); printk(KERN_DEBUG); for (i = 0; i < level + 2; i++) @@ -5337,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!group->cpu_power) { printk("\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); } if (!cpus_weight(group->cpumask)) { @@ -5360,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk("\n"); if (!cpus_equal(sd->span, groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + printk(KERN_ERR "ERROR: groups don't span " + "domain->span\n"); level++; sd = sd->parent; + if (!sd) + continue; - if (sd) { - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); - } + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); } while (sd); } @@ -5385,7 +5526,9 @@ static int sd_degenerate(struct sched_domain *sd) if (sd->flags & (SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC)) { + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5419,7 +5562,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) pflags &= ~(SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC); + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); } if (~cflags & pflags) return 0; @@ -5441,12 +5586,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) struct sched_domain *parent = tmp->parent; if (!parent) break; - if (sd_parent_degenerate(tmp, parent)) + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + } } - if (sd && sd_degenerate(sd)) + if (sd && sd_degenerate(sd)) { sd = sd->parent; + if (sd) + sd->child = NULL; + } sched_domain_debug(sd, cpu); @@ -5454,7 +5605,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -5472,26 +5623,27 @@ static int __init isolated_cpu_setup(char *str) __setup ("isolcpus=", isolated_cpu_setup); /* - * init_sched_build_groups takes an array of groups, the cpumask we wish - * to span, and a pointer to a function which identifies what group a CPU - * belongs to. The return value of group_fn must be a valid index into the - * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we - * keep track of groups covered with a cpumask_t). + * init_sched_build_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS + * (due to the fact that we keep track of groups covered with a cpumask_t). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, - int (*group_fn)(int cpu)) +static void +init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, + int (*group_fn)(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; int i; for_each_cpu_mask(i, span) { - int group = group_fn(i); - struct sched_group *sg = &groups[group]; + struct sched_group *sg; + int group = group_fn(i, cpu_map, &sg); int j; if (cpu_isset(i, covered)) @@ -5501,7 +5653,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, sg->cpu_power = 0; for_each_cpu_mask(j, span) { - if (group_fn(j) != group) + if (group_fn(j, cpu_map, NULL) != group) continue; cpu_set(j, covered); @@ -5675,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size); */ static void touch_cache(void *__cache, unsigned long __size) { - unsigned long size = __size/sizeof(long), chunk1 = size/3, - chunk2 = 2*size/3; + unsigned long size = __size / sizeof(long); + unsigned long chunk1 = size / 3; + unsigned long chunk2 = 2 * size / 3; unsigned long *cache = __cache; int i; @@ -5785,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) */ measure_one(cache, size, cpu1, cpu2); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); measure_one(cache, size, cpu2, cpu1); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); /* * (We measure the non-migrating [cached] cost on both @@ -5799,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) measure_one(cache, size, cpu1, cpu1); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); measure_one(cache, size, cpu2, cpu2); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); /* * Get the per-iteration migration cost: */ - do_div(cost1, 2*ITERATIONS); - do_div(cost2, 2*ITERATIONS); + do_div(cost1, 2 * ITERATIONS); + do_div(cost2, 2 * ITERATIONS); return cost1 - cost2; } @@ -5847,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) */ cache = vmalloc(max_size); if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2*max_size); + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); return 1000000; /* return 1 msec on very small boxen */ } @@ -5872,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) avg_fluct = (avg_fluct + fluct)/2; if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " + "(%8Ld %8Ld)\n", cpu1, cpu2, size, (long)cost / 1000000, ((long)cost / 100000) % 10, @@ -5967,7 +6121,7 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) -1 #endif ); - if (system_state == SYSTEM_BOOTING) { + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { printk("migration_cost="); for (distance = 0; distance <= max_distance; distance++) { if (distance) @@ -5978,7 +6132,7 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) } j1 = jiffies; if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0)/HZ); + printk("migration: %ld seconds\n", (j1-j0) / HZ); /* * Move back to the original CPU. NUMA-Q gets confused @@ -6075,10 +6229,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); -static int cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } #endif @@ -6088,34 +6245,52 @@ static int cpu_to_cpu_group(int cpu) */ #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group *sched_group_core_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_core); #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int cpu_to_core_group(int cpu) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { - return first_cpu(cpu_sibling_map[cpu]); + int group; + cpumask_t mask = cpu_sibling_map[cpu]; + cpus_and(mask, mask, *cpu_map); + group = first_cpu(mask); + if (sg) + *sg = &per_cpu(sched_group_core, group); + return group; } #elif defined(CONFIG_SCHED_MC) -static int cpu_to_core_group(int cpu) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_core, cpu); return cpu; } #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_phys); -static int cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + int group; #ifdef CONFIG_SCHED_MC cpumask_t mask = cpu_coregroup_map(cpu); - return first_cpu(mask); + cpus_and(mask, mask, *cpu_map); + group = first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) - return first_cpu(cpu_sibling_map[cpu]); + cpumask_t mask = cpu_sibling_map[cpu]; + cpus_and(mask, mask, *cpu_map); + group = first_cpu(mask); #else - return cpu; + group = cpu; #endif + if (sg) + *sg = &per_cpu(sched_group_phys, group); + return group; } #ifdef CONFIG_NUMA @@ -6128,12 +6303,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu) +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { - return cpu_to_node(cpu); + cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); + int group; + + cpus_and(nodemask, nodemask, *cpu_map); + group = first_cpu(nodemask); + + if (sg) + *sg = &per_cpu(sched_group_allnodes, group); + return group; } + static void init_numa_sched_groups_power(struct sched_group *group_head) { struct sched_group *sg = group_head; @@ -6162,24 +6347,16 @@ next_sg: } #endif +#ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ static void free_sched_groups(const cpumask_t *cpu_map) { - int cpu; -#ifdef CONFIG_NUMA - int i; + int cpu, i; for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - if (!sched_group_nodes) continue; @@ -6204,19 +6381,63 @@ next_sg: kfree(sched_group_nodes); sched_group_nodes_bycpu[cpu] = NULL; } +} +#else +static void free_sched_groups(const cpumask_t *cpu_map) +{ +} #endif - for_each_cpu_mask(cpu, *cpu_map) { - if (sched_group_phys_bycpu[cpu]) { - kfree(sched_group_phys_bycpu[cpu]); - sched_group_phys_bycpu[cpu] = NULL; - } -#ifdef CONFIG_SCHED_MC - if (sched_group_core_bycpu[cpu]) { - kfree(sched_group_core_bycpu[cpu]); - sched_group_core_bycpu[cpu] = NULL; - } -#endif + +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. + * + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents + * the maximum number of tasks a group can handle in the presence of other idle + * or lightly loaded groups in the same sched domain. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_domain *child; + struct sched_group *group; + + WARN_ON(!sd || !sd->groups); + + if (cpu != first_cpu(sd->groups->cpumask)) + return; + + child = sd->child; + + /* + * For perf policy, if the groups in child domain share resources + * (for example cores sharing some portions of the cache hierarchy + * or SMT), then set this domain groups cpu_power such that each group + * can handle only one task, when there are other idle groups in the + * same sched domain. + */ + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && + (child->flags & + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { + sd->groups->cpu_power = SCHED_LOAD_SCALE; + return; } + + sd->groups->cpu_power = 0; + + /* + * add cpu_power of each child group to this groups cpu_power + */ + group = child->groups; + do { + sd->groups->cpu_power += group->cpu_power; + group = group->next; + } while (group != child->groups); } /* @@ -6226,13 +6447,10 @@ next_sg: static int build_sched_domains(const cpumask_t *cpu_map) { int i; - struct sched_group *sched_group_phys = NULL; -#ifdef CONFIG_SCHED_MC - struct sched_group *sched_group_core = NULL; -#endif + struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; - struct sched_group *sched_group_allnodes = NULL; + int sd_allnodes = 0; /* * Allocate the per-node list of sched groups @@ -6250,7 +6468,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { - int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); @@ -6259,25 +6476,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { - if (!sched_group_allnodes) { - sched_group_allnodes - = kmalloc(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL); - if (!sched_group_allnodes) { - printk(KERN_WARNING - "Can not alloc allnodes sched group\n"); - goto error; - } - sched_group_allnodes_bycpu[i] - = sched_group_allnodes; - } sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; - group = cpu_to_allnodes_group(i); - sd->groups = &sched_group_allnodes[group]; + cpu_to_allnodes_group(i, cpu_map, &sd->groups); p = sd; + sd_allnodes = 1; } else p = NULL; @@ -6285,61 +6489,40 @@ static int build_sched_domains(const cpumask_t *cpu_map) *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; + if (p) + p->child = sd; cpus_and(sd->span, sd->span, *cpu_map); #endif - if (!sched_group_phys) { - sched_group_phys - = kmalloc(sizeof(struct sched_group) * NR_CPUS, - GFP_KERNEL); - if (!sched_group_phys) { - printk (KERN_WARNING "Can not alloc phys sched" - "group\n"); - goto error; - } - sched_group_phys_bycpu[i] = sched_group_phys; - } - p = sd; sd = &per_cpu(phys_domains, i); - group = cpu_to_phys_group(i); *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; - sd->groups = &sched_group_phys[group]; + if (p) + p->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups); #ifdef CONFIG_SCHED_MC - if (!sched_group_core) { - sched_group_core - = kmalloc(sizeof(struct sched_group) * NR_CPUS, - GFP_KERNEL); - if (!sched_group_core) { - printk (KERN_WARNING "Can not alloc core sched" - "group\n"); - goto error; - } - sched_group_core_bycpu[i] = sched_group_core; - } - p = sd; sd = &per_cpu(core_domains, i); - group = cpu_to_core_group(i); *sd = SD_MC_INIT; sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; - sd->groups = &sched_group_core[group]; + p->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; - sd->groups = &sched_group_cpus[group]; + p->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups); #endif } @@ -6351,8 +6534,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_sibling_map)) continue; - init_sched_build_groups(sched_group_cpus, this_sibling_map, - &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); } #endif @@ -6363,8 +6545,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(sched_group_core, this_core_map, - &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); } #endif @@ -6377,15 +6558,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(nodemask)) continue; - init_sched_build_groups(sched_group_phys, nodemask, - &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sched_group_allnodes) - init_sched_build_groups(sched_group_allnodes, *cpu_map, - &cpu_to_allnodes_group); + if (sd_allnodes) + init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6457,82 +6636,30 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; sd = &per_cpu(cpu_domains, i); - sd->groups->cpu_power = SCHED_LOAD_SCALE; + init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - int power; - struct sched_domain *sd; sd = &per_cpu(core_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) - * SCHED_LOAD_SCALE / 10; - sd->groups->cpu_power = power; + init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; -#ifdef CONFIG_SCHED_MC sd = &per_cpu(phys_domains, i); - if (i != first_cpu(sd->groups->cpumask)) - continue; - - sd->groups->cpu_power = 0; - if (sched_mc_power_savings || sched_smt_power_savings) { - int j; - - for_each_cpu_mask(j, sd->groups->cpumask) { - struct sched_domain *sd1; - sd1 = &per_cpu(core_domains, j); - /* - * for each core we will add once - * to the group in physical domain - */ - if (j != first_cpu(sd1->groups->cpumask)) - continue; - - if (sched_smt_power_savings) - sd->groups->cpu_power += sd1->groups->cpu_power; - else - sd->groups->cpu_power += SCHED_LOAD_SCALE; - } - } else - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; -#else - int power; - sd = &per_cpu(phys_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; -#endif + init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - if (sched_group_allnodes) { - int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); - struct sched_group *sg = &sched_group_allnodes[group]; + if (sd_allnodes) { + struct sched_group *sg; + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); init_numa_sched_groups_power(sg); } #endif @@ -6556,9 +6683,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) return 0; +#ifdef CONFIG_NUMA error: free_sched_groups(cpu_map); return -ENOMEM; +#endif } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. @@ -6702,8 +6831,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, sched_smt_power_savings_store); #endif - -#ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing @@ -6736,15 +6863,23 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_OK; } -#endif void __init sched_init_smp(void) { + cpumask_t non_isolated_cpus; + lock_cpu_hotplug(); arch_init_sched_domains(&cpu_online_map); + cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map); + if (cpus_empty(non_isolated_cpus)) + cpu_set(smp_processor_id(), non_isolated_cpus); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); } #else void __init sched_init_smp(void) @@ -6803,6 +6938,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); +#endif + #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); #endif @@ -6837,6 +6976,7 @@ void __might_sleep(char *file, int line) " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", in_atomic(), irqs_disabled()); + debug_show_held_locks(current); dump_stack(); } #endif diff --git a/kernel/signal.c b/kernel/signal.c index bfdb5686fa3e..1921ffdc5e77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -23,6 +23,10 @@ #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/capability.h> +#include <linux/freezer.h> +#include <linux/pid_namespace.h> +#include <linux/nsproxy.h> + #include <asm/param.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -33,7 +37,7 @@ * SLAB caches for signal bits. */ -static kmem_cache_t *sigqueue_cachep; +static struct kmem_cache *sigqueue_cachep; /* * In POSIX a signal is sent either to a specific thread (Linux task) @@ -267,18 +271,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; + struct user_struct *user; - atomic_inc(&t->user->sigpending); + /* + * In order to avoid problems with "switch_user()", we want to make + * sure that the compiler doesn't re-load "t->user" + */ + user = t->user; + barrier(); + atomic_inc(&user->sigpending); if (override_rlimit || - atomic_read(&t->user->sigpending) <= + atomic_read(&user->sigpending) <= t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) q = kmem_cache_alloc(sigqueue_cachep, flags); if (unlikely(q == NULL)) { - atomic_dec(&t->user->sigpending); + atomic_dec(&user->sigpending); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = get_uid(t->user); + q->user = get_uid(user); } return(q); } @@ -417,9 +428,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info) { - int sig = 0; + int sig = next_signal(pending, mask); - sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { @@ -432,9 +442,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, if (!collect_signal(sig, pending, info)) sig = 0; - } - recalc_sigpending(); return sig; } @@ -451,6 +459,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); + recalc_sigpending_tsk(tsk); if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our @@ -577,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info, error = -EPERM; if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || - (current->signal->session != t->signal->session)) + (process_session(current) != process_session(t))) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) @@ -1057,28 +1066,44 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) } /* - * kill_pg_info() sends a signal to a process group: this is what the tty + * kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) */ -int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int retval, success; - if (pgrp <= 0) - return -EINVAL; - success = 0; retval = -ESRCH; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p); success |= !err; retval = err; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return success ? 0 : retval; } +int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) +{ + int retval; + + read_lock(&tasklist_lock); + retval = __kill_pgrp_info(sig, info, pgrp); + read_unlock(&tasklist_lock); + + return retval; +} + +int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +{ + if (pgrp <= 0) + return -EINVAL; + + return __kill_pgrp_info(sig, info, find_pid(pgrp)); +} + int kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { @@ -1091,8 +1116,7 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) return retval; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { int error; int acquired_tasklist_lock = 0; @@ -1103,7 +1127,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) read_lock(&tasklist_lock); acquired_tasklist_lock = 1; } - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); @@ -1113,8 +1137,17 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -/* like kill_proc_info(), but doesn't use uid/euid of "current" */ -int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, +static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) +{ + int error; + rcu_read_lock(); + error = kill_pid_info(sig, info, find_pid(pid)); + rcu_read_unlock(); + return error; +} + +/* like kill_pid_info(), but doesn't use uid/euid of "current" */ +int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, uid_t uid, uid_t euid, u32 secid) { int ret = -EINVAL; @@ -1124,7 +1157,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, return ret; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; @@ -1148,7 +1181,7 @@ out_unlock: read_unlock(&tasklist_lock); return ret; } -EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); +EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -1266,6 +1299,18 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pgrp); + +int kill_pid(struct pid *pid, int sig, int priv) +{ + return kill_pid_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pid); + int kill_pg(pid_t pgrp, int sig, int priv) { @@ -1835,8 +1880,12 @@ relock: if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; - /* Init gets no signals it doesn't want. */ - if (current == child_reaper) + /* + * Init of a pid space gets no signals it doesn't want from + * within that pid space. It can of course get signals from + * its parent pid space. + */ + if (current == child_reaper(current)) continue; if (sig_kernel_stop(signr)) { @@ -2577,6 +2626,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) } #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + void __init signals_init(void) { sigqueue_cachep = diff --git a/kernel/softirq.c b/kernel/softirq.c index 3789ca98197c..918e52df090e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - BUG_ON(per_cpu(tasklet_vec, hotcpu).list); - BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); @@ -612,7 +610,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = { __init int spawn_ksoftirqd(void) { void *cpu = (void *)(long)smp_processor_id(); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + + BUG_ON(err == NOTIFY_BAD); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 03e6a2b0b787..50afeb813305 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = { __init void spawn_softlockup_task(void) { void *cpu = (void *)(long)smp_processor_id(); + int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 9644a41e0bef..2c6c2bf85514 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -21,17 +21,6 @@ #include <linux/debug_locks.h> #include <linux/module.h> -/* - * Generic declaration of the raw read_trylock() function, - * architectures are supposed to optimize this: - */ -int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) -{ - __raw_read_lock(lock); - return 1; -} -EXPORT_SYMBOL(generic__raw_read_trylock); - int __lockfunc _spin_trylock(spinlock_t *lock) { preempt_disable(); @@ -226,7 +215,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - cpu_relax(); \ + _raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ @@ -248,7 +237,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ - cpu_relax(); \ + _raw_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ @@ -304,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_spin_lock_nested); +unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + /* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_spin_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#ifdef CONFIG_PROVE_SPIN_LOCKING + _raw_spin_lock(lock); +#else + _raw_spin_lock_flags(lock, &flags); +#endif + return flags; +} + +EXPORT_SYMBOL(_spin_lock_irqsave_nested); #endif diff --git a/kernel/srcu.c b/kernel/srcu.c new file mode 100644 index 000000000000..3507cabe963b --- /dev/null +++ b/kernel/srcu.c @@ -0,0 +1,258 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney <paulmck@us.ibm.com> + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/srcu.h> + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + sp->completed = 0; + mutex_init(&sp->mutex); + sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + return (sp->per_cpu_ref ? 0 : -ENOMEM); +} + +/* + * srcu_readers_active_idx -- returns approximate number of readers + * active on the specified rank of per-CPU counters. + */ + +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + int sum; + + sum = 0; + for_each_possible_cpu(cpu) + sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; + return sum; +} + +/** + * srcu_readers_active - returns approximate number of readers. + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +int srcu_readers_active(struct srcu_struct *sp) +{ + return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); +} + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + int sum; + + sum = srcu_readers_active(sp); + WARN_ON(sum); /* Leakage unless caller handles error. */ + if (sum != 0) + return; + free_percpu(sp->per_cpu_ref); + sp->per_cpu_ref = NULL; +} + +/** + * srcu_read_lock - register a new reader for an SRCU-protected structure. + * @sp: srcu_struct in which to register the new reader. + * + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + preempt_disable(); + idx = sp->completed & 0x1; + barrier(); /* ensure compiler looks -once- at sp->completed. */ + per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; + srcu_barrier(); /* ensure compiler won't misorder critical section. */ + preempt_enable(); + return idx; +} + +/** + * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. + * @sp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + preempt_disable(); + srcu_barrier(); /* ensure compiler won't misorder critical section. */ + per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; + preempt_enable(); +} + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchornize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + int idx; + + idx = sp->completed; + mutex_lock(&sp->mutex); + + /* + * Check to see if someone else did the work for us while we were + * waiting to acquire the lock. We need -two- advances of + * the counter, not just one. If there was but one, we might have + * shown up -after- our helper's first synchronize_sched(), thus + * having failed to prevent CPU-reordering races with concurrent + * srcu_read_unlock()s on other CPUs (see comment below). So we + * either (1) wait for two or (2) supply the second ourselves. + */ + + if ((sp->completed - idx) >= 2) { + mutex_unlock(&sp->mutex); + return; + } + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * The preceding synchronize_sched() ensures that any CPU that + * sees the new value of sp->completed will also see any preceding + * changes to data structures made by this CPU. This prevents + * some other CPU from reordering the accesses in its SRCU + * read-side critical section to precede the corresponding + * srcu_read_lock() -- ensuring that such references will in + * fact be protected. + * + * So it is now safe to do the flip. + */ + + idx = sp->completed & 0x1; + sp->completed++; + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * At this point, because of the preceding synchronize_sched(), + * all srcu_read_lock() calls using the old counters have completed. + * Their corresponding critical sections might well be still + * executing, but the srcu_read_lock() primitives themselves + * will have finished executing. + */ + + while (srcu_readers_active_idx(sp, idx)) + schedule_timeout_interruptible(1); + + synchronize_sched(); /* Force memory barrier on all CPUs. */ + + /* + * The preceding synchronize_sched() forces all srcu_read_unlock() + * primitives that were executing concurrently with the preceding + * for_each_possible_cpu() loop to have completed by this point. + * More importantly, it also forces the corresponding SRCU read-side + * critical sections to have also completed, and the corresponding + * references to SRCU-protected data items to be dropped. + * + * Note: + * + * Despite what you might think at first glance, the + * preceding synchronize_sched() -must- be within the + * critical section ended by the following mutex_unlock(). + * Otherwise, a task taking the early exit can race + * with a srcu_read_unlock(), which might have executed + * just before the preceding srcu_readers_active() check, + * and whose CPU might have reordered the srcu_read_unlock() + * with the preceding critical section. In this case, there + * is nothing preventing the synchronize_sched() task that is + * taking the early exit from freeing a data structure that + * is still being referenced (out of order) by the task + * doing the srcu_read_unlock(). + * + * Alternatively, the comparison with "2" on the early exit + * could be changed to "3", but this increases synchronize_srcu() + * latency for bulk loads. So the current code is preferred. + */ + + mutex_unlock(&sp->mutex); +} + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ + +long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->completed; +} + +EXPORT_SYMBOL_GPL(init_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(srcu_read_lock); +EXPORT_SYMBOL_GPL(srcu_read_unlock); +EXPORT_SYMBOL_GPL(synchronize_srcu); +EXPORT_SYMBOL_GPL(srcu_batches_completed); +EXPORT_SYMBOL_GPL(srcu_readers_active); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 51cacd111dbd..12458040e665 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,3 +1,6 @@ +/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. + * GPL v2 and any later version. + */ #include <linux/stop_machine.h> #include <linux/kthread.h> #include <linux/sched.h> diff --git a/kernel/sys.c b/kernel/sys.c index 3f894775488d..c7675c1bfdf2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -92,7 +92,8 @@ EXPORT_SYMBOL(fs_overflowgid); */ int C_A_D = 1; -int cad_pid = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); /* * Notifier list for kernel code which wants to be called @@ -152,7 +153,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, /* * Atomic notifier chain routines. Registration and unregistration - * use a mutex, and call_chain is synchronized by RCU (no locks). + * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** @@ -221,7 +222,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * of the last notifier function called. */ -int atomic_notifier_call_chain(struct atomic_notifier_head *nh, +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; @@ -400,6 +401,129 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +/* + * SRCU notifier chain routines. Registration and unregistration + * use a mutex, and call_chain is synchronized by SRCU (no locks). + */ + +/** + * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an SRCU notifier chain. + * Must be called in process context. + * + * Currently always returns zero. + */ + +int srcu_notifier_chain_register(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_register(&nh->head, n); + mutex_unlock(&nh->mutex); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); + +/** + * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from an SRCU notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. + */ +int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_unregister(&nh->head, n); + mutex_unlock(&nh->mutex); + synchronize_srcu(&nh->srcu); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); + +/** + * srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in a notifier chain in turn. The functions + * run in a process context, so they are allowed to block. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ + +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) +{ + int ret; + int idx; + + idx = srcu_read_lock(&nh->srcu); + ret = notifier_call_chain(&nh->head, val, v); + srcu_read_unlock(&nh->srcu, idx); + return ret; +} + +EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); + +/** + * srcu_init_notifier_head - Initialize an SRCU notifier head + * @nh: Pointer to head of the srcu notifier chain + * + * Unlike other sorts of notifier heads, SRCU notifier heads require + * dynamic initialization. Be sure to call this routine before + * calling any of the other SRCU notifier routines for this head. + * + * If an SRCU notifier head is deallocated, it must first be cleaned + * up by calling srcu_cleanup_notifier_head(). Otherwise the head's + * per-cpu data (used by the SRCU mechanism) will leak. + */ + +void srcu_init_notifier_head(struct srcu_notifier_head *nh) +{ + mutex_init(&nh->mutex); + if (init_srcu_struct(&nh->srcu) < 0) + BUG(); + nh->head = NULL; +} + +EXPORT_SYMBOL_GPL(srcu_init_notifier_head); + /** * register_reboot_notifier - Register function to be called at reboot time * @nb: Info about notifier function to be called @@ -607,12 +731,10 @@ static void kernel_restart_prepare(char *cmd) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - if (!cmd) { + if (!cmd) printk(KERN_EMERG "Restarting system.\n"); - } else { + else printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - } - printk(".\n"); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -628,9 +750,8 @@ static void kernel_kexec(void) #ifdef CONFIG_KEXEC struct kimage *image; image = xchg(&kexec_image, NULL); - if (!image) { + if (!image) return; - } kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); @@ -759,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user return 0; } -static void deferred_cad(void *dummy) +static void deferred_cad(struct work_struct *dummy) { kernel_restart(NULL); } @@ -771,15 +892,14 @@ static void deferred_cad(void *dummy) */ void ctrl_alt_del(void) { - static DECLARE_WORK(cad_work, deferred_cad, NULL); + static DECLARE_WORK(cad_work, deferred_cad); if (C_A_D) schedule_work(&cad_work); else - kill_proc(cad_pid, SIGINT, 1); + kill_cad_pid(SIGINT, 1); } - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -824,12 +944,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) (current->sgid == egid) || capable(CAP_SETGID)) new_egid = egid; - else { + else return -EPERM; - } } - if (new_egid != old_egid) - { + if (new_egid != old_egid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -858,19 +976,14 @@ asmlinkage long sys_setgid(gid_t gid) if (retval) return retval; - if (capable(CAP_SETGID)) - { - if(old_egid != gid) - { + if (capable(CAP_SETGID)) { + if (old_egid != gid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; - } - else if ((gid == current->gid) || (gid == current->sgid)) - { - if(old_egid != gid) - { + } else if ((gid == current->gid) || (gid == current->sgid)) { + if (old_egid != gid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -901,8 +1014,7 @@ static int set_user(uid_t new_ruid, int dumpclear) switch_uid(new_user); - if(dumpclear) - { + if (dumpclear) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -958,8 +1070,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) return -EAGAIN; - if (new_euid != old_euid) - { + if (new_euid != old_euid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -991,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) asmlinkage long sys_setuid(uid_t uid) { int old_euid = current->euid; - int old_ruid, old_suid, new_ruid, new_suid; + int old_ruid, old_suid, new_suid; int retval; retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); if (retval) return retval; - old_ruid = new_ruid = current->uid; + old_ruid = current->uid; old_suid = current->suid; new_suid = old_suid; @@ -1009,8 +1120,7 @@ asmlinkage long sys_setuid(uid_t uid) } else if ((uid != current->uid) && (uid != new_suid)) return -EPERM; - if (old_euid != uid) - { + if (old_euid != uid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1055,8 +1165,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) return -EAGAIN; } if (euid != (uid_t) -1) { - if (euid != current->euid) - { + if (euid != current->euid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1106,8 +1215,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) return -EPERM; } if (egid != (gid_t) -1) { - if (egid != current->egid) - { + if (egid != current->egid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1152,10 +1260,8 @@ asmlinkage long sys_setfsuid(uid_t uid) if (uid == current->uid || uid == current->euid || uid == current->suid || uid == current->fsuid || - capable(CAP_SETUID)) - { - if (uid != old_fsuid) - { + capable(CAP_SETUID)) { + if (uid != old_fsuid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1183,10 +1289,8 @@ asmlinkage long sys_setfsgid(gid_t gid) if (gid == current->gid || gid == current->egid || gid == current->sgid || gid == current->fsgid || - capable(CAP_SETGID)) - { - if (gid != old_fsgid) - { + capable(CAP_SETGID)) { + if (gid != old_fsgid) { current->mm->dumpable = suid_dumpable; smp_wmb(); } @@ -1277,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != group_leader->signal->session) + if (process_session(p) != process_session(group_leader)) goto out; err = -EACCES; if (p->did_exec) @@ -1293,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) goto out; if (pgid != pid) { - struct task_struct *p; + struct task_struct *g = + find_task_by_pid_type(PIDTYPE_PGID, pgid); - do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == group_leader->signal->session) - goto ok_pgid; - } while_each_task_pid(pgid, PIDTYPE_PGID, p); - goto out; + if (!g || process_session(g) != process_session(group_leader)) + goto out; } -ok_pgid: err = security_task_setpgid(p, pgid); if (err) goto out; @@ -1322,9 +1423,9 @@ out: asmlinkage long sys_getpgid(pid_t pid) { - if (!pid) { + if (!pid) return process_group(current); - } else { + else { int retval; struct task_struct *p; @@ -1354,9 +1455,9 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { - if (!pid) { - return current->signal->session; - } else { + if (!pid) + return process_session(current); + else { int retval; struct task_struct *p; @@ -1364,10 +1465,10 @@ asmlinkage long sys_getsid(pid_t pid) p = find_task_by_pid(pid); retval = -ESRCH; - if(p) { + if (p) { retval = security_task_getsid(p); if (!retval) - retval = p->signal->session; + retval = process_session(p); } read_unlock(&tasklist_lock); return retval; @@ -1380,7 +1481,6 @@ asmlinkage long sys_setsid(void) pid_t session; int err = -EPERM; - mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ @@ -1400,12 +1500,15 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(session, session); + + spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; group_leader->signal->tty_old_pgrp = 0; + spin_unlock(&group_leader->sighand->siglock); + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); - mutex_unlock(&tty_mutex); return err; } @@ -1432,9 +1535,9 @@ struct group_info *groups_alloc(int gidsetsize) group_info->nblocks = nblocks; atomic_set(&group_info->usage, 1); - if (gidsetsize <= NGROUPS_SMALL) { + if (gidsetsize <= NGROUPS_SMALL) group_info->blocks[0] = group_info->small_block; - } else { + else { for (i = 0; i < nblocks; i++) { gid_t *b; b = (void *)__get_free_page(GFP_USER); @@ -1490,7 +1593,7 @@ static int groups_to_user(gid_t __user *grouplist, /* fill a group_info from a user-space array - it must be allocated already */ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) - { +{ int i; int count = group_info->ngroups; @@ -1648,9 +1751,8 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) int in_group_p(gid_t grp) { int retval = 1; - if (grp != current->fsgid) { + if (grp != current->fsgid) retval = groups_search(current->group_info, grp); - } return retval; } @@ -1659,9 +1761,8 @@ EXPORT_SYMBOL(in_group_p); int in_egroup_p(gid_t grp) { int retval = 1; - if (grp != current->egid) { + if (grp != current->egid) retval = groups_search(current->group_info, grp); - } return retval; } @@ -1676,7 +1777,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name) int errno = 0; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1694,8 +1795,8 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.nodename, tmp, len); - system_utsname.nodename[len] = 0; + memcpy(utsname()->nodename, tmp, len); + utsname()->nodename[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1711,11 +1812,11 @@ asmlinkage long sys_gethostname(char __user *name, int len) if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(system_utsname.nodename); + i = 1 + strlen(utsname()->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, system_utsname.nodename, i)) + if (copy_to_user(name, utsname()->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1740,8 +1841,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.domainname, tmp, len); - system_utsname.domainname[len] = 0; + memcpy(utsname()->domainname, tmp, len); + utsname()->domainname[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1776,9 +1877,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); - if(x.rlim_cur > 0x7FFFFFFF) + if (x.rlim_cur > 0x7FFFFFFF) x.rlim_cur = 0x7FFFFFFF; - if(x.rlim_max > 0x7FFFFFFF) + if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; } @@ -2084,12 +2185,12 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, * padding */ unsigned long t0, t1; - get_user(t0, &cache->t0); - get_user(t1, &cache->t1); + get_user(t0, &cache->blob[0]); + get_user(t1, &cache->blob[1]); t0++; t1++; - put_user(t0, &cache->t0); - put_user(t1, &cache->t1); + put_user(t0, &cache->blob[0]); + put_user(t1, &cache->blob[1]); } return err ? -EFAULT : 0; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6991bece67e8..d7306d0f3dfc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -49,6 +49,7 @@ cond_syscall(compat_sys_get_robust_list); cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); +cond_syscall(sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); @@ -134,3 +135,9 @@ cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); +cond_syscall(compat_sys_migrate_pages); + +/* block-layer dependent */ +cond_syscall(sys_bdflush); +cond_syscall(sys_ioprio_set); +cond_syscall(sys_ioprio_get); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bcb3a181dbb2..130c5ec9ee0b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -52,6 +52,11 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_X86 +#include <asm/nmi.h> +#include <asm/stacktrace.h> +#endif + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -64,7 +69,6 @@ extern int sysrq_enabled; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; -extern int cad_pid; extern int pid_max; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; @@ -74,13 +78,6 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) -int unknown_nmi_panic; -int nmi_watchdog_enabled; -extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, - void __user *, size_t *, loff_t *); -#endif - /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; @@ -95,13 +92,10 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -extern size_t shm_ctlmax; -extern size_t shm_ctlall; -extern int shm_ctlmni; -extern int msg_ctlmax; -extern int msg_ctlmnb; -extern int msg_ctlmni; -extern int sem_ctls[]; +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif #ifdef __sparc__ @@ -137,10 +131,28 @@ extern int no_unaligned_warning; extern int max_lock_depth; #endif -static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, - ctl_table *, void **); -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +#ifdef CONFIG_SYSCTL_SYSCALL +static int parse_table(int __user *, int, void __user *, size_t __user *, + void __user *, size_t, ctl_table *); +#endif + +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); + +#ifdef CONFIG_SYSVIPC +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); +#endif + +#ifdef CONFIG_PROC_SYSCTL +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif static ctl_table root_table[]; static struct ctl_table_header root_table_header = @@ -163,15 +175,49 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +static void *get_uts(ctl_table *table, int write) +{ + char *which = table->data; +#ifdef CONFIG_UTS_NS + struct uts_namespace *uts_ns = current->nsproxy->uts_ns; + which = (which - (char *)&init_uts_ns) + (char *)uts_ns; +#endif + if (!write) + down_read(&uts_sem); + else + down_write(&uts_sem); + return which; +} + +static void put_uts(ctl_table *table, int write, void *which) +{ + if (!write) + up_read(&uts_sem); + else + up_write(&uts_sem); +} + +#ifdef CONFIG_SYSVIPC +static void *get_ipc(ctl_table *table, int write) +{ + char *which = table->data; + struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; + which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; + return which; +} +#else +#define get_ipc(T,W) ((T)->data) +#endif + /* /proc declarations: */ -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); static int proc_opensys(struct inode *, struct file *); -struct file_operations proc_sys_file_operations = { +const struct file_operations proc_sys_file_operations = { .open = proc_opensys, .read = proc_readsys, .write = proc_writesys, @@ -232,47 +278,47 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_OSTYPE, .procname = "ostype", - .data = system_utsname.sysname, - .maxlen = sizeof(system_utsname.sysname), + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_OSRELEASE, .procname = "osrelease", - .data = system_utsname.release, - .maxlen = sizeof(system_utsname.release), + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_VERSION, .procname = "version", - .data = system_utsname.version, - .maxlen = sizeof(system_utsname.version), + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_NODENAME, .procname = "hostname", - .data = system_utsname.nodename, - .maxlen = sizeof(system_utsname.nodename), + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_DOMAINNAME, .procname = "domainname", - .data = system_utsname.domainname, - .maxlen = sizeof(system_utsname.domainname), + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, - .proc_handler = &proc_doutsstring, - .strategy = &sysctl_string, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_uts_string, }, { .ctl_name = KERN_PANIC, @@ -294,7 +340,7 @@ static ctl_table kern_table[] = { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, - .maxlen = 64, + .maxlen = 128, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, @@ -432,58 +478,65 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = &shm_ctlmax, - .maxlen = sizeof (size_t), + .data = &init_ipc_ns.shm_ctlmax, + .maxlen = sizeof (init_ipc_ns.shm_ctlmax), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_ipc_doulongvec_minmax, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = &shm_ctlall, - .maxlen = sizeof (size_t), + .data = &init_ipc_ns.shm_ctlall, + .maxlen = sizeof (init_ipc_ns.shm_ctlall), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_ipc_doulongvec_minmax, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = &shm_ctlmni, - .maxlen = sizeof (int), + .data = &init_ipc_ns.shm_ctlmni, + .maxlen = sizeof (init_ipc_ns.shm_ctlmni), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = &msg_ctlmax, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmax, + .maxlen = sizeof (init_ipc_ns.msg_ctlmax), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = &msg_ctlmni, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmni, + .maxlen = sizeof (init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = &msg_ctlmnb, - .maxlen = sizeof (int), + .data = &init_ipc_ns.msg_ctlmnb, + .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = &sem_ctls, + .data = &init_ipc_ns.sem_ctls, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_ipc_dointvec, + .strategy = sysctl_ipc_data, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -496,14 +549,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_PROC_SYSCTL { .ctl_name = KERN_CADPID, .procname = "cad_pid", - .data = &cad_pid, + .data = NULL, .maxlen = sizeof (int), .mode = 0600, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_cad_pid, }, +#endif { .ctl_name = KERN_MAX_THREADS, .procname = "threads-max", @@ -657,6 +712,14 @@ static ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kstack_depth_to_print", + .data = &kstack_depth_to_print, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif #if defined(CONFIG_MMU) { @@ -927,17 +990,6 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif -#ifdef CONFIG_SWAP - { - .ctl_name = VM_SWAP_TOKEN_TIMEOUT, - .procname = "swap_token_timeout", - .data = &swap_token_default_timeout, - .maxlen = sizeof(swap_token_default_timeout), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, -#endif #ifdef CONFIG_NUMA { .ctl_name = VM_ZONE_RECLAIM_MODE, @@ -1166,12 +1218,13 @@ static void start_unregistering(struct ctl_table_header *p) void __init sysctl_init(void) { -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL register_proc_table(root_table, proc_sys_root, &root_table_header); init_irq_proc(); #endif } +#ifdef CONFIG_SYSCTL_SYSCALL int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1190,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol do { struct ctl_table_header *head = list_entry(tmp, struct ctl_table_header, ctl_entry); - void *context = NULL; if (!use_table(head)) continue; @@ -1198,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol spin_unlock(&sysctl_lock); error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, head->ctl_table, - &context); - kfree(context); + newval, newlen, head->ctl_table); spin_lock(&sysctl_lock); unuse_table(head); @@ -1225,6 +1275,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) unlock_kernel(); return error; } +#endif /* CONFIG_SYSCTL_SYSCALL */ /* * ctl_perm does NOT grant the superuser all rights automatically, because @@ -1251,10 +1302,11 @@ static inline int ctl_perm(ctl_table *table, int op) return test_perm(table->mode, op); } +#ifdef CONFIG_SYSCTL_SYSCALL static int parse_table(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, - ctl_table *table, void **context) + ctl_table *table) { int n; repeat: @@ -1262,7 +1314,9 @@ repeat: return -ENOTDIR; if (get_user(n, name)) return -EFAULT; - for ( ; table->ctl_name; table++) { + for ( ; table->ctl_name || table->procname; table++) { + if (!table->ctl_name) + continue; if (n == table->ctl_name || table->ctl_name == CTL_ANY) { int error; if (table->child) { @@ -1272,7 +1326,7 @@ repeat: error = table->strategy( table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); if (error) return error; } @@ -1283,7 +1337,7 @@ repeat: } error = do_sysctl_strategy(table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); return error; } } @@ -1294,7 +1348,7 @@ repeat: int do_sysctl_strategy (ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { int op = 0, rc; size_t len; @@ -1308,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table, if (table->strategy) { rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen, context); + newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1340,6 +1394,7 @@ int do_sysctl_strategy (ctl_table *table, } return 0; } +#endif /* CONFIG_SYSCTL_SYSCALL */ /** * register_sysctl_table - register a sysctl hierarchy @@ -1427,7 +1482,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, else list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); spin_unlock(&sysctl_lock); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL register_proc_table(table, proc_sys_root, tmp); #endif return tmp; @@ -1445,18 +1500,31 @@ void unregister_sysctl_table(struct ctl_table_header * header) might_sleep(); spin_lock(&sysctl_lock); start_unregistering(header); -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL unregister_proc_table(header->ctl_table, proc_sys_root); #endif spin_unlock(&sysctl_lock); kfree(header); } +#else /* !CONFIG_SYSCTL */ +struct ctl_table_header * register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + return NULL; +} + +void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +#endif /* CONFIG_SYSCTL */ + /* * /proc/sys support */ -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL /* Scan the sysctl entries in table and add them all into /proc */ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) @@ -1465,7 +1533,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, int len; mode_t mode; - for (; table->ctl_name; table++) { + for (; table->ctl_name || table->procname; table++) { /* Can't do anything without a proc name. */ if (!table->procname) continue; @@ -1512,7 +1580,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) { struct proc_dir_entry *de; - for (; table->ctl_name; table++) { + for (; table->ctl_name || table->procname; table++) { if (!(de = table->de)) continue; if (de->mode & S_IFDIR) { @@ -1547,7 +1615,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, size_t count, loff_t *ppos) { int op; - struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); + struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); struct ctl_table *table; size_t res; ssize_t error = -ENOTDIR; @@ -1607,32 +1675,15 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf, return do_rw_proc(1, file, (char __user *) buf, count, ppos); } -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int _proc_do_string(void* data, int maxlen, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) { size_t len; char __user *p; char c; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; @@ -1648,20 +1699,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, break; len++; } - if (len >= table->maxlen) - len = table->maxlen-1; - if(copy_from_user(table->data, buffer, len)) + if (len >= maxlen) + len = maxlen-1; + if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) table->data)[len] = 0; + ((char *) data)[len] = 0; *ppos += *lenp; } else { - len = strlen(table->data); - if (len > table->maxlen) - len = table->maxlen; + len = strlen(data); + if (len > maxlen) + len = maxlen; if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, table->data, len)) + if(copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { if(put_user('\n', ((char __user *) buffer) + len)) @@ -1674,25 +1725,44 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return 0; } +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return _proc_do_string(table->data, table->maxlen, write, filp, + buffer, lenp, ppos); +} + /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ - -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int r; - - if (!write) { - down_read(&uts_sem); - r=proc_dostring(table,0,filp,buffer,lenp, ppos); - up_read(&uts_sem); - } else { - down_write(&uts_sem); - r=proc_dostring(table,1,filp,buffer,lenp, ppos); - up_write(&uts_sem); - } + void *which; + which = get_uts(table, write); + r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); + put_uts(table, write, which); return r; } @@ -1715,8 +1785,9 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos, +static int __do_proc_dointvec(void *tbl_data, ctl_table *table, + int write, struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -1729,13 +1800,13 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (int *) table->data; + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -1765,7 +1836,7 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, p = buf; if (*p == '-' && left > 1) { neg = 1; - left--, p++; + p++; } if (*p < '0' || *p > '9') break; @@ -1824,6 +1895,16 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, #undef TMPBUFLEN } +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, filp, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1847,9 +1928,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, #define OP_SET 0 #define OP_AND 1 -#define OP_OR 2 -#define OP_MAX 3 -#define OP_MIN 4 static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1861,13 +1939,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, switch(op) { case OP_SET: *valp = val; break; case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - case OP_MAX: if(*valp < val) - *valp = val; - break; - case OP_MIN: if(*valp > val) - *valp = val; - break; } } else { int val = *valp; @@ -1895,7 +1966,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, return -EPERM; } - op = (current->pid == 1) ? OP_SET : OP_AND; + op = is_init(current) ? OP_SET : OP_AND; return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, do_proc_dointvec_bset_conv,&op); } @@ -1957,7 +2028,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, do_proc_dointvec_minmax_conv, ¶m); } -static int do_proc_doulongvec_minmax(ctl_table *table, int write, +static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, @@ -1971,13 +2042,13 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (unsigned long *) table->data; + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); @@ -2006,7 +2077,7 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, p = buf; if (*p == '-' && left > 1) { neg = 1; - left--, p++; + p++; } if (*p < '0' || *p > '9') break; @@ -2062,6 +2133,17 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, #undef TMPBUFLEN } +static int do_proc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + filp, buffer, lenp, ppos, convmul, convdiv); +} + /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table @@ -2250,6 +2332,49 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, do_proc_dointvec_ms_jiffies_conv, NULL); } +#ifdef CONFIG_SYSVIPC +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *which; + which = get_ipc(table, write); + return __do_proc_dointvec(which, table, write, filp, buffer, + lenp, ppos, NULL, NULL); +} + +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *which; + which = get_ipc(table, write); + return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, + lenp, ppos, 1l, 1l); +} + +#endif + +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp; + int r; + + tmp = pid_nr(cad_pid); + + r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + lenp, ppos, NULL, NULL); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + #else /* CONFIG_PROC_FS */ int proc_dostring(ctl_table *table, int write, struct file *filp, @@ -2258,11 +2383,30 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return -ENOSYS; } -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) { return -ENOSYS; } +#endif int proc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2318,6 +2462,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SYSCTL_SYSCALL /* * General sysctl support routines */ @@ -2325,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, /* The generic string strategy routine: */ int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (!table->data || !table->maxlen) return -ENOTDIR; @@ -2371,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, */ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (newval && newlen) { @@ -2407,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, /* Strategy function to convert jiffies to seconds */ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (oldval) { size_t olen; @@ -2435,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, /* Strategy function to convert jiffies to seconds */ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { if (oldval) { size_t olen; @@ -2460,109 +2605,141 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return 1; } -#else /* CONFIG_SYSCTL */ - - -asmlinkage long sys_sysctl(struct __sysctl_args __user *args) -{ - return -ENOSYS; -} -int sysctl_string(ctl_table *table, int __user *name, int nlen, +/* The generic string strategy routine: */ +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) -{ - return -ENOSYS; + void __user *newval, size_t newlen) +{ + struct ctl_table uts_table; + int r, write; + write = newval && newlen; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = sysctl_string(&uts_table, name, nlen, + oldval, oldlenp, newval, newlen); + put_uts(table, write, uts_table.data); + return r; } -int sysctl_intvec(ctl_table *table, int __user *name, int nlen, +#ifdef CONFIG_SYSVIPC +/* The generic sysctl ipc data routine. */ +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) + void __user *newval, size_t newlen) { - return -ENOSYS; -} + size_t len; + void *data; -int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) -{ - return -ENOSYS; -} + /* Get out of I don't have a variable */ + if (!table->data || !table->maxlen) + return -ENOTDIR; -int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, void **context) -{ - return -ENOSYS; -} + data = get_ipc(table, 1); + if (!data) + return -ENOTDIR; -int proc_dostring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, data, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } -int proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} + if (newval && newlen) { + if (newlen > table->maxlen) + newlen = table->maxlen; -int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; + if (copy_from_user(data, newval, newlen)) + return -EFAULT; + } + return 1; } +#endif -int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} +#else /* CONFIG_SYSCTL_SYSCALL */ -int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) + +asmlinkage long sys_sysctl(struct __sysctl_args __user *args) { + static int msg_count; + struct __sysctl_args tmp; + int name[CTL_MAXNAME]; + int i; + + /* Read in the sysctl name for better debug message logging */ + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME) + return -ENOTDIR; + for (i = 0; i < tmp.nlen; i++) + if (get_user(name[i], tmp.name + i)) + return -EFAULT; + + /* Ignore accesses to kernel.version */ + if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION)) + goto out; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the removed sysctl " + "system call with ", current->comm); + for (i = 0; i < tmp.nlen; i++) + printk("%d.", name[i]); + printk("\n"); + } +out: return -ENOSYS; } -int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_intvec(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) { return -ENOSYS; } -int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) { return -ENOSYS; } -int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, - struct file *filp, - void __user *buffer, - size_t *lenp, loff_t *ppos) +int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) { - return -ENOSYS; + return -ENOSYS; } -struct ctl_table_header * register_sysctl_table(ctl_table * table, - int insert_at_head) +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) { - return NULL; + return -ENOSYS; } - -void unregister_sysctl_table(struct ctl_table_header * table) +static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) { + return -ENOSYS; } - -#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SYSCTL_SYSCALL */ /* * No sense putting this after each symbol definition, twice, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 2ed4040d0dc5..4c3476fa058d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -18,7 +18,9 @@ #include <linux/kernel.h> #include <linux/taskstats_kern.h> +#include <linux/tsacct_kern.h> #include <linux/delayacct.h> +#include <linux/tsacct_kern.h> #include <linux/cpumask.h> #include <linux/percpu.h> #include <net/genetlink.h> @@ -32,7 +34,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; -kmem_cache_t *taskstats_cache; +struct kmem_cache *taskstats_cache; static struct genl_family family = { .id = GENL_ID_GENERATE, @@ -67,7 +69,7 @@ enum actions { }; static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, - void **replyp, size_t size) + size_t size) { struct sk_buff *skb; void *reply; @@ -75,7 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * If new attributes are added, please revisit this allocation */ - skb = nlmsg_new(size, GFP_KERNEL); + skb = genlmsg_new(size, GFP_KERNEL); if (!skb) return -ENOMEM; @@ -83,20 +85,15 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, int seq = get_cpu_var(taskstats_seqnum)++; put_cpu_var(taskstats_seqnum); - reply = genlmsg_put(skb, 0, seq, - family.id, 0, 0, - cmd, family.version); + reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); } else - reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, - family.id, 0, 0, - cmd, family.version); + reply = genlmsg_put_reply(skb, info, &family, 0, cmd); if (reply == NULL) { nlmsg_free(skb); return -EINVAL; } *skbp = skb; - *replyp = reply; return 0; } @@ -121,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid) /* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ -static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +static void send_cpu_listeners(struct sk_buff *skb, + struct listener_list *listeners) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); - struct listener_list *listeners; struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); @@ -137,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } rc = 0; - listeners = &per_cpu(listener_array, cpu); down_read(&listeners->sem); list_for_each_entry(s, &listeners->list, list) { skb_next = NULL; @@ -172,24 +168,23 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) up_write(&listeners->sem); } -static int fill_pid(pid_t pid, struct task_struct *pidtsk, +static int fill_pid(pid_t pid, struct task_struct *tsk, struct taskstats *stats) { int rc = 0; - struct task_struct *tsk = pidtsk; - if (!pidtsk) { - read_lock(&tasklist_lock); + if (!tsk) { + rcu_read_lock(); tsk = find_task_by_pid(pid); - if (!tsk) { - read_unlock(&tasklist_lock); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) return -ESRCH; - } - get_task_struct(tsk); - read_unlock(&tasklist_lock); } else get_task_struct(tsk); + memset(stats, 0, sizeof(*stats)); /* * Each accounting subsystem adds calls to its functions to * fill in relevant parts of struct taskstsats as follows @@ -198,7 +193,13 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, */ delayacct_add_tsk(stats, tsk); + + /* fill in basic acct fields */ stats->version = TASKSTATS_VERSION; + bacct_add_tsk(stats, tsk); + + /* fill in extended acct fields */ + xacct_add_tsk(stats, tsk); /* Define err: label here if needed */ put_task_struct(tsk); @@ -206,39 +207,32 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, } -static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, +static int fill_tgid(pid_t tgid, struct task_struct *first, struct taskstats *stats) { - struct task_struct *tsk, *first; + struct task_struct *tsk; unsigned long flags; + int rc = -ESRCH; /* * Add additional stats from live tasks except zombie thread group * leaders who are already counted with the dead tasks */ - first = tgidtsk; - if (!first) { - read_lock(&tasklist_lock); + rcu_read_lock(); + if (!first) first = find_task_by_pid(tgid); - if (!first) { - read_unlock(&tasklist_lock); - return -ESRCH; - } - get_task_struct(first); - read_unlock(&tasklist_lock); - } else - get_task_struct(first); - /* Start with stats from dead tasks */ - spin_lock_irqsave(&first->signal->stats_lock, flags); + if (!first || !lock_task_sighand(first, &flags)) + goto out; + if (first->signal->stats) memcpy(stats, first->signal->stats, sizeof(*stats)); - spin_unlock_irqrestore(&first->signal->stats_lock, flags); + else + memset(stats, 0, sizeof(*stats)); tsk = first; - read_lock(&tasklist_lock); do { - if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + if (tsk->exit_state) continue; /* * Accounting subsystem can call its functions here to @@ -249,15 +243,18 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, delayacct_add_tsk(stats, tsk); } while_each_thread(first, tsk); - read_unlock(&tasklist_lock); - stats->version = TASKSTATS_VERSION; + unlock_task_sighand(first, &flags); + rc = 0; +out: + rcu_read_unlock(); + + stats->version = TASKSTATS_VERSION; /* * Accounting subsytems can also add calls here to modify * fields of taskstats. */ - - return 0; + return rc; } @@ -265,7 +262,7 @@ static void fill_tgid_exit(struct task_struct *tsk) { unsigned long flags; - spin_lock_irqsave(&tsk->signal->stats_lock, flags); + spin_lock_irqsave(&tsk->sighand->siglock, flags); if (!tsk->signal->stats) goto ret; @@ -277,7 +274,7 @@ static void fill_tgid_exit(struct task_struct *tsk) */ delayacct_add_tsk(tsk->signal->stats, tsk); ret: - spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); return; } @@ -348,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask) return ret; } +static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) +{ + struct nlattr *na, *ret; + int aggr; + + aggr = (type == TASKSTATS_TYPE_PID) + ? TASKSTATS_TYPE_AGGR_PID + : TASKSTATS_TYPE_AGGR_TGID; + + na = nla_nest_start(skb, aggr); + if (!na) + goto err; + if (nla_put(skb, type, sizeof(pid), &pid) < 0) + goto err; + ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); + if (!ret) + goto err; + nla_nest_end(skb, na); + + return nla_data(ret); +err: + return NULL; +} + static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; struct sk_buff *rep_skb; - struct taskstats stats; - void *reply; + struct taskstats *stats; size_t size; - struct nlattr *na; cpumask_t mask; rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); @@ -376,146 +395,122 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); - memset(&stats, 0, sizeof(stats)); - rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) return rc; + rc = -EINVAL; if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); - rc = fill_pid(pid, NULL, &stats); - if (rc < 0) + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); + if (!stats) goto err; - na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); - NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); - NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - stats); + rc = fill_pid(pid, NULL, stats); + if (rc < 0) + goto err; } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); - rc = fill_tgid(tgid, NULL, &stats); - if (rc < 0) + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); + if (!stats) goto err; - na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); - NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); - NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - stats); - } else { - rc = -EINVAL; + rc = fill_tgid(tgid, NULL, stats); + if (rc < 0) + goto err; + } else goto err; - } - - nla_nest_end(rep_skb, na); return send_reply(rep_skb, info->snd_pid); - -nla_put_failure: - return genlmsg_cancel(rep_skb, reply); err: nlmsg_free(rep_skb); return rc; } -void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { - struct listener_list *listeners; - struct taskstats *tmp; - /* - * This is the cpu on which the task is exiting currently and will - * be the one for which the exit event is sent, even if the cpu - * on which this function is running changes later. - */ - *mycpu = raw_smp_processor_id(); + struct signal_struct *sig = tsk->signal; + struct taskstats *stats; - *ptidstats = NULL; - tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); - if (!tmp) - return; + if (sig->stats || thread_group_empty(tsk)) + goto ret; - listeners = &per_cpu(listener_array, *mycpu); - down_read(&listeners->sem); - if (!list_empty(&listeners->list)) { - *ptidstats = tmp; - tmp = NULL; + /* No problem if kmem_cache_zalloc() fails */ + stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); + + spin_lock_irq(&tsk->sighand->siglock); + if (!sig->stats) { + sig->stats = stats; + stats = NULL; } - up_read(&listeners->sem); - kfree(tmp); + spin_unlock_irq(&tsk->sighand->siglock); + + if (stats) + kmem_cache_free(taskstats_cache, stats); +ret: + return sig->stats; } /* Send pid data out on exit */ -void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead, unsigned int mycpu) +void taskstats_exit(struct task_struct *tsk, int group_dead) { int rc; + struct listener_list *listeners; + struct taskstats *stats; struct sk_buff *rep_skb; - void *reply; size_t size; int is_thread_group; - struct nlattr *na; - unsigned long flags; - if (!family_registered || !tidstats) + if (!family_registered) return; - spin_lock_irqsave(&tsk->signal->stats_lock, flags); - is_thread_group = tsk->signal->stats ? 1 : 0; - spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); - - rc = 0; /* * Size includes space for nested attributes */ size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); - if (is_thread_group) - size = 2 * size; /* PID + STATS + TGID + STATS */ + is_thread_group = !!taskstats_tgid_alloc(tsk); + if (is_thread_group) { + /* PID + STATS + TGID + STATS */ + size = 2 * size; + /* fill the tsk->signal->stats structure */ + fill_tgid_exit(tsk); + } - rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); - if (rc < 0) - goto ret; + listeners = &__raw_get_cpu_var(listener_array); + if (list_empty(&listeners->list)) + return; - rc = fill_pid(tsk->pid, tsk, tidstats); + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) - goto err_skb; + return; - na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); - NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); - NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - *tidstats); - nla_nest_end(rep_skb, na); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); + if (!stats) + goto err; - if (!is_thread_group) - goto send; + rc = fill_pid(tsk->pid, tsk, stats); + if (rc < 0) + goto err; /* - * tsk has/had a thread group so fill the tsk->signal->stats structure * Doesn't matter if tsk is the leader or the last group member leaving */ - - fill_tgid_exit(tsk); - if (!group_dead) + if (!is_thread_group || !group_dead) goto send; - na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); - NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); - /* No locking needed for tsk->signal->stats since group is dead */ - NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - *tsk->signal->stats); - nla_nest_end(rep_skb, na); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); + if (!stats) + goto err; + + memcpy(stats, tsk->signal->stats, sizeof(*stats)); send: - send_cpu_listeners(rep_skb, mycpu); + send_cpu_listeners(rep_skb, listeners); return; - -nla_put_failure: - genlmsg_cancel(rep_skb, reply); - goto ret; -err_skb: +err: nlmsg_free(rep_skb); -ret: - return; } static struct genl_ops taskstats_ops = { diff --git a/kernel/time.c b/kernel/time.c index 5bd489747643..0e017bff4c19 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -202,179 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -/* we call this to notify the arch when the clock is being - * controlled. If no such arch routine, do nothing. - */ -void __attribute__ ((weak)) notify_arch_cmos_timer(void) -{ - return; -} - -/* adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. - */ -int do_adjtimex(struct timex *txc) -{ - long ltemp, mtemp, save_adjust; - int result; - - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - /* singleshot must not be used with any other mode bits */ - if (txc->modes != ADJ_OFFSET_SINGLESHOT) - return -EINVAL; - - if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) - /* adjustment Offset limited to +- .512 seconds */ - if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) - return -EINVAL; - - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - result = time_state; /* mostly `TIME_OK' */ - - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_next_adjust ? time_next_adjust : time_adjust; - -#if 0 /* STA_CLOCKERR is never set yet */ - time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ -#endif - /* If there are input parameters, then process them */ - if (txc->modes) - { - if (txc->modes & ADJ_STATUS) /* only set allowed bits */ - time_status = (txc->status & ~STA_RONLY) | - (time_status & STA_RONLY); - - if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ - if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { - result = -EINVAL; - goto leave; - } - time_freq = txc->freq; - } - - if (txc->modes & ADJ_MAXERROR) { - if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_maxerror = txc->maxerror; - } - - if (txc->modes & ADJ_ESTERROR) { - if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_esterror = txc->esterror; - } - - if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ - if (txc->constant < 0) { /* NTP v4 uses values > 6 */ - result = -EINVAL; - goto leave; - } - time_constant = txc->constant; - } - - if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ - if (txc->modes == ADJ_OFFSET_SINGLESHOT) { - /* adjtime() is independent from ntp_adjtime() */ - if ((time_next_adjust = txc->offset) == 0) - time_adjust = 0; - } - else if (time_status & STA_PLL) { - ltemp = txc->offset; - - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ - if (ltemp > MAXPHASE) - time_offset = MAXPHASE << SHIFT_UPDATE; - else if (ltemp < -MAXPHASE) - time_offset = -(MAXPHASE << SHIFT_UPDATE); - else - time_offset = ltemp << SHIFT_UPDATE; - - /* - * Select whether the frequency is to be controlled - * and in which mode (PLL or FLL). Clamp to the operating - * range. Ugly multiply/divide should be replaced someday. - */ - - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; - if (time_status & STA_FLL) { - if (mtemp >= MINSEC) { - ltemp = (time_offset / mtemp) << (SHIFT_USEC - - SHIFT_UPDATE); - time_freq += shift_right(ltemp, SHIFT_KH); - } else /* calibration interval too short (p. 12) */ - result = TIME_ERROR; - } else { /* PLL mode */ - if (mtemp < MAXSEC) { - ltemp *= mtemp; - time_freq += shift_right(ltemp,(time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC)); - } else /* calibration interval too long (p. 12) */ - result = TIME_ERROR; - } - time_freq = min(time_freq, time_tolerance); - time_freq = max(time_freq, -time_tolerance); - } /* STA_PLL */ - } /* txc->modes & ADJ_OFFSET */ - if (txc->modes & ADJ_TICK) { - tick_usec = txc->tick; - tick_nsec = TICK_USEC_TO_NSEC(tick_usec); - } - } /* txc->modes */ -leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) - result = TIME_ERROR; - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset, SHIFT_UPDATE); - } - txc->freq = time_freq; - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; - txc->precision = time_precision; - txc->tolerance = time_tolerance; - txc->tick = tick_usec; - - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; - write_sequnlock_irq(&xtime_lock); - do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); - return(result); -} - asmlinkage long sys_adjtimex(struct timex __user *txc_p) { struct timex txc; /* Local copy of parameter */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e1dfd8e86cce..61a3907d16fb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1 +1 @@ -obj-y += clocksource.o jiffies.o +obj-y += ntp.o clocksource.o jiffies.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 74eca5939bd9..22504afc0d34 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c) /* check if clocksource is already registered */ if (is_registered_source(c)) { printk("register_clocksource: Cannot register %s. " - "Already registered!", c->name); + "Already registered!", c->name); ret = -EBUSY; } else { /* register it */ @@ -186,6 +186,7 @@ void clocksource_reselect(void) } EXPORT_SYMBOL(clocksource_reselect); +#ifdef CONFIG_SYSFS /** * sysfs_show_current_clocksources - sysfs interface for current clocksource * @dev: unused @@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) * Sysfs setup bits: */ static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, - sysfs_override_clocksource); + sysfs_override_clocksource); static SYSDEV_ATTR(available_clocksource, 0600, - sysfs_show_available_clocksources, NULL); + sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { set_kset_name("clocksource"), @@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void) } device_initcall(init_clocksource_sysfs); +#endif /* CONFIG_SYSFS */ /** * boot_override_clocksource - boot clock override diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 126bb30c4afe..a99b2a6e6a07 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -57,7 +57,7 @@ static cycle_t jiffies_read(void) struct clocksource clocksource_jiffies = { .name = "jiffies", - .rating = 0, /* lowest rating*/ + .rating = 1, /* lowest valid rating*/ .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c new file mode 100644 index 000000000000..3afeaa3a73f9 --- /dev/null +++ b/kernel/time/ntp.c @@ -0,0 +1,350 @@ +/* + * linux/kernel/time/ntp.c + * + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. + */ + +#include <linux/mm.h> +#include <linux/time.h> +#include <linux/timex.h> + +#include <asm/div64.h> +#include <asm/timex.h> + +/* + * Timekeeping variables + */ +unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ +unsigned long tick_nsec; /* ACTHZ period (nsec) */ +static u64 tick_length, tick_length_base; + +#define MAX_TICKADJ 500 /* microsecs */ +#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ + TICK_LENGTH_SHIFT) / HZ) + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +static int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +static long time_offset; /* time adjustment (ns) */ +static long time_constant = 2; /* pll time constant */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +long time_freq; /* frequency offset (scaled ppm)*/ +static long time_reftime; /* time at last adjustment (s) */ +long time_adjust; + +#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) +#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \ + (s64)CLOCK_TICK_RATE) + +static void ntp_update_frequency(void) +{ + tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; + tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + + do_div(tick_length_base, HZ); + + tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; +} + +/** + * ntp_clear - Clears the NTP state variables + * + * Must be called while holding a write on the xtime_lock + */ +void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + + tick_length = tick_length_base; + time_offset = 0; +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + */ +void second_overflow(void) +{ + long time_adj; + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + + /* + * Compute the phase adjustment for the next second. The offset is + * reduced by a fixed factor times the time constant. + */ + tick_length = tick_length_base; + time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); + time_offset -= time_adj; + tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); + + if (unlikely(time_adjust)) { + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + } else if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + } else { + tick_length += (s64)(time_adjust * NSEC_PER_USEC / + HZ) << TICK_LENGTH_SHIFT; + time_adjust = 0; + } + } +} + +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + return tick_length; +} + + +void __attribute__ ((weak)) notify_arch_cmos_timer(void) +{ + return; +} + +/* adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int do_adjtimex(struct timex *txc) +{ + long ltemp, mtemp, save_adjust; + s64 freq_adj, temp64; + int result; + + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* Now we validate the data before disabling interrupts */ + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + /* singleshot must not be used with any other mode bits */ + if (txc->modes != ADJ_OFFSET_SINGLESHOT) + return -EINVAL; + + if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) + /* adjustment Offset limited to +- .512 seconds */ + if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) + return -EINVAL; + + /* if the quartz is off by more than 10% something is VERY wrong ! */ + if (txc->modes & ADJ_TICK) + if (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ) + return -EINVAL; + + write_seqlock_irq(&xtime_lock); + result = time_state; /* mostly `TIME_OK' */ + + /* Save for later - semantics of adjtime is to return old value */ + save_adjust = time_adjust; + +#if 0 /* STA_CLOCKERR is never set yet */ + time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ +#endif + /* If there are input parameters, then process them */ + if (txc->modes) + { + if (txc->modes & ADJ_STATUS) /* only set allowed bits */ + time_status = (txc->status & ~STA_RONLY) | + (time_status & STA_RONLY); + + if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ + if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { + result = -EINVAL; + goto leave; + } + time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); + } + + if (txc->modes & ADJ_MAXERROR) { + if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_maxerror = txc->maxerror; + } + + if (txc->modes & ADJ_ESTERROR) { + if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { + result = -EINVAL; + goto leave; + } + time_esterror = txc->esterror; + } + + if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ + if (txc->constant < 0) { /* NTP v4 uses values > 6 */ + result = -EINVAL; + goto leave; + } + time_constant = min(txc->constant + 4, (long)MAXTC); + } + + if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ + if (txc->modes == ADJ_OFFSET_SINGLESHOT) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + } + else if (time_status & STA_PLL) { + ltemp = txc->offset * NSEC_PER_USEC; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC); + time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC); + + /* + * Select whether the frequency is to be controlled + * and in which mode (PLL or FLL). Clamp to the operating + * range. Ugly multiply/divide should be replaced someday. + */ + + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = xtime.tv_sec; + mtemp = xtime.tv_sec - time_reftime; + time_reftime = xtime.tv_sec; + + freq_adj = (s64)time_offset * mtemp; + freq_adj = shift_right(freq_adj, time_constant * 2 + + (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); + if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { + temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL); + if (time_offset < 0) { + temp64 = -temp64; + do_div(temp64, mtemp); + freq_adj -= temp64; + } else { + do_div(temp64, mtemp); + freq_adj += temp64; + } + } + freq_adj += time_freq; + freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); + time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); + time_offset = (time_offset / HZ) << SHIFT_UPDATE; + } /* STA_PLL */ + } /* txc->modes & ADJ_OFFSET */ + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); + } /* txc->modes */ +leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) + result = TIME_ERROR; + + if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) + txc->offset = save_adjust; + else + txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; + txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = 1; + txc->tolerance = MAXFREQ; + txc->tick = tick_usec; + + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; + write_sequnlock_irq(&xtime_lock); + do_gettimeofday(&txc->time); + notify_arch_cmos_timer(); + return(result); +} diff --git a/kernel/timer.c b/kernel/timer.c index 1d7dd6267c2d..0256ab443d8a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -41,12 +41,6 @@ #include <asm/timex.h> #include <asm/io.h> -#ifdef CONFIG_TIME_INTERPOLATION -static void time_interpolator_update(long delta_nsec); -#else -#define time_interpolator_update(x) -#endif - u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -86,6 +80,138 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + */ + if (rem < HZ/4) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + if (j <= jiffies) /* rounding ate our timeout entirely; */ + return original; + return j; +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + /* + * In theory the following code can skip a jiffy in case jiffies + * increments right between the addition and the later subtraction. + * However since the entire point of this function is to use approximate + * timeouts, it's entirely ok to not handle that. + */ + return __round_jiffies(j + jiffies, cpu) - jiffies; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return __round_jiffies(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the "j" parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + + static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { @@ -136,7 +262,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } -/*** +/** * init_timer - initialize a timer. * @timer: the timer to be initialized * @@ -175,6 +301,7 @@ static inline void detach_timer(struct timer_list *timer, */ static tvec_base_t *lock_timer_base(struct timer_list *timer, unsigned long *flags) + __acquires(timer->base->lock) { tvec_base_t *base; @@ -235,7 +362,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(__mod_timer); -/*** +/** * add_timer_on - start a timer on a particular CPU * @timer: the timer to be added * @cpu: the CPU to start it on @@ -255,9 +382,10 @@ void add_timer_on(struct timer_list *timer, int cpu) } -/*** +/** * mod_timer - modify a timer's timeout * @timer: the timer to be modified + * @expires: new timeout in jiffies * * mod_timer is a more efficient way to update the expire field of an * active timer (if the timer is inactive it will be activated) @@ -291,7 +419,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(mod_timer); -/*** +/** * del_timer - deactive a timer. * @timer: the timer to be deactivated * @@ -323,7 +451,10 @@ int del_timer(struct timer_list *timer) EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP -/* +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del + * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. * @@ -351,7 +482,7 @@ out: return ret; } -/*** +/** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated * @@ -401,15 +532,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) return index; } -/*** +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + +/** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. * * This function cascades all vectors and executes all expired timer * vectors. */ -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - static inline void __run_timers(tvec_base_t *base) { struct timer_list *timer; @@ -563,12 +694,6 @@ found: /******************************************************************/ -/* - * Timekeeping variables - */ -unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ -unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */ - /* * The current time * wall_to_monotonic is what we need to add to xtime (or xtime corrected @@ -582,209 +707,6 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16))); EXPORT_SYMBOL(xtime); -/* Don't completely fail for HZ > 500. */ -int tickadj = 500/HZ ? : 1; /* microsecs */ - - -/* - * phase-lock loop variables - */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset; /* time adjustment (us) */ -long time_constant = 2; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ -long time_precision = 1; /* clock precision (us) */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; - /* frequency offset (scaled ppm)*/ -static long time_adj; /* tick adjust (scaled 1 / HZ) */ -long time_reftime; /* time at last adjustment (s) */ -long time_adjust; -long time_next_adjust; - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - * - */ -static void second_overflow(void) -{ - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. The microtime() - * routine or external clock driver will insure that reported time is - * always monotonic. The ugly divides should be replaced. - */ - switch (time_state) { - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* - * The timer interpolator will make time change - * gradually instead of an immediate jump by one second - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second " - "23:59:60 UTC\n"); - } - break; - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* - * Use of time interpolator for a gradual change of - * time - */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second " - "23:59:59 UTC\n"); - } - break; - case TIME_OOP: - time_state = TIME_WAIT; - break; - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In PLL mode, the - * offset is reduced by a fixed factor times the time constant. In FLL - * mode the offset is used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread the adjustment - * over not more than the number of seconds between updates. - */ - ltemp = time_offset; - if (!(time_status & STA_FLL)) - ltemp = shift_right(ltemp, SHIFT_KG + time_constant); - ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); - ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); - time_offset -= ltemp; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - - /* - * Compute the frequency estimate and additional phase adjustment due - * to frequency error for the next second. - */ - ltemp = time_freq; - time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); - -#if HZ == 100 - /* - * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to - * get 128.125; => only 0.125% error (p. 14) - */ - time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); -#endif -#if HZ == 250 - /* - * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 255.85938; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -#if HZ == 1000 - /* - * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and - * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); -#endif -} - -/* - * Returns how many microseconds we need to add to xtime this tick - * in doing an adjustment requested with adjtime. - */ -static long adjtime_adjustment(void) -{ - long time_adjust_step; - - time_adjust_step = time_adjust; - if (time_adjust_step) { - /* - * We are doing an adjtime thing. Prepare time_adjust_step to - * be within bounds. Note that a positive time_adjust means we - * want the clock to run faster. - * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - time_adjust_step = min(time_adjust_step, (long)tickadj); - time_adjust_step = max(time_adjust_step, (long)-tickadj); - } - return time_adjust_step; -} - -/* in the NTP reference this is called "hardclock()" */ -static void update_ntp_one_tick(void) -{ - long time_adjust_step; - - time_adjust_step = adjtime_adjustment(); - if (time_adjust_step) - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; - - /* Changes by adjtime() do not take effect till next tick. */ - if (time_next_adjust != 0) { - time_adjust = time_next_adjust; - time_next_adjust = 0; - } -} - -/* - * Return how long ticks are at the moment, that is, how much time - * update_wall_time_one_tick will add to xtime next time we call it - * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds shifted by the - * specified number of bits to the right of the binary point. - * This function has no side-effects. - */ -u64 current_tick_length(void) -{ - long delta_nsec; - u64 ret; - - /* calculate the finest interval NTP will allow. - * ie: nanosecond value shifted by (SHIFT_SCALE - 10) - */ - delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; - ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); - - return ret; -} /* XXX - all of this timekeeping code should be later moved to time.c */ #include <linux/clocksource.h> @@ -924,7 +846,7 @@ static int change_clocksource(void) clock = new; clock->cycle_last = now; printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); + clock->name); return 1; } else if (clock->update_callback) { return clock->update_callback(); @@ -932,7 +854,10 @@ static int change_clocksource(void) return 0; } #else -#define change_clocksource() (0) +static inline int change_clocksource(void) +{ + return 0; +} #endif /** @@ -961,21 +886,24 @@ void __init timekeeping_init(void) unsigned long flags; write_seqlock_irqsave(&xtime_lock, flags); + + ntp_clear(); + clock = clocksource_get_next(); clocksource_calculate_interval(clock, tick_nsec); clock->cycle_last = clocksource_read(clock); - ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); } static int timekeeping_suspended; -/* +/** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused * * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ static int timekeeping_resume(struct sys_device *dev) @@ -1027,7 +955,8 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, + s64 *offset) { s64 tick_error, i; u32 look_ahead, adj; @@ -1051,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 * * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error = current_tick_length() >> + (TICK_LENGTH_SHIFT - clock->shift + 1); tick_error -= clock->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; @@ -1103,10 +1033,11 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) clock->mult += adj; clock->xtime_interval += interval; clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); + clock->error -= (interval - offset) << + (TICK_LENGTH_SHIFT - clock->shift); } -/* +/** * update_wall_time - Uses the current clocksource to increment the wall time * * Called from the timer interrupt, must hold a write on xtime_lock. @@ -1144,8 +1075,6 @@ static void update_wall_time(void) /* interpolator bits */ time_interpolator_update(clock->xtime_interval >> clock->shift); - /* increment the NTP state machine */ - update_ntp_one_tick(); /* accumulate error between NTP and clock interval */ clock->error += current_tick_length(); @@ -1217,19 +1146,14 @@ static inline void calc_load(unsigned long ticks) unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; - count -= ticks; - if (count < 0) { - count += LOAD_FREQ; - active_tasks = count_active_tasks(); + active_tasks = count_active_tasks(); + for (count -= ticks; count < 0; count += LOAD_FREQ) { CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); } } -/* jiffies at the most recent update of wall time */ -unsigned long wall_jiffies = INITIAL_JIFFIES; - /* * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. @@ -1265,12 +1189,8 @@ void run_local_timers(void) * Called by the timer interrupt. xtime_lock must already be taken * by the timer IRQ! */ -static inline void update_times(void) +static inline void update_times(unsigned long ticks) { - unsigned long ticks; - - ticks = jiffies - wall_jiffies; - wall_jiffies += ticks; update_wall_time(); calc_load(ticks); } @@ -1281,12 +1201,10 @@ static inline void update_times(void) * jiffies is defined in the linker script... */ -void do_timer(struct pt_regs *regs) +void do_timer(unsigned long ticks) { - jiffies_64++; - /* prevent loading jiffies before storing new jiffies_64 value. */ - barrier(); - update_times(); + jiffies_64 += ticks; + update_times(ticks); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1470,8 +1388,9 @@ asmlinkage long sys_gettid(void) return current->pid; } -/* +/** * sys_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill */ asmlinkage long sys_sysinfo(struct sysinfo __user *info) { @@ -1688,8 +1607,10 @@ static struct notifier_block __cpuinitdata timers_nb = { void __init init_timers(void) { - timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + + BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); } @@ -1774,7 +1695,7 @@ unsigned long time_interpolator_get_offset(void) #define INTERPOLATOR_ADJUST 65536 #define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST -static void time_interpolator_update(long delta_nsec) +void time_interpolator_update(long delta_nsec) { u64 counter; unsigned long offset; diff --git a/kernel/tsacct.c b/kernel/tsacct.c new file mode 100644 index 000000000000..baacc3691415 --- /dev/null +++ b/kernel/tsacct.c @@ -0,0 +1,140 @@ +/* + * tsacct.c - System accounting over taskstats interface + * + * Copyright (C) Jay Lan, <jlan@sgi.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/tsacct_kern.h> +#include <linux/acct.h> +#include <linux/jiffies.h> + + +#define USEC_PER_TICK (USEC_PER_SEC/HZ) +/* + * fill in basic accounting fields + */ +void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + struct timespec uptime, ts; + s64 ac_etime; + + BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); + + /* calculate task elapsed time in timespec */ + do_posix_clock_monotonic_gettime(&uptime); + ts = timespec_sub(uptime, tsk->start_time); + /* rebase elapsed time to usec */ + ac_etime = timespec_to_ns(&ts); + do_div(ac_etime, NSEC_PER_USEC); + stats->ac_etime = ac_etime; + stats->ac_btime = xtime.tv_sec - ts.tv_sec; + if (thread_group_leader(tsk)) { + stats->ac_exitcode = tsk->exit_code; + if (tsk->flags & PF_FORKNOEXEC) + stats->ac_flag |= AFORK; + } + if (tsk->flags & PF_SUPERPRIV) + stats->ac_flag |= ASU; + if (tsk->flags & PF_DUMPCORE) + stats->ac_flag |= ACORE; + if (tsk->flags & PF_SIGNALED) + stats->ac_flag |= AXSIG; + stats->ac_nice = task_nice(tsk); + stats->ac_sched = tsk->policy; + stats->ac_uid = tsk->uid; + stats->ac_gid = tsk->gid; + stats->ac_pid = tsk->pid; + rcu_read_lock(); + stats->ac_ppid = pid_alive(tsk) ? + rcu_dereference(tsk->real_parent)->tgid : 0; + rcu_read_unlock(); + stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; + stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; + stats->ac_minflt = tsk->min_flt; + stats->ac_majflt = tsk->maj_flt; + + strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); +} + + +#ifdef CONFIG_TASK_XACCT + +#define KB 1024 +#define MB (1024*KB) +/* + * fill in extended accounting fields + */ +void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) +{ + struct mm_struct *mm; + + /* convert pages-jiffies to Mbyte-usec */ + stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; + stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; + mm = get_task_mm(p); + if (mm) { + /* adjust to KB unit */ + stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; + stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + mmput(mm); + } + stats->read_char = p->rchar; + stats->write_char = p->wchar; + stats->read_syscalls = p->syscr; + stats->write_syscalls = p->syscw; +#ifdef CONFIG_TASK_IO_ACCOUNTING + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; +#else + stats->read_bytes = 0; + stats->write_bytes = 0; + stats->cancelled_write_bytes = 0; +#endif +} +#undef KB +#undef MB + +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ + if (likely(tsk->mm)) { + long delta = cputime_to_jiffies( + cputime_sub(tsk->stime, tsk->acct_stimexpd)); + + if (delta == 0) + return; + tsk->acct_stimexpd = tsk->stime; + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + } +} + +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared + */ +void acct_clear_integrals(struct task_struct *tsk) +{ + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; +} +#endif diff --git a/kernel/unwind.c b/kernel/unwind.c index 3430475fcd88..09c261329249 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -11,13 +11,16 @@ #include <linux/unwind.h> #include <linux/module.h> -#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/sort.h> #include <linux/stop_machine.h> +#include <linux/uaccess.h> #include <asm/sections.h> #include <asm/uaccess.h> #include <asm/unaligned.h> -extern char __start_unwind[], __end_unwind[]; +extern const char __start_unwind[], __end_unwind[]; +extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; #define MAX_STACK_DEPTH 8 @@ -92,6 +95,7 @@ static const struct { typedef unsigned long uleb128_t; typedef signed long sleb128_t; +#define sleb128abs __builtin_labs static struct unwind_table { struct { @@ -100,9 +104,11 @@ static struct unwind_table { } core, init; const void *address; unsigned long size; + const unsigned char *header; + unsigned long hdrsz; struct unwind_table *link; const char *name; -} root_table, *last_table; +} root_table; struct unwind_item { enum item_location { @@ -131,6 +137,17 @@ struct unwind_state { static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; +static unsigned unwind_debug; +static int __init unwind_debug_setup(char *s) +{ + unwind_debug = simple_strtoul(s, NULL, 0); + return 1; +} +__setup("unwind_debug=", unwind_debug_setup); +#define dprintk(lvl, fmt, args...) \ + ((void)(lvl > unwind_debug \ + || printk(KERN_DEBUG "unwind: " fmt "\n", ##args))) + static struct unwind_table *find_table(unsigned long pc) { struct unwind_table *table; @@ -145,6 +162,12 @@ static struct unwind_table *find_table(unsigned long pc) return table; } +static unsigned long read_pointer(const u8 **pLoc, + const void *end, + signed ptrType, + unsigned long text_base, + unsigned long data_base); + static void init_unwind_table(struct unwind_table *table, const char *name, const void *core_start, @@ -152,14 +175,33 @@ static void init_unwind_table(struct unwind_table *table, const void *init_start, unsigned long init_size, const void *table_start, - unsigned long table_size) + unsigned long table_size, + const u8 *header_start, + unsigned long header_size) { + const u8 *ptr = header_start + 4; + const u8 *end = header_start + header_size; + table->core.pc = (unsigned long)core_start; table->core.range = core_size; table->init.pc = (unsigned long)init_start; table->init.range = init_size; table->address = table_start; table->size = table_size; + /* See if the linker provided table looks valid. */ + if (header_size <= 4 + || header_start[0] != 1 + || (void *)read_pointer(&ptr, end, header_start[1], 0, 0) + != table_start + || !read_pointer(&ptr, end, header_start[2], 0, 0) + || !read_pointer(&ptr, end, header_start[3], 0, + (unsigned long)header_start) + || !read_pointer(&ptr, end, header_start[3], 0, + (unsigned long)header_start)) + header_start = NULL; + table->hdrsz = header_size; + smp_wmb(); + table->header = header_start; table->link = NULL; table->name = name; } @@ -169,11 +211,150 @@ void __init unwind_init(void) init_unwind_table(&root_table, "kernel", _text, _end - _text, NULL, 0, - __start_unwind, __end_unwind - __start_unwind); + __start_unwind, __end_unwind - __start_unwind, + __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr); +} + +static const u32 bad_cie, not_fde; +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *); +static signed fde_pointer_type(const u32 *cie); + +struct eh_frame_hdr_table_entry { + unsigned long start, fde; +}; + +static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) +{ + const struct eh_frame_hdr_table_entry *e1 = p1; + const struct eh_frame_hdr_table_entry *e2 = p2; + + return (e1->start > e2->start) - (e1->start < e2->start); +} + +static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) +{ + struct eh_frame_hdr_table_entry *e1 = p1; + struct eh_frame_hdr_table_entry *e2 = p2; + unsigned long v; + + v = e1->start; + e1->start = e2->start; + e2->start = v; + v = e1->fde; + e1->fde = e2->fde; + e2->fde = v; +} + +static void __init setup_unwind_table(struct unwind_table *table, + void *(*alloc)(unsigned long)) +{ + const u8 *ptr; + unsigned long tableSize = table->size, hdrSize; + unsigned n; + const u32 *fde; + struct { + u8 version; + u8 eh_frame_ptr_enc; + u8 fde_count_enc; + u8 table_enc; + unsigned long eh_frame_ptr; + unsigned int fde_count; + struct eh_frame_hdr_table_entry table[]; + } __attribute__((__packed__)) *header; + + if (table->header) + return; + + if (table->hdrsz) + printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n", + table->name); + + if (tableSize & (sizeof(*fde) - 1)) + return; + + for (fde = table->address, n = 0; + tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = cie_for_fde(fde, table); + signed ptrType; + + if (cie == ¬_fde) + continue; + if (cie == NULL + || cie == &bad_cie + || (ptrType = fde_pointer_type(cie)) < 0) + return; + ptr = (const u8 *)(fde + 2); + if (!read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType, 0, 0)) + return; + ++n; + } + + if (tableSize || !n) + return; + + hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) + + 2 * n * sizeof(unsigned long); + dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize); + header = alloc(hdrSize); + if (!header) + return; + header->version = 1; + header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native; + header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4; + header->table_enc = DW_EH_PE_abs|DW_EH_PE_native; + put_unaligned((unsigned long)table->address, &header->eh_frame_ptr); + BUILD_BUG_ON(offsetof(typeof(*header), fde_count) + % __alignof(typeof(header->fde_count))); + header->fde_count = n; + + BUILD_BUG_ON(offsetof(typeof(*header), table) + % __alignof(typeof(*header->table))); + for (fde = table->address, tableSize = table->size, n = 0; + tableSize; + tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { + const u32 *cie = fde + 1 - fde[1] / sizeof(*fde); + + if (!fde[1]) + continue; /* this is a CIE */ + ptr = (const u8 *)(fde + 2); + header->table[n].start = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + fde_pointer_type(cie), 0, 0); + header->table[n].fde = (unsigned long)fde; + ++n; + } + WARN_ON(n != header->fde_count); + + sort(header->table, + n, + sizeof(*header->table), + cmp_eh_frame_hdr_table_entries, + swap_eh_frame_hdr_table_entries); + + table->hdrsz = hdrSize; + smp_wmb(); + table->header = (const void *)header; +} + +static void *__init balloc(unsigned long sz) +{ + return __alloc_bootmem_nopanic(sz, + sizeof(unsigned int), + __pa(MAX_DMA_ADDRESS)); +} + +void __init unwind_setup(void) +{ + setup_unwind_table(&root_table, balloc); } #ifdef CONFIG_MODULES +static struct unwind_table *last_table; + /* Must be called with module_mutex held. */ void *unwind_add_table(struct module *module, const void *table_start, @@ -191,7 +372,8 @@ void *unwind_add_table(struct module *module, init_unwind_table(table, module->name, module->module_core, module->core_size, module->module_init, module->init_size, - table_start, table_size); + table_start, table_size, + NULL, 0); if (last_table) last_table->link = table; @@ -301,9 +483,31 @@ static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) return value; } +static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) +{ + const u32 *cie; + + if (!*fde || (*fde & (sizeof(*fde) - 1))) + return &bad_cie; + if (!fde[1]) + return ¬_fde; /* this is a CIE */ + if ((fde[1] & (sizeof(*fde) - 1)) + || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address) + return NULL; /* this is not a valid FDE */ + cie = fde + 1 - fde[1] / sizeof(*fde); + if (*cie <= sizeof(*cie) + 4 + || *cie >= fde[1] - sizeof(*fde) + || (*cie & (sizeof(*cie) - 1)) + || cie[1]) + return NULL; /* this is not a (valid) CIE */ + return cie; +} + static unsigned long read_pointer(const u8 **pLoc, const void *end, - signed ptrType) + signed ptrType, + unsigned long text_base, + unsigned long data_base) { unsigned long value = 0; union { @@ -315,13 +519,17 @@ static unsigned long read_pointer(const u8 **pLoc, const unsigned long *pul; } ptr; - if (ptrType < 0 || ptrType == DW_EH_PE_omit) + if (ptrType < 0 || ptrType == DW_EH_PE_omit) { + dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end); return 0; + } ptr.p8 = *pLoc; switch(ptrType & DW_EH_PE_FORM) { case DW_EH_PE_data2: - if (end < (const void *)(ptr.p16u + 1)) + if (end < (const void *)(ptr.p16u + 1)) { + dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end); return 0; + } if(ptrType & DW_EH_PE_signed) value = get_unaligned(ptr.p16s++); else @@ -329,8 +537,10 @@ static unsigned long read_pointer(const u8 **pLoc, break; case DW_EH_PE_data4: #ifdef CONFIG_64BIT - if (end < (const void *)(ptr.p32u + 1)) + if (end < (const void *)(ptr.p32u + 1)) { + dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end); return 0; + } if(ptrType & DW_EH_PE_signed) value = get_unaligned(ptr.p32s++); else @@ -342,8 +552,10 @@ static unsigned long read_pointer(const u8 **pLoc, BUILD_BUG_ON(sizeof(u32) != sizeof(value)); #endif case DW_EH_PE_native: - if (end < (const void *)(ptr.pul + 1)) + if (end < (const void *)(ptr.pul + 1)) { + dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end); return 0; + } value = get_unaligned(ptr.pul++); break; case DW_EH_PE_leb128: @@ -351,10 +563,14 @@ static unsigned long read_pointer(const u8 **pLoc, value = ptrType & DW_EH_PE_signed ? get_sleb128(&ptr.p8, end) : get_uleb128(&ptr.p8, end); - if ((const void *)ptr.p8 > end) + if ((const void *)ptr.p8 > end) { + dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end); return 0; + } break; default: + dprintk(2, "Cannot decode pointer type %02X (%p,%p).", + ptrType, ptr.p8, end); return 0; } switch(ptrType & DW_EH_PE_ADJUST) { @@ -363,12 +579,33 @@ static unsigned long read_pointer(const u8 **pLoc, case DW_EH_PE_pcrel: value += (unsigned long)*pLoc; break; + case DW_EH_PE_textrel: + if (likely(text_base)) { + value += text_base; + break; + } + dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.", + ptrType, *pLoc, end); + return 0; + case DW_EH_PE_datarel: + if (likely(data_base)) { + value += data_base; + break; + } + dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.", + ptrType, *pLoc, end); + return 0; default: + dprintk(2, "Cannot adjust pointer type %02X (%p,%p).", + ptrType, *pLoc, end); return 0; } if ((ptrType & DW_EH_PE_indirect) - && __get_user(value, (unsigned long *)value)) + && probe_kernel_address((unsigned long *)value, value)) { + dprintk(1, "Cannot read indirect value %lx (%p,%p).", + value, *pLoc, end); return 0; + } *pLoc = ptr.p8; return value; @@ -411,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie) case 'P': { signed ptrType = *ptr++; - if (!read_pointer(&ptr, end, ptrType) || ptr > end) + if (!read_pointer(&ptr, end, ptrType, 0, 0) + || ptr > end) return -1; } break; @@ -471,7 +709,8 @@ static int processCFI(const u8 *start, case DW_CFA_nop: break; case DW_CFA_set_loc: - if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) + state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0); + if (state->loc == 0) result = 0; break; case DW_CFA_advance_loc1: @@ -517,8 +756,10 @@ static int processCFI(const u8 *start, state->label = NULL; return 1; } - if (state->stackDepth >= MAX_STACK_DEPTH) + if (state->stackDepth >= MAX_STACK_DEPTH) { + dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end); return 0; + } state->stack[state->stackDepth++] = ptr.p8; break; case DW_CFA_restore_state: @@ -533,8 +774,10 @@ static int processCFI(const u8 *start, result = processCFI(start, end, 0, ptrType, state); state->loc = loc; state->label = label; - } else + } else { + dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end); return 0; + } break; case DW_CFA_def_cfa: state->cfa.reg = get_uleb128(&ptr.p8, end); @@ -566,6 +809,7 @@ static int processCFI(const u8 *start, break; case DW_CFA_GNU_window_save: default: + dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end); result = 0; break; } @@ -581,12 +825,17 @@ static int processCFI(const u8 *start, set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); break; } - if (ptr.p8 > end) + if (ptr.p8 > end) { + dprintk(1, "Data overrun (%p,%p).", ptr.p8, end); result = 0; + } if (result && targetLoc != 0 && targetLoc < state->loc) return 1; } + if (result && ptr.p8 < end) + dprintk(1, "Data underrun (%p,%p).", ptr.p8, end); + return result && ptr.p8 == end && (targetLoc == 0 @@ -603,54 +852,122 @@ int unwind(struct unwind_frame_info *frame) #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) const u32 *fde = NULL, *cie = NULL; const u8 *ptr = NULL, *end = NULL; - unsigned long pc = UNW_PC(frame) - frame->call_frame; + unsigned long pc = UNW_PC(frame) - frame->call_frame, sp; unsigned long startLoc = 0, endLoc = 0, cfa; unsigned i; signed ptrType = -1; uleb128_t retAddrReg = 0; - struct unwind_table *table; + const struct unwind_table *table; struct unwind_state state; if (UNW_PC(frame) == 0) return -EINVAL; if ((table = find_table(pc)) != NULL && !(table->size & (sizeof(*fde) - 1))) { - unsigned long tableSize = table->size; - - for (fde = table->address; - tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; - tableSize -= sizeof(*fde) + *fde, - fde += 1 + *fde / sizeof(*fde)) { - if (!*fde || (*fde & (sizeof(*fde) - 1))) - break; - if (!fde[1]) - continue; /* this is a CIE */ - if ((fde[1] & (sizeof(*fde) - 1)) - || fde[1] > (unsigned long)(fde + 1) - - (unsigned long)table->address) - continue; /* this is not a valid FDE */ - cie = fde + 1 - fde[1] / sizeof(*fde); - if (*cie <= sizeof(*cie) + 4 - || *cie >= fde[1] - sizeof(*fde) - || (*cie & (sizeof(*cie) - 1)) - || cie[1] - || (ptrType = fde_pointer_type(cie)) < 0) { - cie = NULL; /* this is not a (valid) CIE */ - continue; + const u8 *hdr = table->header; + unsigned long tableSize; + + smp_rmb(); + if (hdr && hdr[0] == 1) { + switch(hdr[3] & DW_EH_PE_FORM) { + case DW_EH_PE_native: tableSize = sizeof(unsigned long); break; + case DW_EH_PE_data2: tableSize = 2; break; + case DW_EH_PE_data4: tableSize = 4; break; + case DW_EH_PE_data8: tableSize = 8; break; + default: tableSize = 0; break; } + ptr = hdr + 4; + end = hdr + table->hdrsz; + if (tableSize + && read_pointer(&ptr, end, hdr[1], 0, 0) + == (unsigned long)table->address + && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0 + && i == (end - ptr) / (2 * tableSize) + && !((end - ptr) % (2 * tableSize))) { + do { + const u8 *cur = ptr + (i / 2) * (2 * tableSize); + + startLoc = read_pointer(&cur, + cur + tableSize, + hdr[3], 0, + (unsigned long)hdr); + if (pc < startLoc) + i /= 2; + else { + ptr = cur - tableSize; + i = (i + 1) / 2; + } + } while (startLoc && i > 1); + if (i == 1 + && (startLoc = read_pointer(&ptr, + ptr + tableSize, + hdr[3], 0, + (unsigned long)hdr)) != 0 + && pc >= startLoc) + fde = (void *)read_pointer(&ptr, + ptr + tableSize, + hdr[3], 0, + (unsigned long)hdr); + } + } + if(hdr && !fde) + dprintk(3, "Binary lookup for %lx failed.", pc); + + if (fde != NULL) { + cie = cie_for_fde(fde, table); ptr = (const u8 *)(fde + 2); - startLoc = read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType); - endLoc = startLoc - + read_pointer(&ptr, - (const u8 *)(fde + 1) + *fde, - ptrType & DW_EH_PE_indirect - ? ptrType - : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); - if (pc >= startLoc && pc < endLoc) - break; - cie = NULL; + if(cie != NULL + && cie != &bad_cie + && cie != ¬_fde + && (ptrType = fde_pointer_type(cie)) >= 0 + && read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType, 0, 0) == startLoc) { + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType, 0, 0); + if(pc >= endLoc) + fde = NULL; + } else + fde = NULL; + if(!fde) + dprintk(1, "Binary lookup result for %lx discarded.", pc); + } + if (fde == NULL) { + for (fde = table->address, tableSize = table->size; + cie = NULL, tableSize > sizeof(*fde) + && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, + fde += 1 + *fde / sizeof(*fde)) { + cie = cie_for_fde(fde, table); + if (cie == &bad_cie) { + cie = NULL; + break; + } + if (cie == NULL + || cie == ¬_fde + || (ptrType = fde_pointer_type(cie)) < 0) + continue; + ptr = (const u8 *)(fde + 2); + startLoc = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType, 0, 0); + if (!startLoc) + continue; + if (!(ptrType & DW_EH_PE_indirect)) + ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType, 0, 0); + if (pc >= startLoc && pc < endLoc) + break; + } + if(!fde) + dprintk(3, "Linear lookup for %lx failed.", pc); } } if (cie != NULL) { @@ -684,6 +1001,8 @@ int unwind(struct unwind_frame_info *frame) if (ptr >= end || *ptr) cie = NULL; } + if(!cie) + dprintk(1, "CIE unusable (%p,%p).", ptr, end); ++ptr; } if (cie != NULL) { @@ -693,17 +1012,27 @@ int unwind(struct unwind_frame_info *frame) state.dataAlign = get_sleb128(&ptr, end); if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) cie = NULL; - else { + else if (UNW_PC(frame) % state.codeAlign + || UNW_SP(frame) % sleb128abs(state.dataAlign)) { + dprintk(1, "Input pointer(s) misaligned (%lx,%lx).", + UNW_PC(frame), UNW_SP(frame)); + return -EPERM; + } else { retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); /* skip augmentation */ - if (((const char *)(cie + 2))[1] == 'z') - ptr += get_uleb128(&ptr, end); + if (((const char *)(cie + 2))[1] == 'z') { + uleb128_t augSize = get_uleb128(&ptr, end); + + ptr += augSize; + } if (ptr > end || retAddrReg >= ARRAY_SIZE(reg_info) || REG_INVALID(retAddrReg) || reg_info[retAddrReg].width != sizeof(unsigned long)) cie = NULL; } + if(!cie) + dprintk(1, "CIE validation failed (%p,%p).", ptr, end); } if (cie != NULL) { state.cieStart = ptr; @@ -717,13 +1046,15 @@ int unwind(struct unwind_frame_info *frame) if ((ptr += augSize) > end) fde = NULL; } + if(!fde) + dprintk(1, "FDE validation failed (%p,%p).", ptr, end); } if (cie == NULL || fde == NULL) { #ifdef CONFIG_FRAME_POINTER unsigned long top, bottom; -#endif -#ifdef CONFIG_FRAME_POINTER + if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long)) + return -EPERM; top = STACK_TOP(frame->task); bottom = STACK_BOTTOM(frame->task); # if FRAME_RETADDR_OFFSET < 0 @@ -739,18 +1070,19 @@ int unwind(struct unwind_frame_info *frame) & (sizeof(unsigned long) - 1))) { unsigned long link; - if (!__get_user(link, + if (!probe_kernel_address( (unsigned long *)(UNW_FP(frame) - + FRAME_LINK_OFFSET)) + + FRAME_LINK_OFFSET), + link) # if FRAME_RETADDR_OFFSET < 0 && link > bottom && link < UNW_FP(frame) # else && link > UNW_FP(frame) && link < bottom # endif && !(link & (sizeof(link) - 1)) - && !__get_user(UNW_PC(frame), + && !probe_kernel_address( (unsigned long *)(UNW_FP(frame) - + FRAME_RETADDR_OFFSET))) { + + FRAME_RETADDR_OFFSET), UNW_PC(frame))) { UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET # if FRAME_RETADDR_OFFSET < 0 - @@ -773,8 +1105,11 @@ int unwind(struct unwind_frame_info *frame) || state.regs[retAddrReg].where == Nowhere || state.cfa.reg >= ARRAY_SIZE(reg_info) || reg_info[state.cfa.reg].width != sizeof(unsigned long) - || state.cfa.offs % sizeof(unsigned long)) + || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long) + || state.cfa.offs % sizeof(unsigned long)) { + dprintk(1, "Unusable unwind info (%p,%p).", ptr, end); return -EIO; + } /* update frame */ #ifndef CONFIG_AS_CFI_SIGNAL_FRAME if(frame->call_frame @@ -793,10 +1128,14 @@ int unwind(struct unwind_frame_info *frame) #else # define CASES CASE(8); CASE(16); CASE(32); CASE(64) #endif + pc = UNW_PC(frame); + sp = UNW_SP(frame); for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { if (REG_INVALID(i)) { if (state.regs[i].where == Nowhere) continue; + dprintk(1, "Cannot restore register %u (%d).", + i, state.regs[i].where); return -EIO; } switch(state.regs[i].where) { @@ -805,8 +1144,11 @@ int unwind(struct unwind_frame_info *frame) case Register: if (state.regs[i].value >= ARRAY_SIZE(reg_info) || REG_INVALID(state.regs[i].value) - || reg_info[i].width > reg_info[state.regs[i].value].width) + || reg_info[i].width > reg_info[state.regs[i].value].width) { + dprintk(1, "Cannot restore register %u from register %lu.", + i, state.regs[i].value); return -EIO; + } switch(reg_info[state.regs[i].value].width) { #define CASE(n) \ case sizeof(u##n): \ @@ -816,6 +1158,9 @@ int unwind(struct unwind_frame_info *frame) CASES; #undef CASE default: + dprintk(1, "Unsupported register size %u (%lu).", + reg_info[state.regs[i].value].width, + state.regs[i].value); return -EIO; } break; @@ -840,12 +1185,17 @@ int unwind(struct unwind_frame_info *frame) CASES; #undef CASE default: + dprintk(1, "Unsupported register size %u (%u).", + reg_info[i].width, i); return -EIO; } break; case Value: - if (reg_info[i].width != sizeof(unsigned long)) + if (reg_info[i].width != sizeof(unsigned long)) { + dprintk(1, "Unsupported value size %u (%u).", + reg_info[i].width, i); return -EIO; + } FRAME_REG(i, unsigned long) = cfa + state.regs[i].value * state.dataAlign; break; @@ -857,15 +1207,20 @@ int unwind(struct unwind_frame_info *frame) % sizeof(unsigned long) || addr < startLoc || addr + sizeof(unsigned long) < addr - || addr + sizeof(unsigned long) > endLoc) + || addr + sizeof(unsigned long) > endLoc) { + dprintk(1, "Bad memory location %lx (%lx).", + addr, state.regs[i].value); return -EIO; + } switch(reg_info[i].width) { #define CASE(n) case sizeof(u##n): \ - __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ + probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \ break CASES; #undef CASE default: + dprintk(1, "Unsupported memory size %u (%u).", + reg_info[i].width, i); return -EIO; } } @@ -873,6 +1228,17 @@ int unwind(struct unwind_frame_info *frame) } } + if (UNW_PC(frame) % state.codeAlign + || UNW_SP(frame) % sleb128abs(state.dataAlign)) { + dprintk(1, "Output pointer(s) misaligned (%lx,%lx).", + UNW_PC(frame), UNW_SP(frame)); + return -EIO; + } + if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) { + dprintk(1, "No progress (%lx,%lx).", pc, sp); + return -EIO; + } + return 0; #undef CASES #undef FRAME_REG diff --git a/kernel/user.c b/kernel/user.c index 6408c0424291..4869563080e9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -26,7 +26,7 @@ #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) -static kmem_cache_t *uid_cachep; +static struct kmem_cache *uid_cachep; static struct list_head uidhash_table[UIDHASH_SZ]; /* @@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid) if (!up) { struct user_struct *new; - new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); + new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); if (!new) return NULL; new->uid = uid; @@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user) atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + + /* + * We need to synchronize with __sigqueue_alloc() + * doing a get_uid(p->user).. If that saw the old + * user value, we need to wait until it has exited + * its critical region before we can free the old + * structure. + */ + smp_mb(); + spin_unlock_wait(¤t->sighand->siglock); + free_uid(old_user); suid_keys(current); } diff --git a/kernel/utsname.c b/kernel/utsname.c new file mode 100644 index 000000000000..c859164a6993 --- /dev/null +++ b/kernel/utsname.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Author: Serge Hallyn <serue@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/module.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <linux/version.h> + +/* + * Clone a new ns copying an original utsname, setting refcount to 1 + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +{ + struct uts_namespace *ns; + + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + if (ns) { + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + kref_init(&ns->kref); + } + return ns; +} + +/* + * unshare the current process' utsname namespace. + * called only in sys_unshare() + */ +int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) +{ + if (unshare_flags & CLONE_NEWUTS) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_uts = clone_uts_ns(current->nsproxy->uts_ns); + if (!*new_uts) + return -ENOMEM; + } + + return 0; +} + +/* + * Copy task tsk's utsname namespace, or clone it if flags + * specifies CLONE_NEWUTS. In latter case, changes to the + * utsname of this process won't be seen by parent, and vice + * versa. + */ +int copy_utsname(int flags, struct task_struct *tsk) +{ + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + struct uts_namespace *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_uts_ns(old_ns); + + if (!(flags & CLONE_NEWUTS)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = clone_uts_ns(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + tsk->nsproxy->uts_ns = new_ns; + +out: + put_uts_ns(old_ns); + return err; +} + +void free_uts_ns(struct kref *kref) +{ + struct uts_namespace *ns; + + ns = container_of(kref, struct uts_namespace, kref); + kfree(ns); +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 835fe28b87a8..db49886bfae1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -28,13 +28,17 @@ #include <linux/notifier.h> #include <linux/kthread.h> #include <linux/hardirq.h> +#include <linux/mempolicy.h> +#include <linux/freezer.h> +#include <linux/kallsyms.h> +#include <linux/debug_locks.h> /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu). * * The sequence counters are for flush_scheduled_work(). It wants to wait - * until until all currently-scheduled works are completed, but it doesn't + * until all currently-scheduled works are completed, but it doesn't * want to be livelocked by new, incoming ones. So it waits until * remove_sequence is >= the insert_sequence which pertained when * flush_scheduled_work() was called. @@ -54,6 +58,8 @@ struct cpu_workqueue_struct { struct task_struct *thread; int run_depth; /* Detect run_workqueue() recursion depth */ + + int freezeable; /* Freeze the thread during suspend */ } ____cacheline_aligned; /* @@ -79,6 +85,99 @@ static inline int is_single_threaded(struct workqueue_struct *wq) return list_empty(&wq->list); } +/* + * Set the workqueue on which a work item is to be run + * - Must *only* be called if the pending flag is set + */ +static inline void set_wq_data(struct work_struct *work, void *wq) +{ + unsigned long new; + + BUG_ON(!work_pending(work)); + + new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); + new |= work->management & WORK_STRUCT_FLAG_MASK; + work->management = new; +} + +static inline void *get_wq_data(struct work_struct *work) +{ + return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK); +} + +static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&cwq->lock, flags); + /* + * We need to re-validate the work info after we've gotten + * the cpu_workqueue lock. We can run the work now iff: + * + * - the wq_data still matches the cpu_workqueue_struct + * - AND the work is still marked pending + * - AND the work is still on a list (which will be this + * workqueue_struct list) + * + * All these conditions are important, because we + * need to protect against the work being run right + * now on another CPU (all but the last one might be + * true if it's currently running and has not been + * released yet, for example). + */ + if (get_wq_data(work) == cwq + && work_pending(work) + && !list_empty(&work->entry)) { + work_func_t f = work->func; + list_del_init(&work->entry); + spin_unlock_irqrestore(&cwq->lock, flags); + + if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management)) + work_release(work); + f(work); + + spin_lock_irqsave(&cwq->lock, flags); + cwq->remove_sequence++; + wake_up(&cwq->work_done); + ret = 1; + } + spin_unlock_irqrestore(&cwq->lock, flags); + return ret; +} + +/** + * run_scheduled_work - run scheduled work synchronously + * @work: work to run + * + * This checks if the work was pending, and runs it + * synchronously if so. It returns a boolean to indicate + * whether it had any scheduled work to run or not. + * + * NOTE! This _only_ works for normal work_structs. You + * CANNOT use this for delayed work, because the wq data + * for delayed work will not point properly to the per- + * CPU workqueue struct, but will change! + */ +int fastcall run_scheduled_work(struct work_struct *work) +{ + for (;;) { + struct cpu_workqueue_struct *cwq; + + if (!work_pending(work)) + return 0; + if (list_empty(&work->entry)) + return 0; + /* NOTE! This depends intimately on __queue_work! */ + cwq = get_wq_data(work); + if (!cwq) + return 0; + if (__run_work(cwq, work)) + return 1; + } +} +EXPORT_SYMBOL(run_scheduled_work); + /* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) @@ -86,7 +185,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, unsigned long flags; spin_lock_irqsave(&cwq->lock, flags); - work->wq_data = cwq; + set_wq_data(work, cwq); list_add_tail(&work->entry, &cwq->worklist); cwq->insert_sequence++; wake_up(&cwq->more_work); @@ -98,7 +197,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * @wq: workqueue to use * @work: work to queue * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -107,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { int ret = 0, cpu = get_cpu(); - if (!test_and_set_bit(0, &work->pending)) { + if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { if (unlikely(is_single_threaded(wq))) cpu = singlethread_cpu; BUG_ON(!list_empty(&work->entry)); @@ -121,38 +220,42 @@ EXPORT_SYMBOL_GPL(queue_work); static void delayed_work_timer_fn(unsigned long __data) { - struct work_struct *work = (struct work_struct *)__data; - struct workqueue_struct *wq = work->wq_data; + struct delayed_work *dwork = (struct delayed_work *)__data; + struct workqueue_struct *wq = get_wq_data(&dwork->work); int cpu = smp_processor_id(); if (unlikely(is_single_threaded(wq))) cpu = singlethread_cpu; - __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use - * @work: work to queue + * @work: delayable work to queue * @delay: number of jiffies to wait before queueing * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. */ int fastcall queue_delayed_work(struct workqueue_struct *wq, - struct work_struct *work, unsigned long delay) + struct delayed_work *dwork, unsigned long delay) { int ret = 0; - struct timer_list *timer = &work->timer; + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; - if (!test_and_set_bit(0, &work->pending)) { + if (delay == 0) + return queue_work(wq, work); + + if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); /* This stores wq for the moment, for the timer_fn */ - work->wq_data = wq; + set_wq_data(work, wq); timer->expires = jiffies + delay; - timer->data = (unsigned long)work; + timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; add_timer(timer); ret = 1; @@ -168,22 +271,23 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); * @work: work to queue * @delay: number of jiffies to wait before queueing * - * Returns non-zero if it was successfully added. + * Returns 0 if @work was already on a queue, non-zero otherwise. */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct work_struct *work, unsigned long delay) + struct delayed_work *dwork, unsigned long delay) { int ret = 0; - struct timer_list *timer = &work->timer; + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; - if (!test_and_set_bit(0, &work->pending)) { + if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); /* This stores wq for the moment, for the timer_fn */ - work->wq_data = wq; + set_wq_data(work, wq); timer->expires = jiffies + delay; - timer->data = (unsigned long)work; + timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; add_timer_on(timer, cpu); ret = 1; @@ -211,15 +315,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) while (!list_empty(&cwq->worklist)) { struct work_struct *work = list_entry(cwq->worklist.next, struct work_struct, entry); - void (*f) (void *) = work->func; - void *data = work->data; + work_func_t f = work->func; list_del_init(cwq->worklist.next); spin_unlock_irqrestore(&cwq->lock, flags); - BUG_ON(work->wq_data != cwq); - clear_bit(0, &work->pending); - f(data); + BUG_ON(get_wq_data(work) != cwq); + if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management)) + work_release(work); + f(work); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), + current->pid); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); + } spin_lock_irqsave(&cwq->lock, flags); cwq->remove_sequence++; @@ -236,7 +351,8 @@ static int worker_thread(void *__cwq) struct k_sigaction sa; sigset_t blocked; - current->flags |= PF_NOFREEZE; + if (!cwq->freezeable) + current->flags |= PF_NOFREEZE; set_user_nice(current, -5); @@ -245,6 +361,12 @@ static int worker_thread(void *__cwq) sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); + /* + * We inherited MPOL_INTERLEAVE from the booting kernel. + * Set MPOL_DEFAULT to insure node local allocations. + */ + numa_default_policy(); + /* SIG_IGN makes children autoreap: see do_notify_parent(). */ sa.sa.sa_handler = SIG_IGN; sa.sa.sa_flags = 0; @@ -253,6 +375,9 @@ static int worker_thread(void *__cwq) set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { + if (cwq->freezeable) + try_to_freeze(); + add_wait_queue(&cwq->more_work, &wait); if (list_empty(&cwq->worklist)) schedule(); @@ -329,7 +454,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) EXPORT_SYMBOL_GPL(flush_workqueue); static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, - int cpu) + int cpu, int freezeable) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; @@ -339,6 +464,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, cwq->thread = NULL; cwq->insert_sequence = 0; cwq->remove_sequence = 0; + cwq->freezeable = freezeable; INIT_LIST_HEAD(&cwq->worklist); init_waitqueue_head(&cwq->more_work); init_waitqueue_head(&cwq->work_done); @@ -354,7 +480,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, } struct workqueue_struct *__create_workqueue(const char *name, - int singlethread) + int singlethread, int freezeable) { int cpu, destroy = 0; struct workqueue_struct *wq; @@ -374,7 +500,7 @@ struct workqueue_struct *__create_workqueue(const char *name, mutex_lock(&workqueue_mutex); if (singlethread) { INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, singlethread_cpu); + p = create_workqueue_thread(wq, singlethread_cpu, freezeable); if (!p) destroy = 1; else @@ -382,7 +508,7 @@ struct workqueue_struct *__create_workqueue(const char *name, } else { list_add(&wq->list, &workqueues); for_each_online_cpu(cpu) { - p = create_workqueue_thread(wq, cpu); + p = create_workqueue_thread(wq, cpu, freezeable); if (p) { kthread_bind(p, cpu); wake_up_process(p); @@ -461,38 +587,37 @@ EXPORT_SYMBOL(schedule_work); /** * schedule_delayed_work - put work task in global workqueue after delay - * @work: job to be done - * @delay: number of jiffies to wait + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) +int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(keventd_wq, work, delay); + return queue_delayed_work(keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use - * @work: job to be done + * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ int schedule_delayed_work_on(int cpu, - struct work_struct *work, unsigned long delay) + struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, keventd_wq, work, delay); + return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); /** * schedule_on_each_cpu - call a function on each online CPU from keventd * @func: the function to call - * @info: a pointer to pass to func() * * Returns zero on success. * Returns -ve errno on failure. @@ -501,7 +626,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); * * schedule_on_each_cpu() is very slow. */ -int schedule_on_each_cpu(void (*func)(void *info), void *info) +int schedule_on_each_cpu(work_func_t func) { int cpu; struct work_struct *works; @@ -512,7 +637,7 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info) mutex_lock(&workqueue_mutex); for_each_online_cpu(cpu) { - INIT_WORK(per_cpu_ptr(works, cpu), func, info); + INIT_WORK(per_cpu_ptr(works, cpu), func); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), per_cpu_ptr(works, cpu)); } @@ -532,12 +657,12 @@ EXPORT_SYMBOL(flush_scheduled_work); * cancel_rearming_delayed_workqueue - reliably kill off a delayed * work whose handler rearms the delayed work. * @wq: the controlling workqueue structure - * @work: the delayed work struct + * @dwork: the delayed work struct */ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, - struct work_struct *work) + struct delayed_work *dwork) { - while (!cancel_delayed_work(work)) + while (!cancel_delayed_work(dwork)) flush_workqueue(wq); } EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); @@ -545,18 +670,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); /** * cancel_rearming_delayed_work - reliably kill off a delayed keventd * work whose handler rearms the delayed work. - * @work: the delayed work struct + * @dwork: the delayed work struct */ -void cancel_rearming_delayed_work(struct work_struct *work) +void cancel_rearming_delayed_work(struct delayed_work *dwork) { - cancel_rearming_delayed_workqueue(keventd_wq, work); + cancel_rearming_delayed_workqueue(keventd_wq, dwork); } EXPORT_SYMBOL(cancel_rearming_delayed_work); /** * execute_in_process_context - reliably execute the routine with user context * @fn: the function to execute - * @data: data to pass to the function * @ew: guaranteed storage for the execute work structure (must * be available when the work executes) * @@ -566,15 +690,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work); * Returns: 0 - function was executed * 1 - function was scheduled for execution */ -int execute_in_process_context(void (*fn)(void *data), void *data, - struct execute_work *ew) +int execute_in_process_context(work_func_t fn, struct execute_work *ew) { if (!in_interrupt()) { - fn(data); + fn(&ew->work); return 0; } - INIT_WORK(&ew->work, fn, data); + INIT_WORK(&ew->work, fn); schedule_work(&ew->work); return 1; @@ -602,7 +725,6 @@ int current_is_keventd(void) } -#ifdef CONFIG_HOTPLUG_CPU /* Take the work from this (downed) CPU. */ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { @@ -635,7 +757,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, mutex_lock(&workqueue_mutex); /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { - if (!create_workqueue_thread(wq, hotcpu)) { + if (!create_workqueue_thread(wq, hotcpu, 0)) { printk("workqueue for %i failed\n", hotcpu); return NOTIFY_BAD; } @@ -685,7 +807,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#endif void init_workqueues(void) { |
