diff options
| author | Thomas Gleixner <tglx@mtd.linutronix.de> | 2005-11-06 15:36:37 +0100 |
|---|---|---|
| committer | Thomas Gleixner <tglx@mtd.linutronix.de> | 2005-11-06 15:36:37 +0100 |
| commit | 2fc2991175bf77395e6b15fe6b2304d3bf72da40 (patch) | |
| tree | b0ff38c09240e7c00e1577d447ebe89143d752dc /kernel | |
| parent | 8b491d750885ebe8e7d385ce4186c85957d67123 (diff) | |
| parent | 7015faa7df829876a0f931cd18aa6d7c24a1b581 (diff) | |
Merge branch 'master' of /home/tglx/work/mtd/git/linux-2.6.git/
Diffstat (limited to 'kernel')
48 files changed, 3709 insertions, 1791 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index cb05cd05d237..4f5a1453093a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -21,15 +22,16 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o +obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index 4168f631868e..2e3f4a47e7d0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -165,7 +165,7 @@ out: } /* - * Close the old accouting file (if currently open) and then replace + * Close the old accounting file (if currently open) and then replace * it with file (if non-NULL). * * NOTE: acct_globals.lock MUST be held on entry and exit. @@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file) } } -/* - * sys_acct() is the only system call needed to implement process - * accounting. It takes the name of the file where accounting records - * should be written. If the filename is NULL, accounting will be - * shutdown. 
+/** + * sys_acct - enable/disable process accounting + * @name: file name for accounting records or NULL to shutdown accounting + * + * Returns 0 for success or negative errno values for failure. + * + * sys_acct() is the only system call needed to implement process + * accounting. It takes the name of the file where accounting records + * should be written. If the filename is NULL, accounting will be + * shutdown. */ asmlinkage long sys_acct(const char __user *name) { @@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name) return (PTR_ERR(tmp)); } /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(tmp, O_WRONLY|O_APPEND, 0); + file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); putname(tmp); if (IS_ERR(file)) { return (PTR_ERR(file)); @@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name) return (0); } -/* - * If the accouting is turned on for a file in the filesystem pointed - * to by sb, turn accouting off. +/** + * acct_auto_close - turn off a filesystem's accounting if it is on + * @sb: super block for the filesystem + * + * If the accounting is turned on for a file in the filesystem pointed + * to by sb, turn accounting off. 
*/ void acct_auto_close(struct super_block *sb) { @@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file) set_fs(fs); } -/* +/** * acct_process - now just a wrapper around do_acct_process + * @exitcode: task exit code + * + * handles process accounting for an exiting task */ void acct_process(long exitcode) { @@ -530,9 +541,9 @@ void acct_process(long exitcode) } -/* - * acct_update_integrals - * - update mm integral fields in task_struct +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting */ void acct_update_integrals(struct task_struct *tsk) { @@ -542,14 +553,14 @@ void acct_update_integrals(struct task_struct *tsk) if (delta == 0) return; tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } } -/* - * acct_clear_integrals - * - clear the mm integral fields in task_struct +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared */ void acct_clear_integrals(struct task_struct *tsk) { diff --git a/kernel/audit.c b/kernel/audit.c index ef35166fdc29..0c56320d38dc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -79,6 +79,8 @@ static int audit_rate_limit; /* Number of outstanding audit_buffers allowed. */ static int audit_backlog_limit = 64; +static int audit_backlog_wait_time = 60 * HZ; +static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. 
*/ uid_t audit_sig_uid = -1; @@ -106,18 +108,12 @@ static LIST_HEAD(audit_freelist); static struct sk_buff_head audit_skb_queue; static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); - -/* There are three lists of rules -- one to search at task creation - * time, one to search at syscall entry time, and another to search at - * syscall exit time. */ -static LIST_HEAD(audit_tsklist); -static LIST_HEAD(audit_entlist); -static LIST_HEAD(audit_extlist); +static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); /* The netlink socket is only to be read by 1 CPU, which lets us assume * that list additions and deletions never happen simultaneously in * auditsc.c */ -static DECLARE_MUTEX(audit_netlink_sem); +DECLARE_MUTEX(audit_netlink_sem); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -137,6 +133,7 @@ struct audit_buffer { struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ + gfp_t gfp_mask; }; static void audit_set_pid(struct audit_buffer *ab, pid_t pid) @@ -145,11 +142,6 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid) nlh->nlmsg_pid = pid; } -struct audit_entry { - struct list_head list; - struct audit_rule rule; -}; - static void audit_panic(const char *message) { switch (audit_failure) @@ -233,7 +225,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid) { int old = audit_rate_limit; audit_rate_limit = limit; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_rate_limit=%d old=%d by auid=%u", audit_rate_limit, old, loginuid); return old; @@ -243,7 +235,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid) { int old = audit_backlog_limit; audit_backlog_limit = limit; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_backlog_limit=%d 
old=%d by auid=%u", audit_backlog_limit, old, loginuid); return old; @@ -255,7 +247,7 @@ static int audit_set_enabled(int state, uid_t loginuid) if (state != 0 && state != 1) return -EINVAL; audit_enabled = state; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_enabled=%d old=%d by auid=%u", audit_enabled, old, loginuid); return old; @@ -269,7 +261,7 @@ static int audit_set_failure(int state, uid_t loginuid) && state != AUDIT_FAIL_PANIC) return -EINVAL; audit_failure = state; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_failure=%d old=%d by auid=%u", audit_failure, old, loginuid); return old; @@ -281,6 +273,7 @@ int kauditd_thread(void *dummy) while (1) { skb = skb_dequeue(&audit_skb_queue); + wake_up(&audit_backlog_wait); if (skb) { if (audit_pid) { int err = netlink_unicast(audit_sock, skb, audit_pid, 0); @@ -290,7 +283,7 @@ int kauditd_thread(void *dummy) audit_pid = 0; } } else { - printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0)); + printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); kfree_skb(skb); } } else { @@ -423,7 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; audit_pid = status_get->pid; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_pid=%d old=%d by auid=%u", audit_pid, old, loginuid); } @@ -435,15 +428,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - ab = audit_log_start(NULL, msg_type); - if (!ab) - break; /* audit_panic has been called */ - audit_log_format(ab, - "user pid=%d uid=%u auid=%u" - " msg='%.1024s'", - pid, uid, loginuid, (char *)data); - audit_set_pid(ab, pid); - audit_log_end(ab); + if (!audit_enabled && msg_type != AUDIT_USER_AVC) + return 0; + + err = 
audit_filter_user(&NETLINK_CB(skb), msg_type); + if (err == 1) { + err = 0; + ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + if (ab) { + audit_log_format(ab, + "user pid=%d uid=%u auid=%u msg='%.1024s'", + pid, uid, loginuid, (char *)data); + audit_set_pid(ab, pid); + audit_log_end(ab); + } + } break; case AUDIT_ADD: case AUDIT_DEL: @@ -514,7 +513,8 @@ static int __init audit_init(void) { printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, + THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); @@ -522,7 +522,7 @@ static int __init audit_init(void) skb_queue_head_init(&audit_skb_queue); audit_initialized = 1; audit_enabled = audit_default; - audit_log(NULL, AUDIT_KERNEL, "initialized"); + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); return 0; } __initcall(audit_init); @@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab) } static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, - int gfp_mask, int type) + gfp_t gfp_mask, int type) { unsigned long flags; struct audit_buffer *ab = NULL; @@ -586,6 +586,7 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, goto err; ab->ctx = ctx; + ab->gfp_mask = gfp_mask; nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); nlh->nlmsg_type = type; nlh->nlmsg_flags = 0; @@ -605,26 +606,27 @@ err: * (timestamp,serial) tuple is unique for each syscall and is live from * syscall entry to syscall exit. * - * Atomic values are only guaranteed to be 24-bit, so we count down. - * * NOTE: Another possibility is to store the formatted records off the * audit context (for those records that have a context), and emit them * all at syscall exit. 
However, this could delay the reporting of * significant errors until syscall exit (or never, if the system * halts). */ + unsigned int audit_serial(void) { - static atomic_t serial = ATOMIC_INIT(0xffffff); - unsigned int a, b; + static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; + static unsigned int serial = 0; + + unsigned long flags; + unsigned int ret; + spin_lock_irqsave(&serial_lock, flags); do { - a = atomic_read(&serial); - if (atomic_dec_and_test(&serial)) - atomic_set(&serial, 0xffffff); - b = atomic_read(&serial); - } while (b != a - 1); + ret = ++serial; + } while (unlikely(!ret)); + spin_unlock_irqrestore(&serial_lock, flags); - return 0xffffff - b; + return ret; } static inline void audit_get_stamp(struct audit_context *ctx, @@ -644,17 +646,43 @@ static inline void audit_get_stamp(struct audit_context *ctx, * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ -struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) + +struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, + int type) { struct audit_buffer *ab = NULL; struct timespec t; unsigned int serial; + int reserve; + unsigned long timeout_start = jiffies; if (!audit_initialized) return NULL; - if (audit_backlog_limit - && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { + if (gfp_mask & __GFP_WAIT) + reserve = 0; + else + reserve = 5; /* Allow atomic callers to go up to five + entries over the normal backlog limit */ + + while (audit_backlog_limit + && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { + if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time + && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { + + /* Wait for auditd to drain the queue a little */ + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&audit_backlog_wait, &wait); + + if 
(audit_backlog_limit && + skb_queue_len(&audit_skb_queue) > audit_backlog_limit) + schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&audit_backlog_wait, &wait); + continue; + } if (audit_rate_check()) printk(KERN_WARNING "audit: audit_backlog=%d > " @@ -662,10 +690,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); + audit_backlog_wait_time = audit_backlog_wait_overflow; + wake_up(&audit_backlog_wait); return NULL; } - ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type); + ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; @@ -689,7 +719,7 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) { struct sk_buff *skb = ab->skb; int ret = pskb_expand_head(skb, skb_headroom(skb), extra, - GFP_ATOMIC); + ab->gfp_mask); if (ret < 0) { audit_log_lost("out of memory in audit_expand"); return 0; @@ -808,7 +838,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, audit_log_format(ab, " %s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ - path = kmalloc(PATH_MAX+11, GFP_KERNEL); + path = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!path) { audit_log_format(ab, "<no memory>"); return; @@ -840,7 +870,7 @@ void audit_log_end(struct audit_buffer *ab) ab->skb = NULL; wake_up_interruptible(&kauditd_wait); } else { - printk("%s\n", ab->skb->data + NLMSG_SPACE(0)); + printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); } } audit_buffer_free(ab); @@ -849,12 +879,13 @@ void audit_log_end(struct audit_buffer *ab) /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, int type, const char *fmt, ...) 
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, + const char *fmt, ...) { struct audit_buffer *ab; va_list args; - ab = audit_log_start(ctx, type); + ab = audit_log_start(ctx, gfp_mask, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e75f84e1a1a0..d8a68509e729 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -39,6 +39,9 @@ #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> +#include <linux/kthread.h> +#include <linux/netlink.h> +#include <linux/compiler.h> #include <asm/unistd.h> /* 0 = no checking @@ -95,6 +98,7 @@ struct audit_names { uid_t uid; gid_t gid; dev_t rdev; + unsigned flags; }; struct audit_aux_data { @@ -167,9 +171,16 @@ struct audit_context { /* There are three lists of rules -- one to search at task creation * time, one to search at syscall entry time, and another to search at * syscall exit time. */ -static LIST_HEAD(audit_tsklist); -static LIST_HEAD(audit_entlist); -static LIST_HEAD(audit_extlist); +static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_filter_list[0]), + LIST_HEAD_INIT(audit_filter_list[1]), + LIST_HEAD_INIT(audit_filter_list[2]), + LIST_HEAD_INIT(audit_filter_list[3]), + LIST_HEAD_INIT(audit_filter_list[4]), +#if AUDIT_NR_FILTERS != 5 +#error Fix audit_filter_list initialiser +#endif +}; struct audit_entry { struct list_head list; @@ -179,9 +190,36 @@ struct audit_entry { extern int audit_pid; +/* Copy rule from user-space to kernel-space. Called from + * audit_add_rule during AUDIT_ADD. 
*/ +static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +{ + int i; + + if (s->action != AUDIT_NEVER + && s->action != AUDIT_POSSIBLE + && s->action != AUDIT_ALWAYS) + return -1; + if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) + return -1; + if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS) + return -1; + + d->flags = s->flags; + d->action = s->action; + d->field_count = s->field_count; + for (i = 0; i < d->field_count; i++) { + d->fields[i] = s->fields[i]; + d->values[i] = s->values[i]; + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; + return 0; +} + /* Check to see if two rules are identical. It is called from + * audit_add_rule during AUDIT_ADD and * audit_del_rule during AUDIT_DEL. */ -static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) +static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) { int i; @@ -210,19 +248,37 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) /* Note that audit_add_rule and audit_del_rule are called via * audit_receive() in audit.c, and are protected by * audit_netlink_sem. */ -static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_add_rule(struct audit_rule *rule, + struct list_head *list) { - if (entry->rule.flags & AUDIT_PREPEND) { - entry->rule.flags &= ~AUDIT_PREPEND; + struct audit_entry *entry; + + /* Do not use the _rcu iterator here, since this is the only + * addition routine. 
*/ + list_for_each_entry(entry, list, list) { + if (!audit_compare_rule(rule, &entry->rule)) { + return -EEXIST; + } + } + + if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) + return -ENOMEM; + if (audit_copy_rule(&entry->rule, rule)) { + kfree(entry); + return -EINVAL; + } + + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + entry->rule.flags &= ~AUDIT_FILTER_PREPEND; list_add_rcu(&entry->list, list); } else { list_add_tail_rcu(&entry->list, list); } + return 0; } -static void audit_free_rule(struct rcu_head *head) +static inline void audit_free_rule(struct rcu_head *head) { struct audit_entry *e = container_of(head, struct audit_entry, rcu); kfree(e); @@ -245,82 +301,82 @@ static inline int audit_del_rule(struct audit_rule *rule, return 0; } } - return -EFAULT; /* No matching rule */ + return -ENOENT; /* No matching rule */ } -/* Copy rule from user-space to kernel-space. Called during - * AUDIT_ADD. */ -static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +static int audit_list_rules(void *_dest) { + int pid, seq; + int *dest = _dest; + struct audit_entry *entry; int i; - if (s->action != AUDIT_NEVER - && s->action != AUDIT_POSSIBLE - && s->action != AUDIT_ALWAYS) - return -1; - if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) - return -1; + pid = dest[0]; + seq = dest[1]; + kfree(dest); - d->flags = s->flags; - d->action = s->action; - d->field_count = s->field_count; - for (i = 0; i < d->field_count; i++) { - d->fields[i] = s->fields[i]; - d->values[i] = s->values[i]; + down(&audit_netlink_sem); + + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_sem held. 
*/ + for (i=0; i<AUDIT_NR_FILTERS; i++) { + list_for_each_entry(entry, &audit_filter_list[i], list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; + audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + + up(&audit_netlink_sem); return 0; } int audit_receive_filter(int type, int pid, int uid, int seq, void *data, uid_t loginuid) { - u32 flags; - struct audit_entry *entry; + struct task_struct *tsk; + int *dest; int err = 0; + unsigned listnr; switch (type) { case AUDIT_LIST: - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_sem held. */ - list_for_each_entry(entry, &audit_tsklist, list) - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); - list_for_each_entry(entry, &audit_entlist, list) - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); - list_for_each_entry(entry, &audit_extlist, list) - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); - audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + /* We can't just spew out the rules here because we might fill + * the available socket buffer space and deadlock waiting for + * auditctl to read from it... 
which isn't ever going to + * happen if we're actually running in the context of auditctl + * trying to _send_ the stuff */ + + dest = kmalloc(2 * sizeof(int), GFP_KERNEL); + if (!dest) + return -ENOMEM; + dest[0] = pid; + dest[1] = seq; + + tsk = kthread_run(audit_list_rules, dest, "audit_list_rules"); + if (IS_ERR(tsk)) { + kfree(dest); + err = PTR_ERR(tsk); + } break; case AUDIT_ADD: - if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) - return -ENOMEM; - if (audit_copy_rule(&entry->rule, data)) { - kfree(entry); + listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; + if (listnr >= AUDIT_NR_FILTERS) return -EINVAL; - } - flags = entry->rule.flags; - if (!err && (flags & AUDIT_PER_TASK)) - err = audit_add_rule(entry, &audit_tsklist); - if (!err && (flags & AUDIT_AT_ENTRY)) - err = audit_add_rule(entry, &audit_entlist); - if (!err && (flags & AUDIT_AT_EXIT)) - err = audit_add_rule(entry, &audit_extlist); - audit_log(NULL, AUDIT_CONFIG_CHANGE, - "auid=%u added an audit rule\n", loginuid); + + err = audit_add_rule(data, &audit_filter_list[listnr]); + if (!err) + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u added an audit rule\n", loginuid); break; case AUDIT_DEL: - flags =((struct audit_rule *)data)->flags; - if (!err && (flags & AUDIT_PER_TASK)) - err = audit_del_rule(data, &audit_tsklist); - if (!err && (flags & AUDIT_AT_ENTRY)) - err = audit_del_rule(data, &audit_entlist); - if (!err && (flags & AUDIT_AT_EXIT)) - err = audit_del_rule(data, &audit_extlist); - audit_log(NULL, AUDIT_CONFIG_CHANGE, - "auid=%u removed an audit rule\n", loginuid); + listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; + if (listnr >= AUDIT_NR_FILTERS) + return -EINVAL; + + err = audit_del_rule(data, &audit_filter_list[listnr]); + if (!err) + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u removed an audit rule\n", loginuid); break; default: return -EINVAL; @@ -384,8 +440,12 @@ static int audit_filter_rules(struct task_struct 
*tsk, result = (ctx->return_code == value); break; case AUDIT_SUCCESS: - if (ctx && ctx->return_valid) - result = (ctx->return_valid == AUDITSC_SUCCESS); + if (ctx && ctx->return_valid) { + if (value) + result = (ctx->return_valid == AUDITSC_SUCCESS); + else + result = (ctx->return_valid == AUDITSC_FAILURE); + } break; case AUDIT_DEVMAJOR: if (ctx) { @@ -454,7 +514,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) enum audit_state state; rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_tsklist, list) { + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { rcu_read_unlock(); return state; @@ -474,20 +534,84 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, struct list_head *list) { struct audit_entry *e; + enum audit_state state; + + if (audit_pid && tsk->tgid == audit_pid) + return AUDIT_DISABLED; + + rcu_read_lock(); + if (!list_empty(list)) { + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit + && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + rcu_read_unlock(); + return state; + } + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +static int audit_filter_user_rules(struct netlink_skb_parms *cb, + struct audit_rule *rule, + enum audit_state *state) +{ + int i; + + for (i = 0; i < rule->field_count; i++) { + u32 field = rule->fields[i] & ~AUDIT_NEGATE; + u32 value = rule->values[i]; + int result = 0; + + switch (field) { + case AUDIT_PID: + result = (cb->creds.pid == value); + break; + case AUDIT_UID: + result = (cb->creds.uid == value); + break; + case AUDIT_GID: + result = (cb->creds.gid == value); + break; + case AUDIT_LOGINUID: + result = (cb->loginuid == value); + break; + } + + if (rule->fields[i] & AUDIT_NEGATE) + result = !result; + if (!result) + return 0; + } + switch (rule->action) { + case AUDIT_NEVER: 
*state = AUDIT_DISABLED; break; + case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +int audit_filter_user(struct netlink_skb_parms *cb, int type) +{ + struct audit_entry *e; enum audit_state state; - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); + int ret = 1; rcu_read_lock(); - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit - && audit_filter_rules(tsk, &e->rule, ctx, &state)) { - rcu_read_unlock(); - return state; + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { + if (audit_filter_user_rules(cb, &e->rule, &state)) { + if (state == AUDIT_DISABLED) + ret = 0; + break; } } rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; + + return ret; /* Audit by default */ } /* This should be called with task_lock() held. */ @@ -504,7 +628,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, if (context->in_syscall && !context->auditable) { enum audit_state state; - state = audit_filter_syscall(tsk, context, &audit_extlist); + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); if (state == AUDIT_RECORD_CONTEXT) context->auditable = 1; } @@ -679,13 +803,13 @@ static void audit_log_task_info(struct audit_buffer *ab) up_read(&mm->mmap_sem); } -static void audit_log_exit(struct audit_context *context) +static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) { int i; struct audit_buffer *ab; struct audit_aux_data *aux; - ab = audit_log_start(context, AUDIT_SYSCALL); + ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); if (!ab) return; /* audit_panic has been called */ audit_log_format(ab, "arch=%x syscall=%d", @@ -717,7 +841,7 @@ static void audit_log_exit(struct audit_context *context) for (aux = context->aux; aux; aux = aux->next) { - ab = audit_log_start(context, aux->type); + ab = audit_log_start(context, GFP_KERNEL, 
aux->type); if (!ab) continue; /* audit_panic has been called */ @@ -754,14 +878,14 @@ static void audit_log_exit(struct audit_context *context) } if (context->pwd && context->pwdmnt) { - ab = audit_log_start(context, AUDIT_CWD); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); audit_log_end(ab); } } for (i = 0; i < context->name_count; i++) { - ab = audit_log_start(context, AUDIT_PATH); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ @@ -770,6 +894,8 @@ static void audit_log_exit(struct audit_context *context) audit_log_format(ab, " name="); audit_log_untrustedstring(ab, context->names[i].name); } + audit_log_format(ab, " flags=%x\n", context->names[i].flags); + if (context->names[i].ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" " ouid=%u ogid=%u rdev=%02x:%02x", @@ -799,9 +925,11 @@ void audit_free(struct task_struct *tsk) return; /* Check for system calls that do not go through the exit - * function (e.g., exit_group), then free context block. */ - if (context->in_syscall && context->auditable && context->pid != audit_pid) - audit_log_exit(context); + * function (e.g., exit_group), then free context block. 
+ * We use GFP_ATOMIC here because we might be doing this + * in the context of the idle thread */ + if (context->in_syscall && context->auditable) + audit_log_exit(context, GFP_ATOMIC); audit_free_context(context); } @@ -876,11 +1004,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, state = context->state; if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) - state = audit_filter_syscall(tsk, context, &audit_entlist); + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); if (likely(state == AUDIT_DISABLED)) return; - context->serial = audit_serial(); + context->serial = 0; context->ctime = CURRENT_TIME; context->in_syscall = 1; context->auditable = !!(state == AUDIT_RECORD_CONTEXT); @@ -903,10 +1031,10 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) /* Not having a context here is ok, since the parent may have * called __put_task_struct. */ if (likely(!context)) - return; + goto out; - if (context->in_syscall && context->auditable && context->pid != audit_pid) - audit_log_exit(context); + if (context->in_syscall && context->auditable) + audit_log_exit(context, GFP_KERNEL); context->in_syscall = 0; context->auditable = 0; @@ -919,9 +1047,9 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) } else { audit_free_names(context); audit_free_aux(context); - audit_zero_context(context, context->state); tsk->audit_context = context; } + out: put_task_struct(tsk); } @@ -996,7 +1124,7 @@ void audit_putname(const char *name) /* Store the inode and device from a lookup. Called from * fs/namei.c:path_lookup(). 
*/ -void audit_inode(const char *name, const struct inode *inode) +void audit_inode(const char *name, const struct inode *inode, unsigned flags) { int idx; struct audit_context *context = current->audit_context; @@ -1022,17 +1150,20 @@ void audit_inode(const char *name, const struct inode *inode) ++context->ino_count; #endif } - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; + context->names[idx].flags = flags; + context->names[idx].ino = inode->i_ino; + context->names[idx].dev = inode->i_sb->s_dev; + context->names[idx].mode = inode->i_mode; + context->names[idx].uid = inode->i_uid; + context->names[idx].gid = inode->i_gid; + context->names[idx].rdev = inode->i_rdev; } void auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial) { + if (!ctx->serial) + ctx->serial = audit_serial(); t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; @@ -1044,7 +1175,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) if (task->audit_context) { struct audit_buffer *ab; - ab = audit_log_start(NULL, AUDIT_LOGIN); + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (ab) { audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u", @@ -1153,7 +1284,7 @@ void audit_signal_info(int sig, struct task_struct *t) extern pid_t audit_sig_pid; extern uid_t audit_sig_uid; - if (unlikely(audit_pid && t->pid == audit_pid)) { + if (unlikely(audit_pid && t->tgid == audit_pid)) { if (sig == SIGTERM || sig == SIGHUP) { struct audit_context *ctx = current->audit_context; audit_sig_pid = current->pid; diff --git a/kernel/compat.c b/kernel/compat.c index ddfcaaa86623..102296e21ea8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct 
restart_block *restart) if (!time_after(expire, now)) return 0; - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire - now); + expire = schedule_timeout_interruptible(expire - now); if (expire == 0) return 0; @@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, return -EINVAL; expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + expire = schedule_timeout_interruptible(expire); if (expire == 0) return 0; @@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); spin_lock_irq(&current->sighand->siglock); sig = dequeue_signal(current, &s, &info); diff --git a/kernel/cpu.c b/kernel/cpu.c index 53d8263ae12e..3619e939182e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,7 @@ /* This protects CPUs going up and down... */ DECLARE_MUTEX(cpucontrol); +EXPORT_SYMBOL_GPL(cpucontrol); static struct notifier_block *cpu_chain; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 805fb9097318..5a737ed9dac7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -32,6 +32,7 @@ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/list.h> +#include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mount.h> @@ -60,6 +61,9 @@ struct cpuset { cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + /* + * Count is atomic so can incr (fork) or decr (exit) without a lock. 
+ */ atomic_t count; /* count tasks using this cpuset */ /* @@ -142,44 +146,91 @@ static struct vfsmount *cpuset_mount; static struct super_block *cpuset_sb = NULL; /* - * cpuset_sem should be held by anyone who is depending on the children - * or sibling lists of any cpuset, or performing non-atomic operations - * on the flags or *_allowed values of a cpuset, such as raising the - * CS_REMOVED flag bit iff it is not already raised, or reading and - * conditionally modifying the *_allowed values. One kernel global - * cpuset semaphore should be sufficient - these things don't change - * that much. - * - * The code that modifies cpusets holds cpuset_sem across the entire - * operation, from cpuset_common_file_write() down, single threading - * all cpuset modifications (except for counter manipulations from - * fork and exit) across the system. This presumes that cpuset - * modifications are rare - better kept simple and safe, even if slow. - * - * The code that reads cpusets, such as in cpuset_common_file_read() - * and below, only holds cpuset_sem across small pieces of code, such - * as when reading out possibly multi-word cpumasks and nodemasks, as - * the risks are less, and the desire for performance a little greater. - * The proc_cpuset_show() routine needs to hold cpuset_sem to insure - * that no cs->dentry is NULL, as it walks up the cpuset tree to root. - * - * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't - * (usually) grab cpuset_sem. These are the two most performance - * critical pieces of code here. The exception occurs on exit(), - * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * We have two global cpuset semaphores below. They can nest. + * It is ok to first take manage_sem, then nest callback_sem. We also + * require taking task_lock() when dereferencing a tasks cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both semaphores to modify cpusets. 
If a task + * holds manage_sem, then it blocks others wanting that semaphore, + * ensuring that it is the only task able to also acquire callback_sem + * and be able to modify cpusets. It can perform various checks on + * the cpuset structure first, knowing nothing will change. It can + * also allocate memory while just holding manage_sem. While it is + * performing these checks, various callback routines can briefly + * acquire callback_sem to query cpusets. Once it is ready to make + * the changes, it takes callback_sem, blocking everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_sem, as that would risk double tripping on callback_sem + * from one of the callbacks into the cpuset code from within + * __alloc_pages(). + * + * If a task is only holding callback_sem, then it has read-only + * access to cpusets. + * + * The task_struct fields mems_allowed and mems_generation may only + * be accessed in the context of that task, so require no locks. + * + * Any task can increment and decrement the count field without lock. + * So in general, code holding manage_sem or callback_sem can't rely + * on the count field not changing. However, if the count goes to + * zero, then only attach_task(), which holds both semaphores, can + * increment it again. Because a count of zero means that no tasks + * are currently attached, therefore there is no way a task attached + * to that cpuset can fork (the other way to increment the count). + * So code holding manage_sem or callback_sem can safely assume that + * if the count is zero, it will stay zero. Similarly, if a task + * holds manage_sem or callback_sem on a cpuset with zero count, it + * knows that the cpuset won't be removed, as cpuset_rmdir() needs + * both of those semaphores. 
+ * + * A possible optimization to improve parallelism would be to make + * callback_sem a R/W semaphore (rwsem), allowing the callback routines + * to proceed in parallel, with read access, until the holder of + * manage_sem needed to take this rwsem for exclusive write access + * and modify some cpusets. + * + * The cpuset_common_file_write handler for operations that modify + * the cpuset hierarchy holds manage_sem across the entire operation, + * single threading all such cpuset modifications across the system. + * + * The cpuset_common_file_read() handlers only hold callback_sem across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + * + * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't + * (usually) take either semaphore. These are the two most performance + * critical pieces of code here. The exception occurs on cpuset_exit(), + * when a task in a notify_on_release cpuset exits. Then manage_sem * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * - * A cpuset can only be deleted if both its 'count' of using tasks is - * zero, and its list of 'children' cpusets is empty. Since all tasks - * in the system use _some_ cpuset, and since there is always at least - * one task in the system (init, pid == 1), therefore, top_cpuset - * always has either children cpusets and/or using tasks. So no need - * for any special hack to ensure that top_cpuset cannot be deleted. + * A cpuset can only be deleted if both its 'count' of using tasks + * is zero, and its list of 'children' cpusets is empty. Since all + * tasks in the system use _some_ cpuset, and since there is always at + * least one task in the system (init, pid == 1), therefore, top_cpuset + * always has either children cpusets and/or using tasks. 
So we don't + * need a special hack to ensure that top_cpuset cannot be deleted. + * + * The above "Tale of Two Semaphores" would be complete, but for: + * + * The task_lock() exception + * + * The need for this exception arises from the action of attach_task(), + * which overwrites one tasks cpuset pointer with another. It does + * so using both semaphores, however there are several performance + * critical places that need to reference task->cpuset without the + * expense of grabbing a system global semaphore. Therefore except as + * noted below, when dereferencing or, as in attach_task(), modifying + * a tasks cpuset pointer we use task_lock(), which acts on a spinlock + * (task->alloc_lock) already in the task_struct routinely used for + * such matters. */ -static DECLARE_MUTEX(cpuset_sem); +static DECLARE_MUTEX(manage_sem); +static DECLARE_MUTEX(callback_sem); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -354,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) } /* - * Call with cpuset_sem held. Writes path of cpuset into buf. + * Call with manage_sem held. Writes path of cpuset into buf. * Returns 0 on success, -errno on error. */ @@ -398,21 +449,32 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * - * Note final arg to call_usermodehelper() is 0 - that means - * don't wait. Since we are holding the global cpuset_sem here, - * and we are asking another thread (started from keventd) to rmdir a - * cpuset, we can't wait - or we'd deadlock with the removing thread - * on cpuset_sem. + * The final arg to call_usermodehelper() is 0, which means don't + * wait. The separate /sbin/cpuset_release_agent task is forked by + * call_usermodehelper(), then control in this thread returns here, + * without waiting for the release agent task. 
We don't bother to + * wait because the caller of this routine has no use for the exit + * status of the /sbin/cpuset_release_agent task, so no sense holding + * our caller up for that. + * + * When we had only one cpuset semaphore, we had to call this + * without holding it, to avoid deadlock when call_usermodehelper() + * allocated memory. With two locks, we could now call this while + * holding manage_sem, but we still don't, so as to minimize + * the time manage_sem is held. */ -static int cpuset_release_agent(char *cpuset_str) +static void cpuset_release_agent(const char *pathbuf) { char *argv[3], *envp[3]; int i; + if (!pathbuf) + return; + i = 0; argv[i++] = "/sbin/cpuset_release_agent"; - argv[i++] = cpuset_str; + argv[i++] = (char *)pathbuf; argv[i] = NULL; i = 0; @@ -421,17 +483,29 @@ static int cpuset_release_agent(char *cpuset_str) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - return call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, 0); + kfree(pathbuf); } /* * Either cs->count of using tasks transitioned to zero, or the * cs->children list of child cpusets just became empty. If this * cs is notify_on_release() and now both the user count is zero and - * the list of children is empty, send notice to user land. + * the list of children is empty, prepare cpuset path in a kmalloc'd + * buffer, to be returned via ppathbuf, so that the caller can invoke + * cpuset_release_agent() with it later on, once manage_sem is dropped. + * Call here with manage_sem held. + * + * This check_for_release() routine is responsible for kmalloc'ing + * pathbuf. The above cpuset_release_agent() is responsible for + * kfree'ing pathbuf. 
The caller of these routines is responsible + * for providing a pathbuf pointer, initialized to NULL, then + * calling check_for_release() with manage_sem held and the address + * of the pathbuf pointer, then dropping manage_sem, then calling + * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ -static void check_for_release(struct cpuset *cs) +static void check_for_release(struct cpuset *cs, char **ppathbuf) { if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && list_empty(&cs->children)) { @@ -441,10 +515,9 @@ static void check_for_release(struct cpuset *cs) if (!buf) return; if (cpuset_path(cs, buf, PAGE_SIZE) < 0) - goto out; - cpuset_release_agent(buf); -out: - kfree(buf); + kfree(buf); + else + *ppathbuf = buf; } } @@ -459,7 +532,7 @@ out: * One way or another, we guarantee to return some non-empty subset * of cpu_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) @@ -483,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) * One way or another, we guarantee to return some non-empty subset * of node_online_map. * - * Call with cpuset_sem held. + * Call with callback_sem held. */ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) @@ -498,31 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) } /* - * Refresh current tasks mems_allowed and mems_generation from - * current tasks cpuset. Call with cpuset_sem held. - * - * Be sure to call refresh_mems() on any cpuset operation which - * (1) holds cpuset_sem, and (2) might possibly alloc memory. - * Call after obtaining cpuset_sem lock, before any possible - * allocation. 
Otherwise one risks trying to allocate memory - * while the task cpuset_mems_generation is not the same as - * the mems_generation in its cpuset, which would deadlock on - * cpuset_sem in cpuset_update_current_mems_allowed(). - * - * Since we hold cpuset_sem, once refresh_mems() is called, the - * test (current->cpuset_mems_generation != cs->mems_generation) - * in cpuset_update_current_mems_allowed() will remain false, - * until we drop cpuset_sem. Anyone else who would change our - * cpusets mems_generation needs to lock cpuset_sem first. + * Refresh current tasks mems_allowed and mems_generation from current + * tasks cpuset. + * + * Call without callback_sem or task_lock() held. May be called with + * or without manage_sem held. Will acquire task_lock() and might + * acquire callback_sem during call. + * + * The task_lock() is required to dereference current->cpuset safely. + * Without it, we could pick up the pointer value of current->cpuset + * in one instruction, and then attach_task could give us a different + * cpuset, and then the cpuset we had could be removed and freed, + * and then on our next instruction, we could dereference a no longer + * valid cpuset pointer to get its mems_generation field. + * + * This routine is needed to update the per-task mems_allowed data, + * within the tasks context, when it is trying to allocate memory + * (in various mm/mempolicy.c routines) and notices that some other + * task has been modifying its cpuset. 
*/ static void refresh_mems(void) { - struct cpuset *cs = current->cpuset; + int my_cpusets_mem_gen; + + task_lock(current); + my_cpusets_mem_gen = current->cpuset->mems_generation; + task_unlock(current); + + if (current->cpuset_mems_generation != my_cpusets_mem_gen) { + struct cpuset *cs; + nodemask_t oldmem = current->mems_allowed; - if (current->cpuset_mems_generation != cs->mems_generation) { + down(&callback_sem); + task_lock(current); + cs = current->cpuset; guarantee_online_mems(cs, &current->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; + task_unlock(current); + up(&callback_sem); + if (!nodes_equal(oldmem, current->mems_allowed)) + numa_policy_rebind(&oldmem, &current->mems_allowed); } } @@ -531,7 +620,7 @@ static void refresh_mems(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. + * are only set if the other's are set. Call holding manage_sem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -549,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_sem held. + * manage_sem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -603,9 +692,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * exclusive child cpusets * Build these two partitions by calling partition_sched_domains * - * Call with cpuset_sem held. May nest a call to the + * Call with manage_sem held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 
*/ + static void update_cpu_domains(struct cpuset *cur) { struct cpuset *c, *par = cur->parent; @@ -647,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) unlock_cpu_hotplug(); } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -663,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + down(&callback_sem); cs->cpus_allowed = trialcs.cpus_allowed; + up(&callback_sem); if (is_cpu_exclusive(cs) && !cpus_unchanged) update_cpu_domains(cs); return 0; } +/* + * Call with manage_sem held. May take callback_sem during call. + */ + static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; @@ -683,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval == 0) { + down(&callback_sem); cs->mems_allowed = trialcs.mems_allowed; atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); } return retval; } @@ -696,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) * CS_NOTIFY_ON_RELEASE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 + * + * Call with manage_sem held. 
*/ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -717,24 +821,35 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return err; cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); + down(&callback_sem); if (turning_on) set_bit(bit, &cs->flags); else clear_bit(bit, &cs->flags); + up(&callback_sem); if (cpu_exclusive_changed) update_cpu_domains(cs); return 0; } -static int attach_task(struct cpuset *cs, char *buf) +/* + * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly + * writing the path of the old cpuset in 'ppathbuf' if it needs to be + * notified on release. + * + * Call holding manage_sem. May take callback_sem and task_lock of + * the task 'pid' during call. + */ + +static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; - if (sscanf(buf, "%d", &pid) != 1) + if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; @@ -743,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *buf) read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); - if (!tsk) { + if (!tsk || tsk->flags & PF_EXITING) { read_unlock(&tasklist_lock); return -ESRCH; } @@ -761,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *buf) get_task_struct(tsk); } + down(&callback_sem); + task_lock(tsk); oldcs = tsk->cpuset; if (!oldcs) { task_unlock(tsk); + up(&callback_sem); put_task_struct(tsk); return -ESRCH; } @@ -775,9 +893,10 @@ static int attach_task(struct cpuset *cs, char *buf) guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + up(&callback_sem); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) - check_for_release(oldcs); + check_for_release(oldcs, ppathbuf); return 0; } @@ -801,6 +920,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us struct 
cftype *cft = __d_cft(file->f_dentry); cpuset_filetype_t type = cft->private; char *buffer; + char *pathbuf = NULL; int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ @@ -817,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - down(&cpuset_sem); + down(&manage_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -841,7 +961,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; case FILE_TASKLIST: - retval = attach_task(cs, buffer); + retval = attach_task(cs, buffer, &pathbuf); break; default: retval = -EINVAL; @@ -851,7 +971,8 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - up(&cpuset_sem); + up(&manage_sem); + cpuset_release_agent(pathbuf); out1: kfree(buffer); return retval; @@ -890,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - down(&cpuset_sem); + down(&callback_sem); mask = cs->cpus_allowed; - up(&cpuset_sem); + up(&callback_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -901,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - down(&cpuset_sem); + down(&callback_sem); mask = cs->mems_allowed; - up(&cpuset_sem); + up(&callback_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -917,8 +1038,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, char *page; ssize_t retval = 0; char *s; - char *start; - size_t n; if (!(page = (char *)__get_free_page(GFP_KERNEL))) return -ENOMEM; @@ -946,12 +1065,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, goto out; } *s++ = '\n'; - *s = '\0'; - start = page + *ppos; - n = s - start; - retval = n - copy_to_user(buf, start, min(n, nbytes)); - *ppos += retval; + 
retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); out: free_page((unsigned long)page); return retval; @@ -1002,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) return 0; } +/* + * cpuset_rename - Only allow simple rename of directories in place. + */ +static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) + return -ENOTDIR; + if (new_dentry->d_inode) + return -EEXIST; + if (old_dir != new_dir) + return -EIO; + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct file_operations cpuset_file_operations = { .read = cpuset_file_read, .write = cpuset_file_write, @@ -1014,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cpuset_mkdir, .rmdir = cpuset_rmdir, + .rename = cpuset_rename, }; static int cpuset_create_file(struct dentry *dentry, int mode) @@ -1117,7 +1248,9 @@ struct ctr_struct { /* * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. - * Return actual number of pids loaded. + * Return actual number of pids loaded. No need to task_lock(p) + * when reading out p->cpuset, as we don't really care if it changes + * on the next cycle, and we are not going to try to dereference it. */ static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) { @@ -1159,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) return cnt; } +/* + * Handle an open on 'tasks' file. Prepare a buffer listing the + * process id's of tasks currently attached to the cpuset being opened. + * + * Does not require any specific cpuset semaphores, and does not take any. 
+ */ static int cpuset_tasks_open(struct inode *unused, struct file *file) { struct cpuset *cs = __d_cs(file->f_dentry->d_parent); @@ -1306,7 +1445,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - down(&cpuset_sem); + down(&manage_sem); refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) @@ -1321,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) cs->parent = parent; + down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + up(&callback_sem); err = cpuset_create_dir(cs, name, mode); if (err < 0) goto err; /* - * Release cpuset_sem before cpuset_populate_dir() because it + * Release manage_sem before cpuset_populate_dir() because it * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. */ - up(&cpuset_sem); + up(&manage_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - up(&cpuset_sem); + up(&manage_sem); kfree(cs); return err; } @@ -1357,33 +1498,37 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *cs = dentry->d_fsdata; struct dentry *d; struct cpuset *parent; + char *pathbuf = NULL; /* the vfs holds both inode->i_sem already */ - down(&cpuset_sem); + down(&manage_sem); refresh_mems(); if (atomic_read(&cs->count) > 0) { - up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - up(&cpuset_sem); + up(&manage_sem); return -EBUSY; } parent = cs->parent; + down(&callback_sem); set_bit(CS_REMOVED, &cs->flags); if (is_cpu_exclusive(cs)) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ - if (list_empty(&parent->children)) - check_for_release(parent); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - 
up(&cpuset_sem); + up(&callback_sem); + if (list_empty(&parent->children)) + check_for_release(parent, &pathbuf); + up(&manage_sem); + cpuset_release_agent(pathbuf); return 0; } @@ -1442,16 +1587,26 @@ void __init cpuset_init_smp(void) * cpuset_fork - attach newly forked task to its parents cpuset. * @tsk: pointer to task_struct of forking parent process. * - * Description: By default, on fork, a task inherits its - * parent's cpuset. The pointer to the shared cpuset is - * automatically copied in fork.c by dup_task_struct(). - * This cpuset_fork() routine need only increment the usage - * counter in that cpuset. + * Description: A task inherits its parent's cpuset at fork(). + * + * A pointer to the shared cpuset was automatically copied in fork.c + * by dup_task_struct(). However, we ignore that copy, since it was + * not made under the protection of task_lock(), so might no longer be + * a valid cpuset pointer. attach_task() might have already changed + * current->cpuset, allowing the previously referenced cpuset to + * be removed and freed. Instead, we task_lock(current) and copy + * its present value of current->cpuset for our freshly forked child. + * + * At the point that cpuset_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. **/ -void cpuset_fork(struct task_struct *tsk) +void cpuset_fork(struct task_struct *child) { - atomic_inc(&tsk->cpuset->count); + task_lock(current); + child->cpuset = current->cpuset; + atomic_inc(&child->cpuset->count); + task_unlock(current); } /** @@ -1460,33 +1615,43 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * - * Note that cpusets marked notify_on_release force every task - * in them to take the global cpuset_sem semaphore when exiting. - * This could impact scaling on very large systems. Be reluctant - * to use notify_on_release cpusets where very high task exit - * scaling is required on large systems. 
- * - * Don't even think about derefencing 'cs' after the cpuset use - * count goes to zero, except inside a critical section guarded - * by the cpuset_sem semaphore. If you don't hold cpuset_sem, - * then a zero cpuset use count is a license to any other task to - * nuke the cpuset immediately. + * Note that cpusets marked notify_on_release force every task in + * them to take the global manage_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant to + * use notify_on_release cpusets where very high task exit scaling + * is required on large systems. + * + * Don't even think about dereferencing 'cs' after the cpuset use count + * goes to zero, except inside a critical section guarded by manage_sem + * or callback_sem. Otherwise a zero cpuset use count is a license to + * any other task to nuke the cpuset immediately, via cpuset_rmdir(). + * + * This routine has to take manage_sem, not callback_sem, because + * it is holding that semaphore while calling check_for_release(), + * which calls kmalloc(), so can't be called holding callback_sem. + * + * We don't need to task_lock() this reference to tsk->cpuset, + * because tsk is already marked PF_EXITING, so attach_task() won't + * mess with it. 
**/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - task_lock(tsk); + BUG_ON(!(tsk->flags & PF_EXITING)); + cs = tsk->cpuset; tsk->cpuset = NULL; - task_unlock(tsk); if (notify_on_release(cs)) { - down(&cpuset_sem); + char *pathbuf = NULL; + + down(&manage_sem); if (atomic_dec_and_test(&cs->count)) - check_for_release(cs); - up(&cpuset_sem); + check_for_release(cs, &pathbuf); + up(&manage_sem); + cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); } @@ -1506,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - down(&cpuset_sem); + down(&callback_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - up(&cpuset_sem); + up(&callback_sem); return mask; } @@ -1526,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void) * If the current tasks cpusets mems_allowed changed behind our backs, * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). + * + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Unless exiting, it will acquire + * task_lock(). Also might acquire callback_sem during call to + * refresh_mems(). */ void cpuset_update_current_mems_allowed(void) { - struct cpuset *cs = current->cpuset; + struct cpuset *cs; + int need_to_refresh = 0; + task_lock(current); + cs = current->cpuset; if (!cs) - return; /* task is exiting */ - if (current->cpuset_mems_generation != cs->mems_generation) { - down(&cpuset_sem); + goto done; + if (current->cpuset_mems_generation != cs->mems_generation) + need_to_refresh = 1; +done: + task_unlock(current); + if (need_to_refresh) refresh_mems(); - up(&cpuset_sem); - } } /** @@ -1570,23 +1744,135 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 0; } +/* + * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive + * ancestor to the specified cpuset. 
Call holding callback_sem. + * If no ancestor is mem_exclusive (an unusual configuration), then + * returns the root cpuset. + */ +static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) +{ + while (!is_mem_exclusive(cs) && cs->parent) + cs = cs->parent; + return cs; +} + /** - * cpuset_zone_allowed - is zone z allowed in current->mems_allowed - * @z: zone in question + * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * @z: is this zone on an allowed node? + * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) * - * Is zone z allowed in current->mems_allowed, or is - * the CPU in interrupt context? (zone is always allowed in this case) - */ -int cpuset_zone_allowed(struct zone *z) + * If we're in interrupt, yes, we can always allocate. If zone + * z's node is in our tasks mems_allowed, yes. If it's not a + * __GFP_HARDWALL request and this zone's nodes is in the nearest + * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * Otherwise, no. + * + * GFP_USER allocations are marked with the __GFP_HARDWALL bit, + * and do not allow allocations outside the current tasks cpuset. + * GFP_KERNEL allocations are not so marked, so can escape to the + * nearest mem_exclusive ancestor cpuset. + * + * Scanning up parent cpusets requires callback_sem. The __alloc_pages() + * routine only calls here with __GFP_HARDWALL bit _not_ set if + * it's a GFP_KERNEL allocation, and all nodes in the current tasks + * mems_allowed came up empty on the first pass over the zonelist. + * So only GFP_KERNEL allocations, if all nodes in the cpuset are + * short of memory, might require taking the callback_sem semaphore. + * + * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() + * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing + * hardwall cpusets - no allocation on a node outside the cpuset is + * allowed (unless in interrupt, of course). 
+ * + * The second loop doesn't even call here for GFP_ATOMIC requests + * (if the __alloc_pages() local variable 'wait' is set). That check + * and the checks below have the combined affect in the second loop of + * the __alloc_pages() routine that: + * in_interrupt - any node ok (current task context irrelevant) + * GFP_ATOMIC - any node ok + * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok + * GFP_USER - only nodes in current tasks mems allowed ok. + **/ + +int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + int node; /* node that zone z is on */ + const struct cpuset *cs; /* current cpuset ancestors */ + int allowed = 1; /* is allocation in zone z allowed? */ + + if (in_interrupt()) + return 1; + node = z->zone_pgdat->node_id; + if (node_isset(node, current->mems_allowed)) + return 1; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return 0; + + /* Not hardwall and node outside mems_allowed: scan up cpusets */ + down(&callback_sem); + + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return 1; + task_lock(current); + cs = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + + allowed = node_isset(node, cs->mems_allowed); + up(&callback_sem); + return allowed; +} + +/** + * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? + * @p: pointer to task_struct of some other task. + * + * Description: Return true if the nearest mem_exclusive ancestor + * cpusets of tasks @p and current overlap. Used by oom killer to + * determine if task @p's memory usage might impact the memory + * available to the current task. + * + * Acquires callback_sem - not suitable for calling from a fast path. + **/ + +int cpuset_excl_nodes_overlap(const struct task_struct *p) { - return in_interrupt() || - node_isset(z->zone_pgdat->node_id, current->mems_allowed); + const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ + int overlap = 0; /* do cpusets overlap? 
*/ + + down(&callback_sem); + + task_lock(current); + if (current->flags & PF_EXITING) { + task_unlock(current); + goto done; + } + cs1 = nearest_exclusive_ancestor(current->cpuset); + task_unlock(current); + + task_lock((struct task_struct *)p); + if (p->flags & PF_EXITING) { + task_unlock((struct task_struct *)p); + goto done; + } + cs2 = nearest_exclusive_ancestor(p->cpuset); + task_unlock((struct task_struct *)p); + + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); +done: + up(&callback_sem); + + return overlap; } /* * proc_cpuset_show() * - Print tasks cpuset path into seq_file. * - Used for /proc/<pid>/cpuset. + * - No need to task_lock(tsk) on this tsk->cpuset reference, as it + * doesn't really matter if tsk->cpuset changes after we read it, + * and we take manage_sem, keeping attach_task() from changing it + * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *v) @@ -1601,10 +1887,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - down(&cpuset_sem); - task_lock(tsk); + down(&manage_sem); cs = tsk->cpuset; - task_unlock(tsk); if (!cs) { retval = -EINVAL; goto out; @@ -1616,7 +1900,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - up(&cpuset_sem); + up(&manage_sem); kfree(buf); return retval; } diff --git a/kernel/exit.c b/kernel/exit.c index 9d1b10ed0135..537394b25e8d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -368,17 +368,25 @@ EXPORT_SYMBOL(daemonize); static inline void close_files(struct files_struct * files) { int i, j; + struct fdtable *fdt; j = 0; + + /* + * It is safe to dereference the fd table without RCU or + * ->file_lock because this is the last reference to the + * files structure. 
+ */ + fdt = files_fdtable(files); for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= files->max_fdset || i >= files->max_fds) + if (i >= fdt->max_fdset || i >= fdt->max_fds) break; - set = files->open_fds->fds_bits[j++]; + set = fdt->open_fds->fds_bits[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&files->fd[i], NULL); + struct file * file = xchg(&fdt->fd[i], NULL); if (file) filp_close(file, files); } @@ -403,18 +411,22 @@ struct files_struct *get_files_struct(struct task_struct *task) void fastcall put_files_struct(struct files_struct *files) { + struct fdtable *fdt; + if (atomic_dec_and_test(&files->count)) { close_files(files); /* * Free the fd and fdset arrays if we expanded them. + * If the fdtable was embedded, pass files for freeing + * at the end of the RCU grace period. Otherwise, + * you can free files immediately. */ - if (files->fd != &files->fd_array[0]) - free_fd_array(files->fd, files->max_fds); - if (files->max_fdset > __FD_SETSIZE) { - free_fdset(files->open_fds, files->max_fdset); - free_fdset(files->close_on_exec, files->max_fdset); - } - kmem_cache_free(files_cachep, files); + fdt = files_fdtable(files); + if (fdt == &files->fdtab) + fdt->free_files = files; + else + kmem_cache_free(files_cachep, files); + free_fdtable(fdt); } } @@ -535,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) if (p->pdeath_signal) /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, (void *) 0, p); + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); /* Move the child from its dying parent to the new one. 
*/ if (unlikely(traced)) { @@ -579,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { - __kill_pg_info(SIGHUP, (void *)1, pgrp); - __kill_pg_info(SIGCONT, (void *)1, pgrp); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } } @@ -715,8 +727,8 @@ static void exit_notify(struct task_struct *tsk) (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { - __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); - __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); + __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); + __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); } /* Let father know we died @@ -771,10 +783,6 @@ static void exit_notify(struct task_struct *tsk) /* If the process is dead, release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. */ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -827,10 +835,16 @@ fastcall NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - update_mem_hiwater(tsk); + if (tsk->mm) { + update_hiwater_rss(tsk->mm); + update_hiwater_vm(tsk->mm); + } group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) + if (group_dead) { + del_timer_sync(&tsk->signal->real_timer); + exit_itimers(tsk->signal); acct_process(code); + } exit_mm(tsk); exit_sem(tsk); @@ -855,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif - BUG_ON(!(current->flags & PF_DEAD)); + /* PF_DEAD causes final put_task_struct after we schedule. 
*/ + preempt_disable(); + BUG_ON(tsk->flags & PF_DEAD); + tsk->flags |= PF_DEAD; + schedule(); BUG(); /* Avoid "noreturn function does return". */ @@ -1189,7 +1207,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, exit_code = p->exit_code; if (unlikely(!exit_code) || - unlikely(p->state > TASK_STOPPED)) + unlikely(p->state & TASK_TRACED)) goto bail_ref; return wait_noreap_copyout(p, pid, uid, why, (exit_code << 8) | 0x7f, @@ -1365,6 +1383,15 @@ repeat: switch (p->state) { case TASK_TRACED: + /* + * When we hit the race with PTRACE_ATTACH, + * we will not report this child. But the + * race means it has not yet been moved to + * our ptrace_children list, so we need to + * set the flag here to avoid a spurious ECHILD + * when the race happens with the only child. + */ + flag = 1; if (!my_ptrace_child(p)) continue; /*FALLTHROUGH*/ diff --git a/kernel/fork.c b/kernel/fork.c index b65187f0c74e..8a069612eac3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -35,6 +35,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/futex.h> +#include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> @@ -176,41 +177,42 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); return tsk; } #ifdef CONFIG_MMU -static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) +static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct * mpnt, *tmp, **pprev; + struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(current->mm); + flush_cache_mm(oldmm); + down_write(&mm->mmap_sem); + mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = 
oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; - set_mm_counter(mm, rss, 0); - set_mm_counter(mm, anon_rss, 0); cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; - for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; - __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, + vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } @@ -251,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) } /* - * Link in the new vma and copy the page table entries: - * link in first so that swapoff can see swap entries. - * Note that, exceptionally, here the vma is inserted - * without holding mm->mmap_sem. + * Link in the new vma and copy the page table entries. */ - spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; @@ -265,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, current->mm, tmp); - spin_unlock(&mm->page_table_lock); + retval = copy_page_range(mm, oldmm, tmp); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -275,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) goto out; } retval = 0; - out: - flush_tlb_mm(current->mm); + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_policy: @@ -321,6 +318,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; + set_mm_counter(mm, file_rss, 0); + set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; @@ -497,7 +496,7 @@ static int 
copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (retval) goto free_pt; - mm->hiwater_rss = get_mm_counter(mm,rss); + mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; good_mm: @@ -564,24 +563,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) return 0; } -static int count_open_files(struct files_struct *files, int size) +static int count_open_files(struct fdtable *fdt) { + int size = fdt->max_fdset; int i; /* Find the last open fd */ for (i = size/(8*sizeof(long)); i > 0; ) { - if (files->open_fds->fds_bits[--i]) + if (fdt->open_fds->fds_bits[--i]) break; } i = (i+1) * 8 * sizeof(long); return i; } +static struct files_struct *alloc_files(void) +{ + struct files_struct *newf; + struct fdtable *fdt; + + newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + fdt = &newf->fdtab; + fdt->next_fd = 0; + fdt->max_fds = NR_OPEN_DEFAULT; + fdt->max_fdset = __FD_SETSIZE; + fdt->close_on_exec = &newf->close_on_exec_init; + fdt->open_fds = &newf->open_fds_init; + fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&fdt->rcu); + fdt->free_files = NULL; + fdt->next = NULL; + rcu_assign_pointer(newf->fdt, fdt); +out: + return newf; +} + static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; struct file **old_fds, **new_fds; int open_files, size, i, error = 0, expand; + struct fdtable *old_fdt, *new_fdt; /* * A background process may not have any files ... 
@@ -602,35 +630,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) */ tsk->files = NULL; error = -ENOMEM; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); - if (!newf) + newf = alloc_files(); + if (!newf) goto out; - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - newf->max_fds = NR_OPEN_DEFAULT; - newf->max_fdset = __FD_SETSIZE; - newf->close_on_exec = &newf->close_on_exec_init; - newf->open_fds = &newf->open_fds_init; - newf->fd = &newf->fd_array[0]; - spin_lock(&oldf->file_lock); - - open_files = count_open_files(oldf, oldf->max_fdset); + old_fdt = files_fdtable(oldf); + new_fdt = files_fdtable(newf); + size = old_fdt->max_fdset; + open_files = count_open_files(old_fdt); expand = 0; /* * Check whether we need to allocate a larger fd array or fd set. * Note: we're not a clone task, so the open count won't change. */ - if (open_files > newf->max_fdset) { - newf->max_fdset = 0; + if (open_files > new_fdt->max_fdset) { + new_fdt->max_fdset = 0; expand = 1; } - if (open_files > newf->max_fds) { - newf->max_fds = 0; + if (open_files > new_fdt->max_fds) { + new_fdt->max_fds = 0; expand = 1; } @@ -642,14 +662,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) spin_unlock(&newf->file_lock); if (error < 0) goto out_release; + new_fdt = files_fdtable(newf); + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. 
+ */ spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); } - old_fds = oldf->fd; - new_fds = newf->fd; + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; - memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); - memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -662,24 +689,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * is partway through open(). So make sure that this * fd is available to the new process. */ - FD_CLR(open_files - i, newf->open_fds); + FD_CLR(open_files - i, new_fdt->open_fds); } - *new_fds++ = f; + rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* compute the remainder to be cleared */ - size = (newf->max_fds - open_files) * sizeof(struct file *); + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (newf->max_fdset > open_files) { - int left = (newf->max_fdset-open_files)/8; + if (new_fdt->max_fdset > open_files) { + int left = (new_fdt->max_fdset-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); - memset(&newf->open_fds->fds_bits[start], 0, left); - memset(&newf->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } tsk->files = newf; @@ -688,9 +715,9 @@ out: return error; out_release: - free_fdset (newf->close_on_exec, newf->max_fdset); - free_fdset (newf->open_fds, newf->max_fdset); - free_fd_array(newf->fd, newf->max_fds); + free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); + free_fdset (new_fdt->open_fds, new_fdt->max_fdset); + 
free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); goto out; } @@ -818,7 +845,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); new_flags |= PF_FORKNOEXEC; if (!(clone_flags & CLONE_PTRACE)) p->ptrace = 0; @@ -994,6 +1021,9 @@ static task_t *copy_process(unsigned long clone_flags, * of CLONE_PTRACE. */ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#endif /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ @@ -1029,7 +1059,8 @@ static task_t *copy_process(unsigned long clone_flags, * parent's CPU). This avoids alot of nasty races. */ p->cpus_allowed = current->cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) + if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || + !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); /* @@ -1112,6 +1143,9 @@ static task_t *copy_process(unsigned long clone_flags, __get_cpu_var(process_counts)++; } + if (!current->signal->tty && p->signal->tty) + p->signal->tty = NULL; + nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); diff --git a/kernel/futex.c b/kernel/futex.c index c7130f86106c..3b4d5ad44cc6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -40,6 +40,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <asm/futex.h> #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) @@ -204,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * Do a quick atomic lookup first - this is the fastpath. 
*/ - spin_lock(¤t->mm->page_table_lock); - page = follow_page(mm, uaddr, 0); + page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET); if (likely(page != NULL)) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - spin_unlock(¤t->mm->page_table_lock); + put_page(page); return 0; } - spin_unlock(¤t->mm->page_table_lock); /* * Do it the general way. @@ -327,6 +326,118 @@ out: } /* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) +{ + union futex_key key1, key2; + struct futex_hash_bucket *bh1, *bh2; + struct list_head *head; + struct futex_q *this, *next; + int ret, op_ret, attempt = 0; + +retryfull: + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr1, &key1); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, &key2); + if (unlikely(ret != 0)) + goto out; + + bh1 = hash_futex(&key1); + bh2 = hash_futex(&key2); + +retry: + if (bh1 < bh2) + spin_lock(&bh1->lock); + spin_lock(&bh2->lock); + if (bh1 > bh2) + spin_lock(&bh1->lock); + + op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); + if (unlikely(op_ret < 0)) { + int dummy; + + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); + + /* futex_atomic_op_inuser needs to both read and write + * *(int __user *)uaddr2, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. 
*/ + if (attempt++) { + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + ret = -EFAULT; + if (attempt >= 2 || + !(vma = find_vma(mm, uaddr2)) || + vma->vm_start > uaddr2 || + !(vma->vm_flags & VM_WRITE)) + goto out; + + switch (handle_mm_fault(mm, vma, uaddr2, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + goto out; + } + goto retry; + } + + /* If we would have faulted, release mmap_sem, + * fault it in and start all over again. */ + up_read(¤t->mm->mmap_sem); + + ret = get_user(dummy, (int __user *)uaddr2); + if (ret) + return ret; + + goto retryfull; + } + + head = &bh1->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key1)) { + wake_futex(this); + if (++ret >= nr_wake) + break; + } + } + + if (op_ret > 0) { + head = &bh2->chain; + + op_ret = 0; + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key2)) { + wake_futex(this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } + + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); +out: + up_read(¤t->mm->mmap_sem); + return ret; +} + +/* * Requeue all waiters hashed on one physical page to another * physical page. 
*/ @@ -673,23 +784,17 @@ static int futex_fd(unsigned long uaddr, int signal) filp->f_mapping = filp->f_dentry->d_inode->i_mapping; if (signal) { - int err; err = f_setown(filp, current->pid, 1); if (err < 0) { - put_unused_fd(ret); - put_filp(filp); - ret = err; - goto out; + goto error; } filp->f_owner.signum = signal; } q = kmalloc(sizeof(*q), GFP_KERNEL); if (!q) { - put_unused_fd(ret); - put_filp(filp); - ret = -ENOMEM; - goto out; + err = -ENOMEM; + goto error; } down_read(¤t->mm->mmap_sem); @@ -697,10 +802,8 @@ static int futex_fd(unsigned long uaddr, int signal) if (unlikely(err != 0)) { up_read(¤t->mm->mmap_sem); - put_unused_fd(ret); - put_filp(filp); kfree(q); - return err; + goto error; } /* @@ -716,6 +819,11 @@ static int futex_fd(unsigned long uaddr, int signal) fd_install(ret, filp); out: return ret; +error: + put_unused_fd(ret); + put_filp(filp); + ret = err; + goto out; } long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, @@ -740,6 +848,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, case FUTEX_CMP_REQUEUE: ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); break; + case FUTEX_WAKE_OP: + ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); + break; default: ret = -ENOSYS; } diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 388977f3e9b7..0cbe633420fb 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c @@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void struct list_head *tmp; struct inter_module_entry *ime, *ime_new; - if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { + if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { /* Overloaded kernel, not fatal */ printk(KERN_ERR "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", @@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void kmalloc_failed = 1; return; } - memset(ime_new, 0, sizeof(*ime_new)); ime_new->im_name = 
im_name; ime_new->owner = owner; ime_new->userdata = userdata; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c29f83c16497..51df337b37db 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -111,20 +111,22 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) unsigned int status; kstat_this_cpu.irqs[irq]++; - if (desc->status & IRQ_PER_CPU) { + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; /* * No locking required for CPU-local interrupts: */ - desc->handler->ack(irq); + if (desc->handler->ack) + desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); desc->handler->end(irq); return 1; } spin_lock(&desc->lock); - desc->handler->ack(irq); + if (desc->handler->ack) + desc->handler->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac6700985705..1cfdb08ddf20 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -18,6 +18,10 @@ cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; +#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) +cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 85d08daa6600..f26e534c6585 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; */ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; -void __attribute__((weak)) -proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +#ifdef CONFIG_GENERIC_PENDING_IRQ +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + /* + * Save these away for later use. 
Re-progam when the + * interrupt is pending + */ + set_pending_irq(irq, mask_val); +} +#else +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } +#endif static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b57..39277dd6bf90 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -18,6 +18,7 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/proc_fs.h> +#include <linux/sched.h> /* for cond_resched */ #include <linux/mm.h> #include <asm/sections.h> diff --git a/kernel/kexec.c b/kernel/kexec.c index cdd4dcd8fb63..2c95848fbce8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p) static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, @@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image, return 0; } -static struct page *kimage_alloc_pages(unsigned int gfp_mask, - unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; @@ -335,7 +334,7 @@ static struct page *kimage_alloc_pages(unsigned int gfp_mask, if (pages) { unsigned int count, i; pages->mapping = NULL; - pages->private = order; + set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); @@ -348,7 +347,7 @@ static void kimage_free_pages(struct page *page) { unsigned int order, count, i; - order = page->private; + order = page_private(page); count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); @@ -654,7 +653,7 @@ static kimage_entry_t 
*kimage_dst_used(struct kimage *image, } static struct page *kimage_alloc_page(struct kimage *image, - unsigned int gfp_mask, + gfp_t gfp_mask, unsigned long destination) { /* diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 179baafcdd96..64ab045c3d9d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -36,7 +36,7 @@ * struct kfifo with kfree(). */ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, - unsigned int __nocast gfp_mask, spinlock_t *lock) + gfp_t gfp_mask, spinlock_t *lock) { struct kfifo *fifo; @@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init); * * The size will be rounded-up to a power of 2. */ -struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) +struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) { unsigned char *buffer; struct kfifo *ret; diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8af..51a892063aaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -131,14 +131,14 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - struct key *old_session; + struct key *new_session, *old_session; int retval; /* Unblock all signals and set the session keyring. 
*/ - key_get(sub_info->ring); + new_session = key_get(sub_info->ring); flush_signals(current); spin_lock_irq(¤t->sighand->siglock); - old_session = __install_session_keyring(current, sub_info->ring); + old_session = __install_session_keyring(current, new_session); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b0237122b24e..ce4915dd683a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,8 +35,10 @@ #include <linux/spinlock.h> #include <linux/hash.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> #include <asm/kdebug.h> @@ -72,7 +74,7 @@ static struct hlist_head kprobe_insn_pages; * get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t *get_insn_slot(void) +kprobe_opcode_t __kprobes *get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -117,7 +119,7 @@ kprobe_opcode_t *get_insn_slot(void) return kip->insns; } -void free_insn_slot(kprobe_opcode_t *slot) +void __kprobes free_insn_slot(kprobe_opcode_t *slot) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -152,20 +154,42 @@ void free_insn_slot(kprobe_opcode_t *slot) } /* Locks kprobe: irqs must be disabled */ -void lock_kprobes(void) +void __kprobes lock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we take the kprobe_lock + * and before we get a chance to update kprobe_cpu, this to prevent + * deadlock when we have a kprobe on ISR routine and a kprobe on task + * routine + */ + local_irq_save(flags); + spin_lock(&kprobe_lock); kprobe_cpu = smp_processor_id(); + + local_irq_restore(flags); } -void unlock_kprobes(void) +void __kprobes unlock_kprobes(void) { + unsigned long flags = 0; + 
+ /* Avoiding local interrupts to happen right after we update + * kprobe_cpu and before we get a a chance to release kprobe_lock, + * this to prevent deadlock when we have a kprobe on ISR routine and + * a kprobe on task routine + */ + local_irq_save(flags); + kprobe_cpu = NR_CPUS; spin_unlock(&kprobe_lock); + + local_irq_restore(flags); } /* You have to be holding the kprobe_lock */ -struct kprobe *get_kprobe(void *addr) +struct kprobe __kprobes *get_kprobe(void *addr) { struct hlist_head *head; struct hlist_node *node; @@ -183,7 +207,7 @@ struct kprobe *get_kprobe(void *addr) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -198,8 +222,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -213,8 +237,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, return; } -static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) +static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { /* * if we faulted "during" the execution of a user specified @@ -227,7 +251,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, return 0; } -static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp = curr_kprobe; if (curr_kprobe && kp->break_handler) { @@ -240,7 +264,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -struct 
kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) +struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) { struct hlist_node *node; struct kretprobe_instance *ri; @@ -249,7 +273,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) return NULL; } -static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) +static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe + *rp) { struct hlist_node *node; struct kretprobe_instance *ri; @@ -258,7 +283,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) return NULL; } -void add_rp_inst(struct kretprobe_instance *ri) +void __kprobes add_rp_inst(struct kretprobe_instance *ri) { /* * Remove rp inst off the free list - @@ -276,7 +301,7 @@ void add_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->used_instances); } -void recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -291,7 +316,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri) kfree(ri); } -struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) +struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) { return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; } @@ -302,7 +327,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) * instances associated with this task. These left over instances represent * probed functions that have been called but will never return. */ -void kprobe_flush_task(struct task_struct *tk) +void __kprobes kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head; @@ -322,7 +347,8 @@ void kprobe_flush_task(struct task_struct *tk) * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. 
*/ -static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); @@ -353,7 +379,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) * Add the new probe to old_p->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) { struct kprobe *kp; @@ -395,7 +421,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * the intricacies * TODO: Move kcalloc outside the spinlock */ -static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes register_aggr_kprobe(struct kprobe *old_p, + struct kprobe *p) { int ret = 0; struct kprobe *ap; @@ -434,15 +461,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p, spin_unlock_irqrestore(&kprobe_lock, flags); } -int register_kprobe(struct kprobe *p) +static int __kprobes in_kprobes_functions(unsigned long addr) +{ + if (addr >= (unsigned long)__kprobes_text_start + && addr < (unsigned long)__kprobes_text_end) + return -EINVAL; + return 0; +} + +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; unsigned long flags = 0; struct kprobe *old_p; - if ((ret = arch_prepare_kprobe(p)) != 0) { + if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) + return ret; + if ((ret = arch_prepare_kprobe(p)) != 0) goto rm_kprobe; - } + spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); p->nmissed = 0; @@ -466,7 +503,7 @@ rm_kprobe: return ret; } -void unregister_kprobe(struct kprobe *p) +void __kprobes unregister_kprobe(struct kprobe *p) { unsigned long flags; struct kprobe *old_p; @@ -487,7 +524,7 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to 
notified first */ }; -int register_jprobe(struct jprobe *jp) +int __kprobes register_jprobe(struct jprobe *jp) { /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; @@ -496,14 +533,14 @@ int register_jprobe(struct jprobe *jp) return register_kprobe(&jp->kp); } -void unregister_jprobe(struct jprobe *jp) +void __kprobes unregister_jprobe(struct jprobe *jp) { unregister_kprobe(&jp->kp); } #ifdef ARCH_SUPPORTS_KRETPROBES -int register_kretprobe(struct kretprobe *rp) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -540,14 +577,14 @@ int register_kretprobe(struct kretprobe *rp) #else /* ARCH_SUPPORTS_KRETPROBES */ -int register_kretprobe(struct kretprobe *rp) +int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } #endif /* ARCH_SUPPORTS_KRETPROBES */ -void unregister_kretprobe(struct kretprobe *rp) +void __kprobes unregister_kretprobe(struct kretprobe *rp) { unsigned long flags; struct kretprobe_instance *ri; diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92da..e75950a1092c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind); int kthread_stop(struct task_struct *k) { + return kthread_stop_sem(k, NULL); +} +EXPORT_SYMBOL(kthread_stop); + +int kthread_stop_sem(struct task_struct *k, struct semaphore *s) +{ int ret; down(&kthread_stop_lock); @@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - wake_up_process(k); + if (s) + up(s); + else + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. 
*/ @@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) return ret; } -EXPORT_SYMBOL(kthread_stop); +EXPORT_SYMBOL(kthread_stop_sem); static __init int helper_init(void) { diff --git a/kernel/module.c b/kernel/module.c index c32995fbd8fd..ff5c500ab625 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/elf.h> @@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); return ret; } #else @@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs, if (!(tainted & TAINT_FORCED_MODULE)) { printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); } return 1; } @@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const char *license) if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", mod->name, license); - tainted |= TAINT_PROPRIETARY_MODULE; + add_taint(TAINT_PROPRIETARY_MODULE); } } @@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod, long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ struct exception_table_entry *extable; + mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); @@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. 
*/ if (!modmagic) { - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod, (mod->num_gpl_syms && !gplcrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); } #endif @@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod, if (err < 0) goto cleanup; + /* flush the icache in correct context */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Flush the instruction cache, since we've played with text. + * Do it before processing of module parameters, so the module + * can provide parameter accessor functions of its own. + */ + if (mod->module_init) + flush_icache_range((unsigned long)mod->module_init, + (unsigned long)mod->module_init + + mod->init_size); + flush_icache_range((unsigned long)mod->module_core, + (unsigned long)mod->module_core + mod->core_size); + + set_fs(old_fs); + mod->args = args; if (obsparmindex) { err = obsolete_params(mod->name, mod->args, @@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; - mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } - /* flush the icache in correct context */ - set_fs(KERNEL_DS); - - /* Flush the instruction cache, since we've played with text */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); - - set_fs(old_fs); - /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. 
*/ stop_machine_run(__link_module, mod, NR_CPUS); diff --git a/kernel/params.c b/kernel/params.c index d586c35ef8fc..47ba69547945 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/err.h> +#include <linux/slab.h> #if 0 #define DEBUGP printk @@ -80,8 +81,6 @@ static char *next_arg(char *args, char **param, char **val) int in_quote = 0, quoted = 0; char *next; - /* Chew any extra spaces */ - while (*args == ' ') args++; if (*args == '"') { args++; in_quote = 1; @@ -121,6 +120,10 @@ static char *next_arg(char *args, char **param, char **val) next = args + i + 1; } else next = args + i; + + /* Chew up trailing spaces. */ + while (*next == ' ') + next++; return next; } @@ -135,6 +138,10 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); + /* Chew leading spaces */ + while (*args == ' ') + args++; + while (*args) { int ret; @@ -542,8 +549,8 @@ static void __init kernel_param_sysfs_setup(const char *name, { struct module_kobject *mk; - mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); - memset(mk, 0, sizeof(struct module_kobject)); + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + BUG_ON(!mk); mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ad85d3f0dcc4..91a894264941 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, * Update expiry time from increment, and increase overrun count, * given the current clock sample. 
*/ -static inline void bump_cpu_timer(struct k_itimer *timer, +static void bump_cpu_timer(struct k_itimer *timer, union cpu_time_count now) { int i; @@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, for (i = 0; incr < delta - incr; i++) incr = incr << 1; for (; i >= 0; incr >>= 1, i--) { - if (delta <= incr) + if (delta < incr) continue; timer->it.cpu.expires.sched += incr; timer->it_overrun += 1 << i; @@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) incr = cputime_add(incr, incr); for (; i >= 0; incr = cputime_halve(incr), i--) { - if (cputime_le(delta, incr)) + if (cputime_lt(delta, incr)) continue; timer->it.cpu.expires.cpu = cputime_add(timer->it.cpu.expires.cpu, incr); @@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; + int ret = 0; - if (timer->it.cpu.firing) - return TIMER_RETRY; - - if (unlikely(p == NULL)) - return 0; - - if (!list_empty(&timer->it.cpu.entry)) { + if (likely(p != NULL)) { read_lock(&tasklist_lock); if (unlikely(p->signal == NULL)) { /* @@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer) */ BUG_ON(!list_empty(&timer->it.cpu.entry)); } else { - /* - * Take us off the task's timer list. 
- */ spin_lock(&p->sighand->siglock); - list_del(&timer->it.cpu.entry); + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); spin_unlock(&p->sighand->siglock); } read_unlock(&tasklist_lock); + + if (!ret) + put_task_struct(p); } - put_task_struct(p); - return 0; + return ret; } /* @@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head, cputime_t ptime = cputime_add(utime, stime); list_for_each_entry_safe(timer, next, head, entry) { - timer->task = NULL; list_del_init(&timer->entry); if (cputime_lt(timer->expires.cpu, ptime)) { timer->expires.cpu = cputime_zero; @@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head, ++head; list_for_each_entry_safe(timer, next, head, entry) { - timer->task = NULL; list_del_init(&timer->entry); if (cputime_lt(timer->expires.cpu, utime)) { timer->expires.cpu = cputime_zero; @@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head, ++head; list_for_each_entry_safe(timer, next, head, entry) { - timer->task = NULL; list_del_init(&timer->entry); if (timer->expires.sched < sched_time) { timer->expires.sched = 0; @@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p, struct task_struct *t = p; unsigned int nthreads = atomic_read(&p->signal->live); + if (!nthreads) + return; + switch (clock_idx) { default: BUG(); @@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p, left = cputime_div(cputime_sub(expires.cpu, val.cpu), nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ticks = cputime_add(prof_ticks(t), left); if (cputime_eq(t->it_prof_expires, cputime_zero) || @@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p, left = cputime_div(cputime_sub(expires.cpu, val.cpu), nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ticks = cputime_add(virt_ticks(t), left); if (cputime_eq(t->it_virt_expires, cputime_zero) || 
@@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p, nsleft = expires.sched - val.sched; do_div(nsleft, nthreads); do { - if (!unlikely(t->exit_state)) { + if (!unlikely(t->flags & PF_EXITING)) { ns = t->sched_time + nsleft; if (t->it_sched_expires == 0 || t->it_sched_expires > ns) { @@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) struct cpu_timer_list *next; unsigned long i; + if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) + return; + head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); @@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { - if (next->expires.sched > nt->expires.sched) { - listpos = &next->entry; + if (next->expires.sched > nt->expires.sched) break; - } + listpos = &next->entry; } } else { list_for_each_entry(next, head, entry) { - if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { - listpos = &next->entry; + if (cputime_gt(next->expires.cpu, nt->expires.cpu)) break; - } + listpos = &next->entry; } } list_add(&nt->entry, listpos); @@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * Disarm any old timer after extracting its expiry time. */ BUG_ON(!irqs_disabled()); + + ret = 0; spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; - list_del_init(&timer->it.cpu.entry); + if (unlikely(timer->it.cpu.firing)) { + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + } else + list_del_init(&timer->it.cpu.entry); spin_unlock(&p->sighand->siglock); /* @@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, } } - if (unlikely(timer->it.cpu.firing)) { + if (unlikely(ret)) { /* * We are colliding with the timer actually firing. 
* Punt after filling in the timer's old value, and @@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * it as an overrun (thanks to bump_cpu_timer above). */ read_unlock(&tasklist_lock); - timer->it.cpu.firing = -1; - ret = TIMER_RETRY; goto out; } @@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { + int maxfire; struct list_head *timers = tsk->cpu_timers; + maxfire = 20; tsk->it_prof_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { tsk->it_prof_expires = t->expires.cpu; break; } @@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; tsk->it_virt_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { tsk->it_virt_expires = t->expires.cpu; break; } @@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; tsk->it_sched_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (tsk->sched_time < t->expires.sched) { + if (!--maxfire || tsk->sched_time < t->expires.sched) { tsk->it_sched_expires = t->expires.sched; break; } @@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { + int maxfire; struct signal_struct *const sig = tsk->signal; cputime_t utime, stime, ptime, virt_expires, prof_expires; unsigned long long sched_time, 
sched_expires; @@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk, } while (t != tsk); ptime = cputime_add(utime, stime); + maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(ptime, t->expires.cpu)) { + if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { prof_expires = t->expires.cpu; break; } @@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (cputime_lt(utime, t->expires.cpu)) { + if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { virt_expires = t->expires.cpu; break; } @@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk, } ++timers; + maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, entry); - if (sched_time < t->expires.sched) { + if (!--maxfire || sched_time < t->expires.sched) { sched_expires = t->expires.sched; break; } @@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk, unsigned long long sched_left, sched; const unsigned int nthreads = atomic_read(&sig->live); + if (!nthreads) + return; + prof_left = cputime_sub(prof_expires, utime); prof_left = cputime_sub(prof_left, stime); prof_left = cputime_div(prof_left, nthreads); @@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk, do { t = next_thread(t); - } while (unlikely(t->exit_state)); + } while (unlikely(t->flags & PF_EXITING)); } while (t != tsk); } } @@ -1212,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * The task was cleaned up already, no future firings. */ - return; + goto out; /* * Fetch the current sample and update the timer's expiry time. 
@@ -1222,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) { clear_dead_task(timer, now); - return; + goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ } else { @@ -1235,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) put_task_struct(p); timer->it.cpu.task = p = NULL; timer->it.cpu.expires.sched = 0; - read_unlock(&tasklist_lock); - return; + goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* * We've noticed that the thread is dead, but @@ -1244,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * drop our task ref. */ clear_dead_task(timer, now); - read_unlock(&tasklist_lock); - return; + goto out_unlock; } cpu_clock_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); @@ -1257,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ arm_timer(timer, now); +out_unlock: read_unlock(&tasklist_lock); + +out: + timer->it_overrun_last = timer->it_overrun; + timer->it_overrun = -1; + ++timer->it_requeue_pending; } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 10b2ad749d14..ea55c7a1cd75 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private) timr->sigq->info.si_code = SI_TIMER; timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; + if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - if (unlikely(timr->it_process->flags & PF_EXITING)) { - timr->it_sigev_notify = SIGEV_SIGNAL; - put_task_struct(timr->it_process); - timr->it_process = timr->it_process->group_leader; - goto group; - } - return send_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); - } - else { - group: - return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); + struct task_struct *leader; + int ret = send_sigqueue(timr->it_sigev_signo, 
timr->sigq, + timr->it_process); + + if (likely(ret >= 0)) + return ret; + + timr->it_sigev_notify = SIGEV_SIGNAL; + leader = timr->it_process->group_leader; + put_task_struct(timr->it_process); + timr->it_process = leader; } + + return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); } EXPORT_SYMBOL_GPL(posix_timer_event); @@ -1155,7 +1157,7 @@ retry_delete: } /* - * This is called by __exit_signal, only when there are no more + * This is called by do_exit or de_thread, only when there are no more * references to the shared signal_struct. */ void exit_itimers(struct signal_struct *sig) @@ -1166,7 +1168,6 @@ void exit_itimers(struct signal_struct *sig) tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); itimer_delete(tmr); } - del_timer_sync(&sig->real_timer); } /* @@ -1294,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) return error; } -static void nanosleep_wake_up(unsigned long __data) -{ - struct task_struct *p = (struct task_struct *) __data; - - wake_up_process(p); -} - /* * The standard says that an absolute nanosleep call MUST wake up at * the requested time in spite of clock settings. 
Here is what we do: @@ -1441,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock, int flags, struct timespec *tsave) { struct timespec t, dum; - struct timer_list new_timer; DECLARE_WAITQUEUE(abs_wqueue, current); u64 rq_time = (u64)0; s64 left; @@ -1450,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock, &current_thread_info()->restart_block; abs_wqueue.flags = 0; - init_timer(&new_timer); - new_timer.expires = 0; - new_timer.data = (unsigned long) current; - new_timer.function = nanosleep_wake_up; abs = flags & TIMER_ABSTIME; if (restart_block->fn == clock_nanosleep_restart) { @@ -1489,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock, if (left < (s64)0) break; - new_timer.expires = jiffies + left; - __set_current_state(TASK_INTERRUPTIBLE); - add_timer(&new_timer); - - schedule(); + schedule_timeout_interruptible(left); - del_timer_sync(&new_timer); left = rq_time - get_jiffies_64(); } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2c7121d9bff1..46a5e5acff97 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,5 +1,6 @@ config PM bool "Power Management support" + depends on !IA64_HP_SIM ---help--- "Power Management" means that parts of your computer are shut off or put into a power conserving "sleep" mode if they are not @@ -28,7 +29,7 @@ config PM_DEBUG config SOFTWARE_SUSPEND bool "Software Suspend" - depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) + depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP) ---help--- Enable the possibility of suspending the machine. It doesn't need APM. @@ -72,6 +73,18 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. 
+config SWSUSP_ENCRYPT + bool "Encrypt suspend image" + depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) + default "" + ---help--- + To prevent data gathering from swap after resume you can encrypt + the suspend image with a temporary key that is deleted on + resume. + + Note that the temporary key is stored unencrypted on disk while the + system is suspended. + config SUSPEND_SMP bool depends on HOTPLUG_CPU && X86 && PM diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa13..c71eb4579c07 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,7 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o process.o console.o pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o obj-$(CONFIG_SUSPEND_SMP) += smp.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 664eb0469b6e..027322a564f4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -17,12 +17,12 @@ #include <linux/delay.h> #include <linux/fs.h> #include <linux/mount.h> +#include <linux/pm.h> #include "power.h" extern suspend_disk_method_t pm_disk_mode; -extern struct pm_ops * pm_ops; extern int swsusp_suspend(void); extern int swsusp_write(void); @@ -30,7 +30,6 @@ extern int swsusp_check(void); extern int swsusp_read(void); extern void swsusp_close(void); extern int swsusp_resume(void); -extern int swsusp_free(void); static int noresume = 0; @@ -49,13 +48,11 @@ dev_t swsusp_resume_device; static void power_down(suspend_disk_method_t mode) { - unsigned long flags; int error = 0; - local_irq_save(flags); switch(mode) { case PM_DISK_PLATFORM: - device_shutdown(); + kernel_power_off_prepare(); error = pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: @@ -95,10 +92,7 @@ static void free_some_memory(void) printk("Freeing memory... 
"); while ((tmp = shrink_all_memory(10000))) { pages += tmp; - printk("\b%c", p[i]); - i++; - if (i > 3) - i = 0; + printk("\b%c", p[i++ % 4]); } printk("\bdone (%li pages freed)\n", pages); } @@ -112,24 +106,12 @@ static inline void platform_finish(void) } } -static void finish(void) -{ - device_resume(); - platform_finish(); - thaw_processes(); - enable_nonboot_cpus(); - pm_restore_console(); -} - - static int prepare_processes(void) { int error; pm_prepare_console(); - sys_sync(); - disable_nonboot_cpus(); if (freeze_processes()) { @@ -162,15 +144,6 @@ static void unprepare_processes(void) pm_restore_console(); } -static int prepare_devices(void) -{ - int error; - - if ((error = device_suspend(PMSG_FREEZE))) - printk("Some devices failed to suspend\n"); - return error; -} - /** * pm_suspend_disk - The granpappy of power management. * @@ -187,32 +160,37 @@ int pm_suspend_disk(void) error = prepare_processes(); if (error) return error; - error = prepare_devices(); + error = device_suspend(PMSG_FREEZE); if (error) { + printk("Some devices failed to suspend\n"); unprepare_processes(); return error; } - pr_debug("PM: Attempting to suspend to disk.\n"); - if (pm_disk_mode == PM_DISK_FIRMWARE) - return pm_ops->enter(PM_SUSPEND_DISK); - pr_debug("PM: snapshotting memory.\n"); in_suspend = 1; if ((error = swsusp_suspend())) goto Done; if (in_suspend) { + device_resume(); pr_debug("PM: writing image.\n"); error = swsusp_write(); if (!error) power_down(pm_disk_mode); + else { + swsusp_free(); + unprepare_processes(); + return error; + } } else pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); Done: - finish(); + device_resume(); + unprepare_processes(); return error; } @@ -233,9 +211,12 @@ static int software_resume(void) { int error; + down(&pm_sem); if (!swsusp_resume_device) { - if (!strlen(resume_file)) + if (!strlen(resume_file)) { + up(&pm_sem); return -ENOENT; + } swsusp_resume_device = name_to_dev_t(resume_file); pr_debug("swsusp: Resume From 
Partition %s\n", resume_file); } else { @@ -248,6 +229,7 @@ static int software_resume(void) * FIXME: If noresume is specified, we need to find the partition * and reset it back to normal swap space. */ + up(&pm_sem); return 0; } @@ -265,25 +247,30 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) - goto Cleanup; + if ((error = swsusp_read())) { + swsusp_free(); + goto Thaw; + } pr_debug("PM: Preparing devices for restore.\n"); - if ((error = prepare_devices())) - goto Free; + if ((error = device_suspend(PMSG_FREEZE))) { + printk("Some devices failed to suspend\n"); + swsusp_free(); + goto Thaw; + } mb(); pr_debug("PM: Restoring saved image.\n"); swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); - finish(); - Free: - swsusp_free(); - Cleanup: + device_resume(); + Thaw: unprepare_processes(); Done: + /* For success case, the suspend path will release the lock */ + up(&pm_sem); pr_debug("PM: Resume from disk failed.\n"); return 0; } @@ -390,7 +377,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t if (sscanf(buf, "%u:%u", &maj, &min) == 2) { res = MKDEV(maj,min); if (maj == MAJOR(res) && min == MINOR(res)) { + down(&pm_sem); swsusp_resume_device = res; + up(&pm_sem); printk("Attempting manual resume\n"); noresume = 0; software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 71aa0fd22007..18d7d693fbba 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state) -static char * pm_states[] = { +static char *pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", +#ifdef CONFIG_SOFTWARE_SUSPEND [PM_SUSPEND_DISK] = "disk", - NULL, +#endif }; @@ -166,6 +167,8 @@ static int enter_state(suspend_state_t state) { int error; + if (pm_ops->valid && !pm_ops->valid(state)) + return -ENODEV; if (down_trylock(&pm_sem)) return -EBUSY; @@ -235,7 +238,8 @@ 
static ssize_t state_show(struct subsystem * subsys, char * buf) char * s = buf; for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i]) + if (pm_states[i] && pm_ops && (!pm_ops->valid + ||(pm_ops->valid && pm_ops->valid(i)))) s += sprintf(s,"%s ",pm_states[i]); } s += sprintf(s,"\n"); diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 61deda04e39e..159149321b3c 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type, unsigned long id, pm_callback callback) { - struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); + struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); if (dev) { - memset(dev, 0, sizeof(*dev)); dev->type = type; dev->id = id; dev->callback = callback; diff --git a/kernel/power/power.h b/kernel/power/power.h index cd6a3493cc0d..d4fd96a135ab 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -1,7 +1,7 @@ #include <linux/suspend.h> #include <linux/utsname.h> -/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but +/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but we probably do not take enough locks for switching consoles, etc, so bad things might happen. 
*/ @@ -9,6 +9,9 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) #endif +#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ + - 4 - 3*sizeof(unsigned long) - sizeof(int) \ + - sizeof(void *)) / sizeof(swp_entry_t)) struct swsusp_info { struct new_utsname uts; @@ -18,7 +21,7 @@ struct swsusp_info { unsigned long image_pages; unsigned long pagedir_pages; suspend_pagedir_t * suspend_pagedir; - swp_entry_t pagedir[768]; + swp_entry_t pagedir[MAX_PBES]; } __attribute__((aligned(PAGE_SIZE))); @@ -50,3 +53,20 @@ extern void thaw_processes(void); extern int pm_prepare_console(void); extern void pm_restore_console(void); + + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +extern unsigned int nr_copy_pages; +extern suspend_pagedir_t *pagedir_nosave; +extern suspend_pagedir_t *pagedir_save; + +extern asmlinkage int swsusp_arch_suspend(void); +extern asmlinkage int swsusp_arch_resume(void); + +extern int restore_highmem(void); +extern struct pbe * alloc_pagedir(unsigned nr_pages); +extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); +extern void swsusp_free(void); +extern int enough_swap(unsigned nr_pages); diff --git a/kernel/power/process.c b/kernel/power/process.c index 3bd0d261818f..28de118f7a0b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -38,7 +38,6 @@ void refrigerator(void) processes around? 
*/ long save; save = current->state; - current->state = TASK_UNINTERRUPTIBLE; pr_debug("%s entered refrigerator\n", current->comm); printk("="); @@ -47,8 +46,10 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(&current->sighand->siglock); - while (frozen(current)) + while (frozen(current)) { + current->state = TASK_UNINTERRUPTIBLE; schedule(); + } pr_debug("%s left refrigerator\n", current->comm); current->state = save; } @@ -80,13 +81,33 @@ int freeze_processes(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ - if (time_after(jiffies, start_time + TIMEOUT)) { + if (todo && time_after(jiffies, start_time + TIMEOUT)) { printk( "\n" ); printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); - return todo; + break; } } while(todo); + /* This does not unfreeze processes that are already frozen + * (we have slightly ugly calling convention in that respect, + * and caller must call thaw_processes() if something fails), + * but it cleans up leftover PF_FREEZE requests. + */ + if (todo) { + read_lock(&tasklist_lock); + do_each_thread(g, p) + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + p->flags &= ~PF_FREEZE; + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } + while_each_thread(g, p); + read_unlock(&tasklist_lock); + return todo; + } + printk( "|\n" ); BUG_ON(in_atomic()); return 0; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 000000000000..42a628704398 --- /dev/null +++ b/kernel/power/snapshot.c @@ -0,0 +1,435 @@ +/* + * linux/kernel/power/snapshot.c + * + * This file provide system snapshot/restore functionality. + * + * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * + * This file is released under the GPLv2, and is based on swsusp.c. 
+ * + */ + + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/suspend.h> +#include <linux/smp_lock.h> +#include <linux/delay.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/pm.h> +#include <linux/device.h> +#include <linux/bootmem.h> +#include <linux/syscalls.h> +#include <linux/console.h> +#include <linux/highmem.h> + +#include <asm/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +#include "power.h" + +#ifdef CONFIG_HIGHMEM +struct highmem_page { + char *data; + struct page *page; + struct highmem_page *next; +}; + +static struct highmem_page *highmem_copy; + +static int save_highmem_zone(struct zone *zone) +{ + unsigned long zone_pfn; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + struct page *page; + struct highmem_page *save; + void *kaddr; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + + if (!(pfn%1000)) + printk("."); + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* + * This condition results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. This should be + * corrected eventually when the cases giving rise to this + * are better understood. 
+ */ + if (PageReserved(page)) { + printk("highmem reserved page?!\n"); + continue; + } + BUG_ON(PageNosave(page)); + if (PageNosaveFree(page)) + continue; + save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); + if (!save) + return -ENOMEM; + save->next = highmem_copy; + save->page = page; + save->data = (void *) get_zeroed_page(GFP_ATOMIC); + if (!save->data) { + kfree(save); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(save->data, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + highmem_copy = save; + } + return 0; +} + + +static int save_highmem(void) +{ + struct zone *zone; + int res = 0; + + pr_debug("swsusp: Saving Highmem\n"); + for_each_zone (zone) { + if (is_highmem(zone)) + res = save_highmem_zone(zone); + if (res) + return res; + } + return 0; +} + +int restore_highmem(void) +{ + printk("swsusp: Restoring Highmem\n"); + while (highmem_copy) { + struct highmem_page *save = highmem_copy; + void *kaddr; + highmem_copy = save->next; + + kaddr = kmap_atomic(save->page, KM_USER0); + memcpy(kaddr, save->data, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + free_page((long) save->data); + kfree(save); + } + return 0; +} +#else +static int save_highmem(void) { return 0; } +int restore_highmem(void) { return 0; } +#endif /* CONFIG_HIGHMEM */ + + +static int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +/** + * saveable - Determine whether a page should be cloned or not. + * @pfn: The page + * + * We save a page if it's Reserved, and not in the range of pages + * statically defined as 'unsaveable', or if it isn't reserved, and + * isn't part of a free chunk of pages. 
+ */ + +static int saveable(struct zone *zone, unsigned long *zone_pfn) +{ + unsigned long pfn = *zone_pfn + zone->zone_start_pfn; + struct page *page; + + if (!pfn_valid(pfn)) + return 0; + + page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); + if (PageNosave(page)) + return 0; + if (PageReserved(page) && pfn_is_nosave(pfn)) { + pr_debug("[nosave pfn 0x%lx]", pfn); + return 0; + } + if (PageNosaveFree(page)) + return 0; + + return 1; +} + +static unsigned count_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned n; + + n = 0; + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + n += saveable(zone, &zone_pfn); + } + return n; +} + +static void copy_data_pages(struct pbe *pblist) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *pbe, *p; + + pbe = pblist; + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + /* This is necessary for swsusp_free() */ + for_each_pb_page (p, pblist) + SetPageNosaveFree(virt_to_page(p)); + for_each_pbe (p, pblist) + SetPageNosaveFree(virt_to_page(p->address)); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + if (saveable(zone, &zone_pfn)) { + struct page *page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + BUG_ON(!pbe); + pbe->orig_address = (unsigned long)page_address(page); + /* copy_page is not usable for copying task structs. 
*/ + memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + pbe = pbe->next; + } + } + } + BUG_ON(pbe); +} + + +/** + * free_pagedir - free pages allocated with alloc_pagedir() + */ + +static void free_pagedir(struct pbe *pblist) +{ + struct pbe *pbe; + + while (pblist) { + pbe = (pblist + PB_PAGE_SKIP)->next; + ClearPageNosave(virt_to_page(pblist)); + ClearPageNosaveFree(virt_to_page(pblist)); + free_page((unsigned long)pblist); + pblist = pbe; + } +} + +/** + * fill_pb_page - Create a list of PBEs on a given memory page + */ + +static inline void fill_pb_page(struct pbe *pbpage) +{ + struct pbe *p; + + p = pbpage; + pbpage += PB_PAGE_SKIP; + do + p->next = p + 1; + while (++p < pbpage); +} + +/** + * create_pbe_list - Create a list of PBEs on top of a given chain + * of memory pages allocated with alloc_pagedir() + */ + +void create_pbe_list(struct pbe *pblist, unsigned nr_pages) +{ + struct pbe *pbpage, *p; + unsigned num = PBES_PER_PAGE; + + for_each_pb_page (pbpage, pblist) { + if (num >= nr_pages) + break; + + fill_pb_page(pbpage); + num += PBES_PER_PAGE; + } + if (pbpage) { + for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) + p->next = p + 1; + p->next = NULL; + } + pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +static void *alloc_image_page(void) +{ + void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + +/** + * alloc_pagedir - Allocate the page directory. + * + * First, determine exactly how many pages we need and + * allocate them. + * + * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE + * struct pbe elements (pbes) and the last element in the page points + * to the next page. + * + * On each page we set up a list of struct_pbe elements. 
+ */ + +struct pbe *alloc_pagedir(unsigned nr_pages) +{ + unsigned num; + struct pbe *pblist, *pbe; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); + pblist = alloc_image_page(); + /* FIXME: rewrite this ugly loop */ + for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; + pbe = pbe->next, num += PBES_PER_PAGE) { + pbe += PB_PAGE_SKIP; + pbe->next = alloc_image_page(); + } + if (!pbe) { /* get_zeroed_page() failed */ + free_pagedir(pblist); + pblist = NULL; + } + return pblist; +} + +/** + * Free pages we allocated for suspend. Suspend pages are alocated + * before atomic copy, so we need to free them after resume. + */ + +void swsusp_free(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { + struct page * page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + if (PageNosave(page) && PageNosaveFree(page)) { + ClearPageNosave(page); + ClearPageNosaveFree(page); + free_page((long) page_address(page)); + } + } + } +} + + +/** + * enough_free_mem - Make sure we enough free memory to snapshot. + * + * Returns TRUE or FALSE after checking the number of available + * free pages. 
+ */ + +static int enough_free_mem(unsigned nr_pages) +{ + pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); + return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); +} + + +static struct pbe *swsusp_alloc(unsigned nr_pages) +{ + struct pbe *pblist, *p; + + if (!(pblist = alloc_pagedir(nr_pages))) { + printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); + return NULL; + } + create_pbe_list(pblist, nr_pages); + + for_each_pbe (p, pblist) { + p->address = (unsigned long)alloc_image_page(); + if (!p->address) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return NULL; + } + } + + return pblist; +} + +asmlinkage int swsusp_save(void) +{ + unsigned nr_pages; + + pr_debug("swsusp: critical section: \n"); + if (save_highmem()) { + printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); + restore_highmem(); + return -ENOMEM; + } + + drain_local_pages(); + nr_pages = count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_pages); + + pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", + nr_pages, + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, + PAGES_FOR_IO, nr_free_pages()); + + /* This is needed because of the fixed size of swsusp_info */ + if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) + return -ENOSPC; + + if (!enough_free_mem(nr_pages)) { + printk(KERN_ERR "swsusp: Not enough free memory\n"); + return -ENOMEM; + } + + if (!enough_swap(nr_pages)) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } + + pagedir_nosave = swsusp_alloc(nr_pages); + if (!pagedir_nosave) + return -ENOMEM; + + /* During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(); + copy_data_pages(pagedir_nosave); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! 
Except we must write out our image of course. + */ + + nr_copy_pages = nr_pages; + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + return 0; +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f2bc71b9fe8b..12db1d2ad61f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -1,11 +1,10 @@ /* * linux/kernel/power/swsusp.c * - * This file is to realize architecture-independent - * machine suspend feature using pretty near only high-level routines + * This file provides code to write suspend image to swap and read it back. * * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> * * This file is released under the GPLv2. * @@ -31,6 +30,9 @@ * Alex Badea <vampire@go.ro>: * Fixed runaway init * + * Andreas Steinmetz <ast@domdv.de>: + * Added encrypted suspend option + * * More state savers are welcome. Especially for the scsi layer... 
* * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt @@ -44,11 +46,7 @@ #include <linux/utsname.h> #include <linux/version.h> #include <linux/delay.h> -#include <linux/reboot.h> #include <linux/bitops.h> -#include <linux/vt_kern.h> -#include <linux/kbd_kern.h> -#include <linux/keyboard.h> #include <linux/spinlock.h> #include <linux/genhd.h> #include <linux/kernel.h> @@ -60,10 +58,8 @@ #include <linux/swapops.h> #include <linux/bootmem.h> #include <linux/syscalls.h> -#include <linux/console.h> #include <linux/highmem.h> #include <linux/bio.h> -#include <linux/mount.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -71,18 +67,20 @@ #include <asm/tlbflush.h> #include <asm/io.h> -#include "power.h" +#include <linux/random.h> +#include <linux/crypto.h> +#include <asm/scatterlist.h> -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +#include "power.h" -/* Variables to be preserved over suspend */ -static int nr_copy_pages_check; +#define CIPHER "aes" +#define MAXKEY 32 +#define MAXIV 32 extern char resume_file[]; /* Local variables that should not be affected by save */ -static unsigned int nr_copy_pages __nosavedata = 0; +unsigned int nr_copy_pages __nosavedata = 0; /* Suspend pagedir is allocated before final copy, therefore it must be freed after resume @@ -98,12 +96,13 @@ static unsigned int nr_copy_pages __nosavedata = 0; MMU hardware. 
*/ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; -static suspend_pagedir_t *pagedir_save; +suspend_pagedir_t *pagedir_save; #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; + u8 key_iv[MAXKEY+MAXIV]; swp_entry_t swsusp_info; char orig_sig[10]; char sig[10]; @@ -112,12 +111,6 @@ static struct swsusp_header { static struct swsusp_info swsusp_info; /* - * XXX: We try to keep some more pages free so that I/O operations succeed - * without paging. Might this be more? - */ -#define PAGES_FOR_IO 512 - -/* * Saving part... */ @@ -129,6 +122,131 @@ static struct swsusp_info swsusp_info; static unsigned short swapfile_used[MAX_SWAPFILES]; static unsigned short root_swap; +static int write_page(unsigned long addr, swp_entry_t * loc); +static int bio_read_page(pgoff_t page_off, void * page); + +static u8 key_iv[MAXKEY+MAXIV]; + +#ifdef CONFIG_SWSUSP_ENCRYPT + +static int crypto_init(int mode, void **mem) +{ + int error = 0; + int len; + char *modemsg; + struct crypto_tfm *tfm; + + modemsg = mode ? 
"suspend not possible" : "resume not possible"; + + tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); + if(!tfm) { + printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); + error = -EINVAL; + goto out; + } + + if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { + printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); + error = -ENOKEY; + goto fail; + } + + if (mode) + get_random_bytes(key_iv, MAXKEY+MAXIV); + + len = crypto_tfm_alg_max_keysize(tfm); + if (len > MAXKEY) + len = MAXKEY; + + if (crypto_cipher_setkey(tfm, key_iv, len)) { + printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); + error = -EKEYREJECTED; + goto fail; + } + + len = crypto_tfm_alg_ivsize(tfm); + + if (MAXIV < len) { + printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); + error = -EOVERFLOW; + goto fail; + } + + crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); + + *mem=(void *)tfm; + + goto out; + +fail: crypto_free_tfm(tfm); +out: return error; +} + +static __inline__ void crypto_exit(void *mem) +{ + crypto_free_tfm((struct crypto_tfm *)mem); +} + +static __inline__ int crypto_write(struct pbe *p, void *mem) +{ + int error = 0; + struct scatterlist src, dst; + + src.page = virt_to_page(p->address); + src.offset = 0; + src.length = PAGE_SIZE; + dst.page = virt_to_page((void *)&swsusp_header); + dst.offset = 0; + dst.length = PAGE_SIZE; + + error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, + PAGE_SIZE); + + if (!error) + error = write_page((unsigned long)&swsusp_header, + &(p->swap_address)); + return error; +} + +static __inline__ int crypto_read(struct pbe *p, void *mem) +{ + int error = 0; + struct scatterlist src, dst; + + error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); + if (!error) { + src.offset = 0; + src.length = PAGE_SIZE; + dst.offset = 0; + dst.length = PAGE_SIZE; + src.page = dst.page = virt_to_page((void *)p->address); + + error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, + &src, PAGE_SIZE); + } + return 
error; +} +#else +static __inline__ int crypto_init(int mode, void *mem) +{ + return 0; +} + +static __inline__ void crypto_exit(void *mem) +{ +} + +static __inline__ int crypto_write(struct pbe *p, void *mem) +{ + return write_page(p->address, &(p->swap_address)); +} + +static __inline__ int crypto_read(struct pbe *p, void *mem) +{ + return bio_read_page(swp_offset(p->swap_address), (void *)p->address); +} +#endif + static int mark_swapfiles(swp_entry_t prev) { int error; @@ -140,6 +258,7 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); + memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); swsusp_header.swsusp_info = prev; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), @@ -179,9 +298,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */ len=strlen(resume_file); root_swap = 0xFFFF; - swap_list_lock(); + spin_lock(&swap_lock); for (i=0; i<MAX_SWAPFILES; i++) { - if (swap_info[i].flags == 0) { + if (!(swap_info[i].flags & SWP_WRITEOK)) { swapfile_used[i]=SWAPFILE_UNUSED; } else { if (!len) { @@ -202,7 +321,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ } } } - swap_list_unlock(); + spin_unlock(&swap_lock); return (root_swap != 0xffff) ? 0 : -ENODEV; } @@ -216,16 +335,16 @@ static void lock_swapdevices(void) { int i; - swap_list_lock(); + spin_lock(&swap_lock); for (i = 0; i< MAX_SWAPFILES; i++) if (swapfile_used[i] == SWAPFILE_IGNORED) { - swap_info[i].flags ^= 0xFF; + swap_info[i].flags ^= SWP_WRITEOK; } - swap_list_unlock(); + spin_unlock(&swap_lock); } /** - * write_swap_page - Write one page to a fresh swap location. + * write_page - Write one page to a fresh swap location. * @addr: Address we're writing. * @loc: Place to store the entry we used. 
* @@ -264,15 +383,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc) static void data_free(void) { swp_entry_t entry; - int i; + struct pbe * p; - for (i = 0; i < nr_copy_pages; i++) { - entry = (pagedir_nosave + i)->swap_address; + for_each_pbe(p, pagedir_nosave) { + entry = p->swap_address; if (entry.val) swap_free(entry); else break; - (pagedir_nosave + i)->swap_address = (swp_entry_t){0}; } } @@ -286,6 +404,10 @@ static int data_write(void) int error = 0, i = 0; unsigned int mod = nr_copy_pages / 100; struct pbe *p; + void *tfm; + + if ((error = crypto_init(1, &tfm))) + return error; if (!mod) mod = 1; @@ -294,11 +416,14 @@ static int data_write(void) for_each_pbe (p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = write_page(p->address, &(p->swap_address)))) + if ((error = crypto_write(p, tfm))) { + crypto_exit(tfm); return error; + } i++; } printk("\b\b\b\bdone\n"); + crypto_exit(tfm); return error; } @@ -385,7 +510,6 @@ static int write_pagedir(void) * write_suspend_image - Write entire image and metadata. 
* */ - static int write_suspend_image(void) { int error; @@ -400,6 +524,7 @@ static int write_suspend_image(void) if ((error = close_swap())) goto FreePagedir; Done: + memset(key_iv, 0, MAXKEY+MAXIV); return error; FreePagedir: free_pagedir_entries(); @@ -408,354 +533,6 @@ static int write_suspend_image(void) goto Done; } - -#ifdef CONFIG_HIGHMEM -struct highmem_page { - char *data; - struct page *page; - struct highmem_page *next; -}; - -static struct highmem_page *highmem_copy; - -static int save_highmem_zone(struct zone *zone) -{ - unsigned long zone_pfn; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - struct page *page; - struct highmem_page *save; - void *kaddr; - unsigned long pfn = zone_pfn + zone->zone_start_pfn; - - if (!(pfn%1000)) - printk("."); - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - /* - * This condition results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. This should be - * corrected eventually when the cases giving rise to this - * are better understood. 
- */ - if (PageReserved(page)) { - printk("highmem reserved page?!\n"); - continue; - } - BUG_ON(PageNosave(page)); - if (PageNosaveFree(page)) - continue; - save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); - if (!save) - return -ENOMEM; - save->next = highmem_copy; - save->page = page; - save->data = (void *) get_zeroed_page(GFP_ATOMIC); - if (!save->data) { - kfree(save); - return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); - memcpy(save->data, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - highmem_copy = save; - } - return 0; -} -#endif /* CONFIG_HIGHMEM */ - - -static int save_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int res = 0; - - pr_debug("swsusp: Saving Highmem\n"); - for_each_zone (zone) { - if (is_highmem(zone)) - res = save_highmem_zone(zone); - if (res) - return res; - } -#endif - return 0; -} - -static int restore_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - printk("swsusp: Restoring Highmem\n"); - while (highmem_copy) { - struct highmem_page *save = highmem_copy; - void *kaddr; - highmem_copy = save->next; - - kaddr = kmap_atomic(save->page, KM_USER0); - memcpy(kaddr, save->data, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - free_page((long) save->data); - kfree(save); - } -#endif - return 0; -} - - -static int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -/** - * saveable - Determine whether a page should be cloned or not. - * @pfn: The page - * - * We save a page if it's Reserved, and not in the range of pages - * statically defined as 'unsaveable', or if it isn't reserved, and - * isn't part of a free chunk of pages. 
- */ - -static int saveable(struct zone * zone, unsigned long * zone_pfn) -{ - unsigned long pfn = *zone_pfn + zone->zone_start_pfn; - struct page * page; - - if (!pfn_valid(pfn)) - return 0; - - page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); - if (PageNosave(page)) - return 0; - if (PageReserved(page) && pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); - return 0; - } - if (PageNosaveFree(page)) - return 0; - - return 1; -} - -static void count_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - - nr_copy_pages = 0; - - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - nr_copy_pages += saveable(zone, &zone_pfn); - } -} - - -static void copy_data_pages(void) -{ - struct zone *zone; - unsigned long zone_pfn; - struct pbe * pbe = pagedir_nosave; - - pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); - for_each_zone (zone) { - if (is_highmem(zone)) - continue; - mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - if (saveable(zone, &zone_pfn)) { - struct page * page; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); - BUG_ON(!pbe); - pbe->orig_address = (long) page_address(page); - /* copy_page is not usable for copying task structs. */ - memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); - pbe = pbe->next; - } - } - } - BUG_ON(pbe); -} - - -/** - * calc_nr - Determine the number of pages needed for a pbe list. 
- */ - -static int calc_nr(int nr_copy) -{ - int extra = 0; - int mod = !!(nr_copy % PBES_PER_PAGE); - int diff = (nr_copy / PBES_PER_PAGE) + mod; - - do { - extra += diff; - nr_copy += diff; - mod = !!(nr_copy % PBES_PER_PAGE); - diff = (nr_copy / PBES_PER_PAGE) + mod - extra; - } while (diff > 0); - - return nr_copy; -} - -/** - * free_pagedir - free pages allocated with alloc_pagedir() - */ - -static inline void free_pagedir(struct pbe *pblist) -{ - struct pbe *pbe; - - while (pblist) { - pbe = (pblist + PB_PAGE_SKIP)->next; - free_page((unsigned long)pblist); - pblist = pbe; - } -} - -/** - * fill_pb_page - Create a list of PBEs on a given memory page - */ - -static inline void fill_pb_page(struct pbe *pbpage) -{ - struct pbe *p; - - p = pbpage; - pbpage += PB_PAGE_SKIP; - do - p->next = p + 1; - while (++p < pbpage); -} - -/** - * create_pbe_list - Create a list of PBEs on top of a given chain - * of memory pages allocated with alloc_pagedir() - */ - -static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) -{ - struct pbe *pbpage, *p; - unsigned num = PBES_PER_PAGE; - - for_each_pb_page (pbpage, pblist) { - if (num >= nr_pages) - break; - - fill_pb_page(pbpage); - num += PBES_PER_PAGE; - } - if (pbpage) { - for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) - p->next = p + 1; - p->next = NULL; - } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); -} - -/** - * alloc_pagedir - Allocate the page directory. - * - * First, determine exactly how many pages we need and - * allocate them. - * - * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE - * struct pbe elements (pbes) and the last element in the page points - * to the next page. - * - * On each page we set up a list of struct_pbe elements. 
- */ - -static struct pbe * alloc_pagedir(unsigned nr_pages) -{ - unsigned num; - struct pbe *pblist, *pbe; - - if (!nr_pages) - return NULL; - - pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); - pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; - pbe = pbe->next, num += PBES_PER_PAGE) { - pbe += PB_PAGE_SKIP; - pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - } - if (!pbe) { /* get_zeroed_page() failed */ - free_pagedir(pblist); - pblist = NULL; - } - return pblist; -} - -/** - * free_image_pages - Free pages allocated for snapshot - */ - -static void free_image_pages(void) -{ - struct pbe * p; - - for_each_pbe (p, pagedir_save) { - if (p->address) { - ClearPageNosave(virt_to_page(p->address)); - free_page(p->address); - p->address = 0; - } - } -} - -/** - * alloc_image_pages - Allocate pages for the snapshot. - */ - -static int alloc_image_pages(void) -{ - struct pbe * p; - - for_each_pbe (p, pagedir_save) { - p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); - if (!p->address) - return -ENOMEM; - SetPageNosave(virt_to_page(p->address)); - } - return 0; -} - -void swsusp_free(void) -{ - BUG_ON(PageNosave(virt_to_page(pagedir_save))); - BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); - free_image_pages(); - free_pagedir(pagedir_save); -} - - -/** - * enough_free_mem - Make sure we enough free memory to snapshot. - * - * Returns TRUE or FALSE after checking the number of available - * free pages. - */ - -static int enough_free_mem(void) -{ - if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough free pages: Have %d\n", - nr_free_pages()); - return 0; - } - return 1; -} - - /** * enough_swap - Make sure we have enough swap to save the image. * @@ -766,83 +543,14 @@ static int enough_free_mem(void) * We should only consider resume_device. 
*/ -static int enough_swap(void) +int enough_swap(unsigned nr_pages) { struct sysinfo i; si_swapinfo(&i); - if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { - pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); - return 0; - } - return 1; -} - -static int swsusp_alloc(void) -{ - int error; - - pagedir_nosave = NULL; - nr_copy_pages = calc_nr(nr_copy_pages); - - pr_debug("suspend: (pages needed: %d + %d free: %d)\n", - nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); - - if (!enough_free_mem()) - return -ENOMEM; - - if (!enough_swap()) - return -ENOSPC; - - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { - printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return -ENOMEM; - } - create_pbe_list(pagedir_save, nr_copy_pages); - pagedir_nosave = pagedir_save; - if ((error = alloc_image_pages())) { - printk(KERN_ERR "suspend: Allocating image pages failed.\n"); - swsusp_free(); - return error; - } - - nr_copy_pages_check = nr_copy_pages; - return 0; -} - -static int suspend_prepare_image(void) -{ - int error; - - pr_debug("swsusp: critical section: \n"); - if (save_highmem()) { - printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); - restore_highmem(); - return -ENOMEM; - } - - drain_local_pages(); - count_data_pages(); - printk("swsusp: Need to copy %u pages\n", nr_copy_pages); - - error = swsusp_alloc(); - if (error) - return error; - - /* During allocating of suspend pagedir, new cold pages may appear. - * Kill them. - */ - drain_local_pages(); - copy_data_pages(); - - /* - * End of critical section. From now on, we can write to memory, - * but we should not touch disk. This specially means we must _not_ - * touch swap space! Except we must write out our image of course. 
- */ - - printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); - return 0; + pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); + return i.freeswap > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } @@ -854,7 +562,7 @@ static int suspend_prepare_image(void) int swsusp_write(void) { int error; - device_resume(); + lock_swapdevices(); error = write_suspend_image(); /* This will unlock ignored swap devices since writing is finished */ @@ -864,14 +572,6 @@ int swsusp_write(void) } -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - - -asmlinkage int swsusp_save(void) -{ - return suspend_prepare_image(); -} int swsusp_suspend(void) { @@ -886,23 +586,23 @@ int swsusp_suspend(void) * at resume time, and evil weirdness ensues. */ if ((error = device_power_down(PMSG_FREEZE))) { + printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); local_irq_enable(); return error; } if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " - "swapon -a!\n"); + printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); + device_power_up(); local_irq_enable(); return error; } save_processor_state(); if ((error = swsusp_arch_suspend())) - printk("Error %d suspending\n", error); + printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); - BUG_ON (nr_copy_pages_check != nr_copy_pages); restore_highmem(); device_power_up(); local_irq_enable(); @@ -922,8 +622,14 @@ int swsusp_resume(void) * execution continues at place where swsusp_arch_suspend was called */ BUG_ON(!error); + /* The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures + */ + swsusp_free(); restore_processor_state(); restore_highmem(); + touch_softlockup_watchdog(); 
device_power_up(); local_irq_enable(); return error; @@ -936,54 +642,28 @@ int swsusp_resume(void) * * We don't know which pages are usable until we allocate them. * - * Allocated but unusable (ie eaten) memory pages are linked together - * to create a list, so that we can free them easily - * - * We could have used a type other than (void *) - * for this purpose, but ... + * Allocated but unusable (ie eaten) memory pages are marked so that + * swsusp_free() can release them */ -static void **eaten_memory = NULL; - -static inline void eat_page(void *page) -{ - void **c; - c = eaten_memory; - eaten_memory = page; - *eaten_memory = c; -} - -static unsigned long get_usable_page(unsigned gfp_mask) +unsigned long get_safe_page(gfp_t gfp_mask) { unsigned long m; - m = get_zeroed_page(gfp_mask); - while (!PageNosaveFree(virt_to_page(m))) { - eat_page((void *)m); + do { m = get_zeroed_page(gfp_mask); - if (!m) - break; + if (m && PageNosaveFree(virt_to_page(m))) + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(m)); + } while (m && PageNosaveFree(virt_to_page(m))); + if (m) { + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(m)); + SetPageNosaveFree(virt_to_page(m)); } return m; } -static void free_eaten_memory(void) -{ - unsigned long m; - void **c; - int i = 0; - - c = eaten_memory; - while (c) { - m = (unsigned long)c; - c = *c; - free_page(m); - i++; - } - eaten_memory = NULL; - pr_debug("swsusp: %d unused pages freed\n", i); -} - /** * check_pagedir - We ensure here that pages that the PBEs point to * won't collide with pages where we're going to restore from the loaded @@ -1001,7 +681,7 @@ static int check_pagedir(struct pbe *pblist) p->address = 0UL; for_each_pbe (p, pblist) { - p->address = get_usable_page(GFP_ATOMIC); + p->address = get_safe_page(GFP_ATOMIC); if (!p->address) return -ENOMEM; } @@ -1020,7 +700,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) unsigned long zone_pfn; struct pbe *pbpage, *tail, *p; void 
*m; - int rel = 0, error = 0; + int rel = 0; if (!pblist) /* a sanity check */ return NULL; @@ -1028,41 +708,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", swsusp_info.pagedir_pages); - /* Set page flags */ + /* Clear page flags */ for_each_zone (zone) { for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - SetPageNosaveFree(pfn_to_page(zone_pfn + + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) + ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); } - /* Clear orig addresses */ + /* Mark orig addresses */ for_each_pbe (p, pblist) - ClearPageNosaveFree(virt_to_page(p->orig_address)); + SetPageNosaveFree(virt_to_page(p->orig_address)); tail = pblist + PB_PAGE_SKIP; /* Relocate colliding pages */ for_each_pb_page (pbpage, pblist) { - if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { - m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); - if (!m) { - error = -ENOMEM; - break; - } + if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) { + m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD); + if (!m) + return NULL; memcpy(m, (void *)pbpage, PAGE_SIZE); if (pbpage == pblist) pblist = (struct pbe *)m; else tail->next = (struct pbe *)m; - - eat_page((void *)pbpage); pbpage = (struct pbe *)m; /* We have to link the PBEs again */ - for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) if (p->next) /* needed to save the end */ p->next = p + 1; @@ -1072,14 +748,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) tail = pbpage + PB_PAGE_SKIP; } - if (error) { - printk("\nswsusp: Out of memory\n\n"); - free_pagedir(pblist); - free_eaten_memory(); - pblist = NULL; + /* This is for swsusp_free() */ + for_each_pb_page (pbpage, pblist) { + SetPageNosave(virt_to_page(pbpage)); + SetPageNosaveFree(virt_to_page(pbpage)); } - else - printk("swsusp: Relocated %d pages\n", rel); + + printk("swsusp: Relocated %d pages\n", rel); return pblist; } @@ 
-1179,7 +854,8 @@ static const char * sanity_check(void) if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) return "machine"; #if 0 - if(swsusp_info.cpus != num_online_cpus()) + /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ + if (swsusp_info.cpus != num_possible_cpus()) return "number of cpus"; #endif return NULL; @@ -1212,13 +888,14 @@ static int check_sig(void) return error; if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); + memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); /* * Reset swap signature now. */ error = bio_write_page(0, &swsusp_header); } else { - printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); return -EINVAL; } if (!error) @@ -1239,6 +916,10 @@ static int data_read(struct pbe *pblist) int error = 0; int i = 0; int mod = swsusp_info.image_pages / 100; + void *tfm; + + if ((error = crypto_init(0, &tfm))) + return error; if (!mod) mod = 1; @@ -1250,14 +931,15 @@ static int data_read(struct pbe *pblist) if (!(i % mod)) printk("\b\b\b\b%3d%%", i / mod); - error = bio_read_page(swp_offset(p->swap_address), - (void *)p->address); - if (error) + if ((error = crypto_read(p, tfm))) { + crypto_exit(tfm); return error; + } i++; } printk("\b\b\b\bdone\n"); + crypto_exit(tfm); return error; } @@ -1290,10 +972,8 @@ static int read_pagedir(struct pbe *pblist) break; } - if (error) - free_page((unsigned long)pblist); - - BUG_ON(i != swsusp_info.pagedir_pages); + if (!error) + BUG_ON(i != swsusp_info.pagedir_pages); return error; } @@ -1331,18 +1011,10 @@ static int read_suspend_image(void) /* Allocate memory for the image and read the data from swap */ error = check_pagedir(pagedir_nosave); - free_eaten_memory(); + if (!error) error = data_read(pagedir_nosave); - if (error) { /* We fail cleanly */ - for_each_pbe (p, pagedir_nosave) - if (p->address) { - free_page(p->address); - p->address = 0UL; 
- } - free_pagedir(pagedir_nosave); - } return error; } @@ -1385,6 +1057,7 @@ int swsusp_read(void) error = read_suspend_image(); blkdev_put(resume_bdev); + memset(key_iv, 0, MAXKEY+MAXIV); if (!error) pr_debug("swsusp: Reading resume file was successful\n"); diff --git a/kernel/printk.c b/kernel/printk.c index 5092397fac29..3cb9708209bc 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -10,7 +10,7 @@ * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn. - * Fixed SMP synchronization, 08/08/99, Manfred Spraul + * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfreds@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton <andrewm@uow.edu.au> @@ -148,7 +148,7 @@ static int __init console_setup(char *str) if (!strcmp(str, "ttyb")) strcpy(name, "ttyS1"); #endif - for(s = name; *s; s++) + for (s = name; *s; s++) if ((*s >= '0' && *s <= '9') || *s == ',') break; idx = simple_strtoul(s, NULL, 10); @@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) size = roundup_pow_of_two(size); if (size > log_buf_len) { unsigned long start, dest_idx, offset; - char * new_log_buf; + char *new_log_buf; new_log_buf = alloc_bootmem(size); if (!new_log_buf) { - printk("log_buf_len: allocation failed\n"); + printk(KERN_WARNING "log_buf_len: allocation failed\n"); goto out; } @@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) log_end -= offset; spin_unlock_irqrestore(&logbuf_lock, flags); - printk("log_buf_len: %d\n", log_buf_len); + printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); } out: - return 1; } @@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user * buf, int len) +int do_syslog(int type, char __user *buf, int len) { unsigned long i, j, limit, count; int do_clear = 0; @@ -244,7 
+243,8 @@ int do_syslog(int type, char __user * buf, int len) error = -EFAULT; goto out; } - error = wait_event_interruptible(log_wait, (log_start - log_end)); + error = wait_event_interruptible(log_wait, + (log_start - log_end)); if (error) goto out; i = 0; @@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) error = i; break; case 4: /* Read/clear last kernel messages */ - do_clear = 1; + do_clear = 1; /* FALL THRU */ case 3: /* Read last kernel messages */ error = -EINVAL; @@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) limit = log_end; /* * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages + * printk() could overwrite the messages * we try to copy to user space. Therefore * the messages are copied in reverse. <manfreds> */ - for(i = 0; i < count && !error; i++) { + for (i = 0; i < count && !error; i++) { j = limit-1-i; if (j + log_buf_len < log_end) break; @@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) if (error) break; error = i; - if(i != count) { + if (i != count) { int offset = count-error; /* buffer overflow during copy, correct user buffer. 
*/ - for(i=0;i<error;i++) { + for (i = 0; i < error; i++) { if (__get_user(c,&buf[i+offset]) || __put_user(c,&buf[i])) { error = -EFAULT; @@ -351,7 +351,7 @@ out: return error; } -asmlinkage long sys_syslog(int type, char __user * buf, int len) +asmlinkage long sys_syslog(int type, char __user *buf, int len) { return do_syslog(type, buf, len); } @@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end) cur_index = start; start_print = start; while (cur_index != end) { - if ( msg_level < 0 && - ((end - cur_index) > 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') - { + if (msg_level < 0 && ((end - cur_index) > 2) && + LOG_BUF(cur_index + 0) == '<' && + LOG_BUF(cur_index + 1) >= '0' && + LOG_BUF(cur_index + 1) <= '7' && + LOG_BUF(cur_index + 2) == '>') { msg_level = LOG_BUF(cur_index + 1) - '0'; cur_index += 3; start_print = cur_index; } while (cur_index != end) { char c = LOG_BUF(cur_index); - cur_index++; + cur_index++; if (c == '\n') { if (msg_level < 0) { /* @@ -461,7 +459,7 @@ static void zap_locks(void) static unsigned long oops_timestamp; if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30*HZ)) + !time_after(jiffies, oops_timestamp + 30 * HZ)) return; oops_timestamp = jiffies; @@ -488,9 +486,14 @@ static int __init printk_time_setup(char *str) __setup("time", printk_time_setup); +__attribute__((weak)) unsigned long long printk_clock(void) +{ + return sched_clock(); +} + /* * This is printk. It can be called from any context. We want it to work. - * + * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output * into the log buffer and return. The current holder of the console_sem will @@ -514,6 +517,9 @@ asmlinkage int printk(const char *fmt, ...) 
return r; } +/* cpu currently holding logbuf_lock */ +static volatile unsigned int printk_cpu = UINT_MAX; + asmlinkage int vprintk(const char *fmt, va_list args) { unsigned long flags; @@ -522,11 +528,15 @@ asmlinkage int vprintk(const char *fmt, va_list args) static char printk_buf[1024]; static int log_level_unknown = 1; - if (unlikely(oops_in_progress)) + preempt_disable(); + if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) + /* If a crash is occurring during printk() on this CPU, + * make sure we can't deadlock */ zap_locks(); /* This stops the holder of console_sem just where we want him */ spin_lock_irqsave(&logbuf_lock, flags); + printk_cpu = smp_processor_id(); /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); @@ -558,7 +568,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) loglev_char = default_message_loglevel + '0'; } - t = sched_clock(); + t = printk_clock(); nanosec_rem = do_div(t, 1000000000); tlen = sprintf(tbuf, "<%c>[%5lu.%06lu] ", @@ -595,6 +605,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * CPU until it is officially up. We shouldn't be calling into * random console drivers on a CPU which doesn't exist yet.. */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); goto out; } @@ -604,6 +615,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * We own the drivers. We can drop the spinlock and let * release_console_sem() print the text */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); console_may_schedule = 0; release_console_sem(); @@ -613,9 +625,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) * allows the semaphore holder to proceed and to call the * console drivers with the output which we just produced. 
*/ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } out: + preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); @@ -623,13 +637,19 @@ EXPORT_SYMBOL(vprintk); #else -asmlinkage long sys_syslog(int type, char __user * buf, int len) +asmlinkage long sys_syslog(int type, char __user *buf, int len) { return 0; } -int do_syslog(int type, char __user * buf, int len) { return 0; } -static void call_console_drivers(unsigned long start, unsigned long end) {} +int do_syslog(int type, char __user *buf, int len) +{ + return 0; +} + +static void call_console_drivers(unsigned long start, unsigned long end) +{ +} #endif @@ -835,9 +855,9 @@ EXPORT_SYMBOL(console_start); * print any messages that were printed by the kernel before the * console driver was initialized. */ -void register_console(struct console * console) +void register_console(struct console *console) { - int i; + int i; unsigned long flags; if (preferred_console < 0) @@ -862,7 +882,8 @@ void register_console(struct console * console) * See if this console matches one we selected on * the command line. */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; + i++) { if (strcmp(console_cmdline[i].name, console->name) != 0) continue; if (console->index >= 0 && @@ -917,9 +938,9 @@ void register_console(struct console * console) } EXPORT_SYMBOL(register_console); -int unregister_console(struct console * console) +int unregister_console(struct console *console) { - struct console *a,*b; + struct console *a, *b; int res = 1; acquire_console_sem(); @@ -933,10 +954,10 @@ int unregister_console(struct console * console) b->next = a->next; res = 0; break; - } + } } } - + /* If last console is removed, we re-enable picking the first * one that gets registered. Without that, pmac early boot console * would prevent fbcon from taking over. 
@@ -978,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned long toks = 10*5*HZ; + static unsigned long toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; unsigned long flags; @@ -991,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) toks = ratelimit_burst * ratelimit_jiffies; if (toks >= ratelimit_jiffies) { int lost = missed; + missed = 0; toks -= ratelimit_jiffies; spin_unlock_irqrestore(&ratelimit_lock, flags); @@ -1005,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) EXPORT_SYMBOL(__printk_ratelimit); /* minimum time in jiffies between messages */ -int printk_ratelimit_jiffies = 5*HZ; +int printk_ratelimit_jiffies = 5 * HZ; /* number of messages we send before ratelimiting */ int printk_ratelimit_burst = 10; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8dcb8f6288bc..863eee8bff47 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) signal_wake_up(child, 1); } } + if (child->signal->flags & SIGNAL_GROUP_EXIT) { + sigaddset(&child->pending.signal, SIGKILL); + signal_wake_up(child, 1); + } spin_unlock(&child->sighand->siglock); } @@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child) SET_LINKS(child); } - if (child->state == TASK_TRACED) - ptrace_untrace(child); + ptrace_untrace(child); } /* @@ -118,6 +121,33 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } +static int may_attach(struct task_struct *task) +{ + if (!task->mm) + return -EPERM; + if (((current->uid != task->euid) || + (current->uid != task->suid) || + (current->uid != task->uid) || + (current->gid != task->egid) || + (current->gid != task->sgid) || + (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + return -EPERM; + smp_rmb(); + if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + 
return -EPERM; + + return security_ptrace(current, task); +} + +int ptrace_may_attach(struct task_struct *task) +{ + int err; + task_lock(task); + err = may_attach(task); + task_unlock(task); + return !err; +} + int ptrace_attach(struct task_struct *task) { int retval; @@ -127,22 +157,10 @@ int ptrace_attach(struct task_struct *task) goto bad; if (task == current) goto bad; - if (!task->mm) - goto bad; - if(((current->uid != task->euid) || - (current->uid != task->suid) || - (current->uid != task->uid) || - (current->gid != task->egid) || - (current->gid != task->sgid) || - (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) - goto bad; - smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) - goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = security_ptrace(current, task); + retval = may_attach(task); if (retval) goto bad; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f436993bd590..c4d159a21e04 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/rcupdate.h> +#include <linux/rcuref.h> #include <linux/cpu.h> /* Definition for rcupdate control block. */ @@ -70,7 +71,20 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -static int maxbatch = 10; +static int maxbatch = 10000; + +#ifndef __HAVE_ARCH_CMPXCHG +/* + * We use an array of spinlocks for the rcurefs -- similar to ones in sparc + * 32 bit atomic_t implementations, and a hash function similar to that + * for our refcounting needs. + * Can't help multiprocessors which donot have cmpxchg :( + */ + +spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { + [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED +}; +#endif /** * call_rcu - Queue an RCU callback for invocation after a grace period. 
@@ -95,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head, rdp = &__get_cpu_var(rcu_data); *rdp->nxttail = head; rdp->nxttail = &head->next; + + if (unlikely(++rdp->count > 10000)) + set_need_resched(); + local_irq_restore(flags); } @@ -126,10 +144,25 @@ void fastcall call_rcu_bh(struct rcu_head *head, rdp = &__get_cpu_var(rcu_bh_data); *rdp->nxttail = head; rdp->nxttail = &head->next; + rdp->count++; +/* + * Should we directly call rcu_do_batch() here ? + * if (unlikely(rdp->count > 10000)) + * rcu_do_batch(rdp); + */ local_irq_restore(flags); } /* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. + */ +long rcu_batches_completed(void) +{ + return rcu_ctrlblk.completed; +} + +/* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. */ @@ -143,6 +176,7 @@ static void rcu_do_batch(struct rcu_data *rdp) next = rdp->donelist = list->next; list->func(list); list = next; + rdp->count--; if (++count >= maxbatch) break; } @@ -476,6 +510,7 @@ void synchronize_kernel(void) } module_param(maxbatch, int, 0); +EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 000000000000..9b58f1eff3ca --- /dev/null +++ b/kernel/rcutorture.c @@ -0,0 +1,492 @@ +/* + * Read-Copy Update /proc-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * + * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * + * See also: Documentation/RCU/torture.txt + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <asm/atomic.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/rcuref.h> +#include <linux/cpu.h> +#include <linux/random.h> +#include <linux/delay.h> +#include <linux/byteorder/swabb.h> +#include <linux/stat.h> + +MODULE_LICENSE("GPL"); + +static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ +static int stat_interval = 0; /* Interval between stats, in seconds. */ + /* Defaults to "only at end of test". */ +static int verbose = 0; /* Print more debug info. */ + +MODULE_PARM(nreaders, "i"); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +MODULE_PARM(stat_interval, "i"); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +MODULE_PARM(verbose, "i"); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +#define TORTURE_FLAG "rcutorture: " +#define PRINTK_STRING(s) \ + do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! 
" s "\n"); } while (0) + +static char printk_buf[4096]; + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; +}; + +static int fullstop = 0; /* stop generating callbacks at test end. */ +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture *rcu_torture_current = NULL; +static long rcu_torture_current_version = 0; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = + { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = + { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +atomic_t n_rcu_torture_alloc; +atomic_t n_rcu_torture_alloc_fail; +atomic_t n_rcu_torture_free; + +/* + * Allocate an element from the rcu_tortures pool. + */ +struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock(&rcu_torture_lock); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop) { + /* Test is ending, just drop callbacks on the floor. 
*/ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) + rcu_torture_free(rp); + else + call_rcu(p, rcu_torture_cb); +} + +struct rcu_random_state { + unsigned long rrs_state; + unsigned long rrs_count; +}; + +#define RCU_RANDOM_MULT 39916801 /* prime */ +#define RCU_RANDOM_ADD 479001701 /* prime */ +#define RCU_RANDOM_REFRESH 10000 + +#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static long +rcu_random(struct rcu_random_state *rrsp) +{ + long refresh; + + if (--rrsp->rrs_count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rrsp->rrs_state += refresh; + rrsp->rrs_count = RCU_RANDOM_REFRESH; + } + rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; + return swahw32(rrsp->rrs_state); +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). 
+ */ +static int +rcu_torture_writer(void *arg) +{ + int i; + long oldbatch = rcu_batches_completed(); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + do { + schedule_timeout_uninterruptible(1); + if (rcu_batches_completed() == oldbatch) + continue; + if ((rp = rcu_torture_alloc()) == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(rcu_random(&rand) & 0x3ff); + old_rp = rcu_torture_current; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); + if (old_rp != NULL) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); + } + rcu_torture_current_version++; + oldbatch = rcu_batches_completed(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + DEFINE_RCU_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + do { + rcu_read_lock(); + completed = rcu_batches_completed(); + p = rcu_dereference(rcu_torture_current); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + rcu_read_unlock(); + schedule_timeout_interruptible(HZ); + continue; + } + udelay(rcu_random(&rand) & 0x7f); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... 
*/ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_count)[pipe_count]; + completed = rcu_batches_completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_batch)[completed]; + preempt_enable(); + rcu_read_unlock(); + schedule(); + } while (!kthread_should_stop() && !fullstop); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer. + */ +static int +rcu_torture_printk(char *page) +{ + int cnt = 0; + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + if (pipesummary[i] != 0) + break; + } + cnt += sprintf(&page[cnt], "rcutorture: "); + cnt += sprintf(&page[cnt], + "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + if (i > 1) + cnt += sprintf(&page[cnt], "!!! 
"); + cnt += sprintf(&page[cnt], "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); + cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + cnt += sprintf(&page[cnt], " %d", + atomic_read(&rcu_torture_wcount[i])); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + */ +static void +rcu_torture_stats_print(void) +{ + int cnt; + + cnt = rcu_torture_printk(printk_buf); + printk(KERN_ALERT "%s", printk_buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. 
+ */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static void +rcu_torture_cleanup(void) +{ + int i; + + fullstop = 1; + if (writer_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks != NULL) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (stats_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + /* Wait for all RCU callbacks to fire. */ + + for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + synchronize_rcu(); + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + PRINTK_STRING("--- End of test"); +} + +static int +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + + /* Process args and tell the world that the torturer is on the job. */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + printk(KERN_ALERT TORTURE_FLAG + "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", + nrealreaders, stat_interval, verbose); + fullstop = 0; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. 
*/ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. */ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_run(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + return 0; + +unwind: + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/kernel/resource.c b/kernel/resource.c index 26967e042201..92285d822de6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource); */ struct 
resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) { - struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (res) { - memset(res, 0, sizeof(*res)); res->name = name; res->start = start; res->end = start + n - 1; diff --git a/kernel/sched.c b/kernel/sched.c index a646e4f36c41..b4f4eb613537 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -294,6 +294,10 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) { +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif spin_unlock_irq(&rq->lock); } @@ -875,7 +879,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t * p) +void wait_task_inactive(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -966,8 +970,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; int i; + /* Skip over this group if it has no CPUs allowed */ + if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + goto nextgroup; + local_group = cpu_isset(this_cpu, group->cpumask); - /* XXX: put a cpus allowed check */ /* Tally up the load of all CPUs in the group */ avg_load = 0; @@ -992,6 +999,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) min_load = avg_load; idlest = group; } +nextgroup: group = group->next; } while (group != sd->groups); @@ -1003,13 +1011,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* * find_idlest_queue - find the idlest runqueue among the cpus in group. 
*/ -static int find_idlest_cpu(struct sched_group *group, int this_cpu) +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { + cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; - for_each_cpu_mask(i, group->cpumask) { + /* Traverse only the allowed CPUs */ + cpus_and(tmp, group->cpumask, p->cpus_allowed); + + for_each_cpu_mask(i, tmp) { load = source_load(i, 0); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1052,7 +1065,7 @@ static int sched_balance_self(int cpu, int flag) if (!group) goto nextlevel; - new_cpu = find_idlest_cpu(group, cpu); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) goto nextlevel; @@ -1127,7 +1140,7 @@ static inline int wake_idle(int cpu, task_t *p) * * returns failure only if the task is already active. */ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int try_to_wake_up(task_t *p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1252,6 +1265,16 @@ out_activate: } /* + * Tasks that have marked their sleep as noninteractive get + * woken up without updating their sleep average. (i.e. their + * sleep is handled in a priority-neutral manner, no priority + * boost and no penalty.) + */ + if (old_state & TASK_NONINTERACTIVE) + __activate_task(p, rq); + else + activate_task(p, rq, cpu == this_cpu); + /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) * don't trigger a preemption, if the woken up task will run on @@ -1259,7 +1282,6 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) 
*/ - activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1274,7 +1296,7 @@ out: return success; } -int fastcall wake_up_process(task_t * p) +int fastcall wake_up_process(task_t *p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); @@ -1353,7 +1375,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1436,7 +1458,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t * p) +void fastcall sched_exit(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -1446,7 +1468,7 @@ void fastcall sched_exit(task_t * p) * the sleep_avg of the parent as well. */ rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { + if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { p->parent->time_slice += p->time_slice; if (unlikely(p->parent->time_slice > task_timeslice(p))) p->parent->time_slice = task_timeslice(p); @@ -1478,6 +1500,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) /** * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. 
* * finish_task_switch must be called after the context switch, paired @@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, int *all_pinned) + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { /* * We do not migrate tasks that are: @@ -1882,10 +1906,11 @@ out: */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) + unsigned long *imbalance, enum idle_type idle, int *sd_idle) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + unsigned long max_pull; int load_idx; max_load = this_load = total_load = total_pwr = 0; @@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + if (*sd_idle && !idle_cpu(i)) + *sd_idle = 0; + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load) + if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; @@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. 
*/ + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); + /* How much load to actually move to equalise the imbalance */ - *imbalance = min((max_load - avg_load) * busiest->cpu_power, + *imbalance = min(max_pull * busiest->cpu_power, (avg_load - this_load) * this->cpu_power) / SCHED_LOAD_SCALE; @@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, unsigned long imbalance; int nr_moved, all_pinned = 0; int active_balance = 0; + int sd_idle = 0; + + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); - group = find_busiest_group(sd, this_cpu, &imbalance, idle); + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. 
*/ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, - &all_pinned); - spin_unlock(&busiest->lock); + imbalance, sd, idle, &all_pinned); + double_rq_unlock(this_rq, busiest); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) goto out_balanced; } - spin_unlock(&this_rq->lock); - if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { spin_lock(&busiest->lock); + + /* don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + spin_unlock(&busiest->lock); + all_pinned = 1; + goto out_one_pinned; + } + if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; @@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->balance_interval *= 2; } + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; + +out_one_pinned: /* tune up the balancing interval */ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; return 0; } @@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, runqueue_t *busiest = NULL; unsigned long imbalance; int nr_moved = 0; + int sd_idle = 0; + + if (sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 
&sd_idle); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, BUG_ON(busiest == this_rq); - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); - nr_moved = move_tasks(this_rq, this_cpu, busiest, + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, NEWLY_IDLE, NULL); - if (!nr_moved) + spin_unlock(&busiest->lock); + } + + if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - else + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; + } else sd->nr_balance_failed = 0; - spin_unlock(&busiest->lock); return nr_moved; out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; sd->nr_balance_failed = 0; return 0; } @@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, if (j - sd->last_balance >= interval) { if (load_balance(this_cpu, this_rq, sd, idle)) { - /* We've pulled tasks over so no longer idle */ + /* + * We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is + * not idle. 
+ */ idle = NOT_IDLE; } sd->last_balance += interval; @@ -2449,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->idle = cputime64_add(cpustat->idle, tmp); /* Account for system time used */ acct_update_integrals(p); - /* Update rss highwater mark */ - update_mem_hiwater(p); } /* @@ -2575,6 +2635,13 @@ out: } #ifdef CONFIG_SCHED_SMT +static inline void wakeup_busy_runqueue(runqueue_t *rq) +{ + /* If an SMT runqueue is sleeping due to priority reasons wake it up */ + if (rq->curr == rq->idle && rq->nr_running) + resched_task(rq->idle); +} + static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2608,12 +2675,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); - /* - * If an SMT sibling task is sleeping due to priority - * reasons wake it up now. - */ - if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) - resched_task(smt_rq->idle); + wakeup_busy_runqueue(smt_rq); } for_each_cpu_mask(i, sibling_map) @@ -2624,6 +2686,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) */ } +/* + * number of 'lost' timeslices this task wont be able to fully + * utilize, if another task runs on a sibling. 
This models the + * slowdown effect of other tasks running on siblings: + */ +static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) +{ + return p->time_slice * (100 - sd->per_cpu_gain) / 100; +} + static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2667,6 +2739,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) runqueue_t *smt_rq = cpu_rq(i); task_t *smt_curr = smt_rq->curr; + /* Kernel threads do not participate in dependent sleeping */ + if (!p->mm || !smt_curr->mm || rt_task(p)) + goto check_smt_task; + /* * If a user task with lower static priority than the * running task on the SMT sibling is trying to schedule, @@ -2675,21 +2751,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) - ret = 1; + if (rt_task(smt_curr)) { + /* + * With real time tasks we run non-rt tasks only + * per_cpu_gain% of the time. + */ + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + ret = 1; + } else + if (smt_curr->static_prio < p->static_prio && + !TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(smt_curr, sd) > task_timeslice(p)) + ret = 1; + +check_smt_task: + if ((!smt_curr->mm && smt_curr != smt_rq->idle) || + rt_task(smt_curr)) + continue; + if (!p->mm) { + wakeup_busy_runqueue(smt_rq); + continue; + } /* - * Reschedule a lower priority task on the SMT sibling, - * or wake it up if it has been put to sleep for priority - * reasons. + * Reschedule a lower priority task on the SMT sibling for + * it to be put to sleep, or wake it up if it has been put to + * sleep for priority reasons to see if it should run now. 
*/ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || - (smt_curr == smt_rq->idle && smt_rq->nr_running)) - resched_task(smt_curr); + if (rt_task(p)) { + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + resched_task(smt_curr); + } else { + if (TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(p, sd) > task_timeslice(smt_curr)) + resched_task(smt_curr); + else + wakeup_busy_runqueue(smt_rq); + } } out_unlock: for_each_cpu_mask(i, sibling_map) @@ -2887,6 +2987,7 @@ switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); prefetch(next); + prefetch_stack(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); @@ -3014,7 +3115,8 @@ need_resched: #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) { task_t *p = curr->private; return try_to_wake_up(p, mode, sync); @@ -3056,7 +3158,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @key: is directly passed to the wakeup function */ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) + int nr_exclusive, void *key) { unsigned long flags; @@ -3088,7 +3190,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. 
*/ -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void fastcall +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; int sync = 1; @@ -3279,7 +3382,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -3378,8 +3482,8 @@ EXPORT_SYMBOL(set_user_nice); */ int can_nice(const task_t *p, const int nice) { - /* convert nice value [19,-20] to rlimit style value [0,39] */ - int nice_rlim = 19 - nice; + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = 20 - nice; return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || capable(CAP_SYS_NICE)); } @@ -3498,7 +3602,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) * @policy: new policy. * @param: structure containing the new RT priority. */ -int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) { int retval; int oldprio, oldpolicy = -1; @@ -3518,7 +3623,7 @@ recheck: * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 
*/ if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) @@ -3581,7 +3686,8 @@ recheck: } EXPORT_SYMBOL_GPL(sched_setscheduler); -static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { int retval; struct sched_param lparam; @@ -3848,7 +3954,7 @@ asmlinkage long sys_sched_yield(void) if (rt_task(current)) target = rq->active; - if (current->array->nr_active == 1) { + if (array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); @@ -3912,7 +4018,7 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t * lock) +int cond_resched_lock(spinlock_t *lock) { int ret = 0; @@ -4095,7 +4201,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) return list_entry(p->sibling.next,struct task_struct,sibling); } -static void show_task(task_t * p) +static void show_task(task_t *p) { task_t *relative; unsigned state; @@ -4121,7 +4227,7 @@ static void show_task(task_t * p) #endif #ifdef CONFIG_DEBUG_STACK_USAGE { - unsigned long * n = (unsigned long *) (p->thread_info+1); + unsigned long *n = (unsigned long *) (p->thread_info+1); while (!*n) n++; free = (unsigned long) n - (unsigned long)(p->thread_info+1); @@ -4330,7 +4436,7 @@ out: * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. 
*/ -static int migration_thread(void * data) +static int migration_thread(void *data) { runqueue_t *rq; int cpu = (long)data; @@ -4779,7 +4885,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void cpu_attach_domain(struct sched_domain *sd, int cpu) +static void cpu_attach_domain(struct sched_domain *sd, int cpu) { runqueue_t *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -4802,7 +4908,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4830,8 +4936,8 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -void init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)) +static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, + int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; @@ -4864,12 +4970,85 @@ void init_sched_build_groups(struct sched_group groups[], last->next = first; } +#define SD_NODES_PER_DOMAIN 16 -#ifdef ARCH_HAS_SCHED_DOMAIN -extern void build_sched_domains(const cpumask_t *cpu_map); -extern void arch_init_sched_domains(const cpumask_t *cpu_map); -extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); -#else +#ifdef CONFIG_NUMA +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. 
+ * + * Should use nodemask_t. + */ +static int find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Start at @node */ + n = (node + i) % MAX_NUMNODES; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static cpumask_t sched_domain_node_span(int node) +{ + int i; + cpumask_t span, nodemask; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + nodemask = node_to_cpumask(node); + cpus_or(span, span, nodemask); + set_bit(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} +#endif + +/* + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we + * can switch it on easily if needed. 
+ */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; @@ -4891,36 +5070,20 @@ static int cpu_to_phys_group(int cpu) } #ifdef CONFIG_NUMA - -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int cpu_to_node_group(int cpu) -{ - return cpu_to_node(cpu); -} -#endif - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) /* - * The domains setup code relies on siblings not spanning - * multiple nodes. Make sure the architecture has a proper - * siblings map: + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. */ -static void check_sibling_maps(void) -{ - int i, j; +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; - for_each_online_cpu(i) { - for_each_cpu_mask(j, cpu_sibling_map[i]) { - if (cpu_to_node(i) != cpu_to_node(j)) { - printk(KERN_INFO "warning: CPU %d siblings map " - "to different node - isolating " - "them.\n", i); - cpu_sibling_map[i] = cpumask_of_cpu(i); - break; - } - } - } +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; + +static int cpu_to_allnodes_group(int cpu) +{ + return cpu_to_node(cpu); } #endif @@ -4928,9 +5091,24 @@ static void check_sibling_maps(void) * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static void build_sched_domains(const cpumask_t *cpu_map) +void build_sched_domains(const cpumask_t *cpu_map) { int i; +#ifdef CONFIG_NUMA + struct sched_group **sched_group_nodes = NULL; + struct sched_group *sched_group_allnodes = NULL; + + /* + * Allocate the per-node list of sched groups + */ + sched_group_nodes = kmalloc(sizeof(struct 
sched_group*)*MAX_NUMNODES, + GFP_ATOMIC); + if (!sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return; + } + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif /* * Set up domains for cpus specified by the cpu_map. @@ -4943,11 +5121,35 @@ static void build_sched_domains(const cpumask_t *cpu_map) cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA + if (cpus_weight(*cpu_map) + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (!sched_group_allnodes) { + sched_group_allnodes + = kmalloc(sizeof(struct sched_group) + * MAX_NUMNODES, + GFP_KERNEL); + if (!sched_group_allnodes) { + printk(KERN_WARNING + "Can not alloc allnodes sched group\n"); + break; + } + sched_group_allnodes_bycpu[i] + = sched_group_allnodes; + } + sd = &per_cpu(allnodes_domains, i); + *sd = SD_ALLNODES_INIT; + sd->span = *cpu_map; + group = cpu_to_allnodes_group(i); + sd->groups = &sched_group_allnodes[group]; + p = sd; + } else + p = NULL; + sd = &per_cpu(node_domains, i); - group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = *cpu_map; - sd->groups = &sched_group_nodes[group]; + sd->span = sched_domain_node_span(cpu_to_node(i)); + sd->parent = p; + cpus_and(sd->span, sd->span, *cpu_map); #endif p = sd; @@ -4972,7 +5174,7 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) @@ -4997,8 +5199,77 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, *cpu_map, - &cpu_to_node_group); + if (sched_group_allnodes) + init_sched_build_groups(sched_group_allnodes, *cpu_map, + &cpu_to_allnodes_group); + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Set up node groups */ 
+ struct sched_group *sg, *prev; + cpumask_t nodemask = node_to_cpumask(i); + cpumask_t domainspan; + cpumask_t covered = CPU_MASK_NONE; + int j; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) { + sched_group_nodes[i] = NULL; + continue; + } + + domainspan = sched_domain_node_span(i); + cpus_and(domainspan, domainspan, *cpu_map); + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sched_group_nodes[i] = sg; + for_each_cpu_mask(j, nodemask) { + struct sched_domain *sd; + sd = &per_cpu(node_domains, j); + sd->groups = sg; + if (sd->groups == NULL) { + /* Turn off balancing if we have no groups */ + sd->flags = 0; + } + } + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", i); + continue; + } + sg->cpu_power = 0; + sg->cpumask = nodemask; + cpus_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < MAX_NUMNODES; j++) { + cpumask_t tmp, notcovered; + int n = (i + j) % MAX_NUMNODES; + + cpus_complement(notcovered, covered); + cpus_and(tmp, notcovered, *cpu_map); + cpus_and(tmp, tmp, domainspan); + if (cpus_empty(tmp)) + break; + + nodemask = node_to_cpumask(n); + cpus_and(tmp, tmp, nodemask); + if (cpus_empty(tmp)) + continue; + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + break; + } + sg->cpu_power = 0; + sg->cpumask = tmp; + cpus_or(covered, covered, tmp); + prev->next = sg; + prev = sg; + } + prev->next = sched_group_nodes[i]; + } #endif /* Calculate CPU power for physical packages and nodes */ @@ -5017,14 +5288,46 @@ static void build_sched_domains(const cpumask_t *cpu_map) sd->groups->cpu_power = power; #ifdef CONFIG_NUMA - if (i == first_cpu(sd->groups->cpumask)) { - /* Only add "power" once for each physical package. 
*/ - sd = &per_cpu(node_domains, i); - sd->groups->cpu_power += power; + sd = &per_cpu(allnodes_domains, i); + if (sd->groups) { + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; } #endif } +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *sg = sched_group_nodes[i]; + int j; + + if (sg == NULL) + continue; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + int power; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + + sg->cpu_power += power; + } + sg = sg->next; + if (sg != sched_group_nodes[i]) + goto next_sg; + } +#endif + /* Attach the domains */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; @@ -5039,13 +5342,10 @@ static void build_sched_domains(const cpumask_t *cpu_map) /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void arch_init_sched_domains(cpumask_t *cpu_map) +static void arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to @@ -5058,10 +5358,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map) static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { - /* Do nothing: everything is statically allocated. 
*/ -} +#ifdef CONFIG_NUMA + int i; + int cpu; -#endif /* ARCH_HAS_SCHED_DOMAIN */ + for_each_cpu_mask(cpu, *cpu_map) { + struct sched_group *sched_group_allnodes + = sched_group_allnodes_bycpu[cpu]; + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (sched_group_allnodes) { + kfree(sched_group_allnodes); + sched_group_allnodes_bycpu[cpu] = NULL; + } + + if (!sched_group_nodes) + continue; + + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +#endif +} /* * Detach sched domains from a group of cpus specified in cpu_map @@ -5263,3 +5600,47 @@ void normalize_rt_tasks(void) } #endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_IA64 +/* + * These functions are only useful for the IA64 MCA handling. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +task_t *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. 
It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, task_t *p) +{ + cpu_curr(cpu) = p; +} + +#endif diff --git a/kernel/signal.c b/kernel/signal.c index ca1186eef938..1bf3c39d6109 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask) return sig; } -static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, +static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; @@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __n } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->lock = NULL; q->user = get_uid(t->user); } return(q); @@ -397,20 +396,8 @@ void __exit_signal(struct task_struct *tsk) flush_sigqueue(&tsk->pending); if (sig) { /* - * We are cleaning up the signal_struct here. We delayed - * calling exit_itimers until after flush_sigqueue, just in - * case our thread-local pending queue contained a queued - * timer signal that would have been cleared in - * exit_itimers. When that called sigqueue_free, it would - * attempt to re-take the tasklist_lock and deadlock. This - * can never happen if we ensure that all queues the - * timer's signal might be queued on have been flushed - * first. The shared_pending queue, and our own pending - * queue are the only queues the timer could be on, since - * there are no other threads left in the group and timer - * signals are constrained to threads inside the group. 
+ * We are cleaning up the signal_struct here. */ - exit_itimers(sig); exit_thread_group_keys(sig); kmem_cache_free(signal_cachep, sig); } @@ -418,6 +405,8 @@ void __exit_signal(struct task_struct *tsk) void exit_signal(struct task_struct *tsk) { + atomic_dec(&tsk->signal->live); + write_lock_irq(&tasklist_lock); __exit_signal(tsk); write_unlock_irq(&tasklist_lock); @@ -578,7 +567,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } if ( signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && @@ -661,8 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; error = -EPERM; - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) @@ -678,7 +667,7 @@ static int check_kill_permission(int sig, struct siginfo *info, /* forward decl */ static void do_notify_parent_cldstop(struct task_struct *tsk, - struct task_struct *parent, + int to_self, int why); /* @@ -692,7 +681,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) { struct task_struct *t; - if (p->flags & SIGNAL_GROUP_EXIT) + if (p->signal->flags & SIGNAL_GROUP_EXIT) /* * The process is in the middle of dying already. 
*/ @@ -729,14 +718,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->group_stop_count = 0; p->signal->flags = SIGNAL_STOP_CONTINUED; spin_unlock(&p->sighand->siglock); - if (p->ptrace & PT_PTRACED) - do_notify_parent_cldstop(p, p->parent, - CLD_STOPPED); - else - do_notify_parent_cldstop( - p->group_leader, - p->group_leader->real_parent, - CLD_STOPPED); + do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); spin_lock(&p->sighand->siglock); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); @@ -777,14 +759,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->flags = SIGNAL_STOP_CONTINUED; p->signal->group_exit_code = 0; spin_unlock(&p->sighand->siglock); - if (p->ptrace & PT_PTRACED) - do_notify_parent_cldstop(p, p->parent, - CLD_CONTINUED); - else - do_notify_parent_cldstop( - p->group_leader, - p->group_leader->real_parent, - CLD_CONTINUED); + do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); spin_lock(&p->sighand->siglock); } else { /* @@ -813,7 +788,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. */ - if ((unsigned long)info == 2) + if (info == SEND_SIG_FORCED) goto out_set; /* Real-time signals must be queued if sent by sigqueue, or @@ -825,19 +800,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, pass on the info struct. 
*/ q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - ((unsigned long) info < 2 || + (is_si_special(info) || info->si_code >= 0))); if (q) { list_add_tail(&q->list, &signals->list); switch ((unsigned long) info) { - case 0: + case (unsigned long) SEND_SIG_NOINFO: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = current->pid; q->info.si_uid = current->uid; break; - case 1: + case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -848,20 +823,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, copy_siginfo(&q->info, info); break; } - } else { - if (sig >= SIGRTMIN && info && (unsigned long)info != 1 - && info->si_code != SI_USER) + } else if (!is_si_special(info)) { + if (sig >= SIGRTMIN && info->si_code != SI_USER) /* * Queue overflow, abort. We may abort if the signal was rt * and sent by user using something other than kill(). */ return -EAGAIN; - if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped - * the signal. - */ - ret = info->si_sys_private; } out_set: @@ -882,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) BUG(); assert_spin_locked(&t->sighand->siglock); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. 
*/ if (sig_ignored(t, sig)) goto out; @@ -917,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) int ret; spin_lock_irqsave(&t->sighand->siglock, flags); - if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { + if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; + } + if (sigismember(&t->blocked, sig)) { sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); } + recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -931,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) void force_sig_specific(int sig, struct task_struct *t) { - unsigned long int flags; - - spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - sigdelset(&t->blocked, sig); - recalc_sigpending_tsk(t); - specific_send_sig_info(sig, (void *)2, t); - spin_unlock_irqrestore(&t->sighand->siglock, flags); + force_sig_info(sig, SEND_SIG_FORCED, t); } /* @@ -950,34 +906,31 @@ force_sig_specific(int sig, struct task_struct *t) * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. 
*/ -#define wants_signal(sig, p, mask) \ - (!sigismember(&(p)->blocked, sig) \ - && !((p)->state & mask) \ - && !((p)->flags & PF_EXITING) \ - && (task_curr(p) || !signal_pending(p))) - +static inline int wants_signal(int sig, struct task_struct *p) +{ + if (sigismember(&p->blocked, sig)) + return 0; + if (p->flags & PF_EXITING) + return 0; + if (sig == SIGKILL) + return 1; + if (p->state & (TASK_STOPPED | TASK_TRACED)) + return 0; + return task_curr(p) || !signal_pending(p); +} static void __group_complete_signal(int sig, struct task_struct *p) { - unsigned int mask; struct task_struct *t; /* - * Don't bother traced and stopped tasks (but - * SIGKILL will punch through that). - */ - mask = TASK_STOPPED | TASK_TRACED; - if (sig == SIGKILL) - mask = 0; - - /* * Now find a thread we can wake up to take the signal off the queue. * * If the main thread wants the signal, it gets first crack. * Probably the least surprising to the average bear. */ - if (wants_signal(sig, p, mask)) + if (wants_signal(sig, p)) t = p; else if (thread_group_empty(p)) /* @@ -995,7 +948,7 @@ __group_complete_signal(int sig, struct task_struct *p) t = p->signal->curr_target = p; BUG_ON(t->tgid != p->tgid); - while (!wants_signal(sig, t, mask)) { + while (!wants_signal(sig, t)) { t = next_thread(t); if (t == p->signal->curr_target) /* @@ -1077,12 +1030,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) assert_spin_locked(&p->sighand->siglock); handle_stop_signal(sig, p); - if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) - /* - * Set up a return to indicate that we dropped the signal. - */ - ret = info->si_sys_private; - /* Short-circuit ignored signals. 
*/ if (sig_ignored(p, sig)) return ret; @@ -1135,8 +1082,8 @@ void zap_other_threads(struct task_struct *p) if (t != p->group_leader) t->exit_signal = -1; + /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); signal_wake_up(t, 1); } } @@ -1209,6 +1156,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } +/* like kill_proc_info(), but doesn't use uid/euid of "current" */ +int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, + uid_t uid, uid_t euid) +{ + int ret = -EINVAL; + struct task_struct *p; + + if (!valid_signal(sig)) + return ret; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) { + ret = -ESRCH; + goto out_unlock; + } + if ((!info || ((unsigned long)info != 1 && + (unsigned long)info != 2 && SI_FROMUSER(info))) + && (euid != p->suid) && (euid != p->uid) + && (uid != p->suid) && (uid != p->uid)) { + ret = -EPERM; + goto out_unlock; + } + if (sig && p->sighand) { + unsigned long flags; + spin_lock_irqsave(&p->sighand->siglock, flags); + ret = __group_send_sig_info(sig, info, p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +out_unlock: + read_unlock(&tasklist_lock); + return ret; +} +EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -1278,10 +1259,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) return ret; } +#define __si_special(priv) \ + ((priv) ? 
SEND_SIG_PRIV : SEND_SIG_NOINFO) + int send_sig(int sig, struct task_struct *p, int priv) { - return send_sig_info(sig, (void*)(long)(priv != 0), p); + return send_sig_info(sig, __si_special(priv), p); } /* @@ -1301,7 +1285,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) void force_sig(int sig, struct task_struct *p) { - force_sig_info(sig, (void*)1L, p); + force_sig_info(sig, SEND_SIG_PRIV, p); } /* @@ -1326,13 +1310,13 @@ force_sigsegv(int sig, struct task_struct *p) int kill_pg(pid_t pgrp, int sig, int priv) { - return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); + return kill_pg_info(sig, __si_special(priv), pgrp); } int kill_proc(pid_t pid, int sig, int priv) { - return kill_proc_info(sig, (void *)(long)(priv != 0), pid); + return kill_proc_info(sig, __si_special(priv), pid); } /* @@ -1363,11 +1347,12 @@ void sigqueue_free(struct sigqueue *q) * pending queue. */ if (unlikely(!list_empty(&q->list))) { - read_lock(&tasklist_lock); - spin_lock_irqsave(q->lock, flags); + spinlock_t *lock = &current->sighand->siglock; + read_lock(&tasklist_lock); + spin_lock_irqsave(lock, flags); if (!list_empty(&q->list)) list_del_init(&q->list); - spin_unlock_irqrestore(q->lock, flags); + spin_unlock_irqrestore(lock, flags); read_unlock(&tasklist_lock); } q->flags &= ~SIGQUEUE_PREALLOC; @@ -1380,16 +1365,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) unsigned long flags; int ret = 0; - /* - * We need the tasklist lock even for the specific - * thread case (when we don't need to follow the group - * lists) in order to avoid races with "p->sighand" - * going away or changing from under us.
- */ BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); + read_lock(&tasklist_lock); + + if (unlikely(p->flags & PF_EXITING)) { + ret = -1; + goto out_err; + } + spin_lock_irqsave(&p->sighand->siglock, flags); - + if (unlikely(!list_empty(&q->list))) { /* * If an SI_TIMER entry is already queue just increment @@ -1399,14 +1384,13 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) BUG(); q->info.si_overrun++; goto out; - } + } /* Short-circuit ignored signals. */ if (sig_ignored(p, sig)) { ret = 1; goto out; } - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->pending.list); sigaddset(&p->pending.signal, sig); if (!sigismember(&p->blocked, sig)) @@ -1414,8 +1398,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) out: spin_unlock_irqrestore(&p->sighand->siglock, flags); +out_err: read_unlock(&tasklist_lock); - return(ret); + + return ret; } int @@ -1452,7 +1438,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) * We always use the shared queue for process-wide signals, * to avoid several races. 
*/ - q->lock = &p->sighand->siglock; list_add_tail(&q->list, &p->signal->shared_pending.list); sigaddset(&p->signal->shared_pending.signal, sig); @@ -1542,14 +1527,20 @@ void do_notify_parent(struct task_struct *tsk, int sig) spin_unlock_irqrestore(&psig->siglock, flags); } -static void -do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, - int why) +static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) { struct siginfo info; unsigned long flags; + struct task_struct *parent; struct sighand_struct *sighand; + if (to_self) + parent = tsk->parent; + else { + tsk = tsk->group_leader; + parent = tsk->real_parent; + } + info.si_signo = SIGCHLD; info.si_errno = 0; info.si_pid = tsk->pid; @@ -1618,8 +1609,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) !(current->ptrace & PT_ATTACHED)) && (likely(current->parent->signal != current->signal) || !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { - do_notify_parent_cldstop(current, current->parent, - CLD_TRAPPED); + do_notify_parent_cldstop(current, 1, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); } else { @@ -1668,25 +1658,25 @@ void ptrace_notify(int exit_code) static void finish_stop(int stop_count) { + int to_self; + /* * If there are no other threads in the group, or if there is * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. 
*/ - if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, current->parent, - CLD_STOPPED); - read_unlock(&tasklist_lock); - } - else if (stop_count == 0) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, - current->group_leader->real_parent, - CLD_STOPPED); - read_unlock(&tasklist_lock); - } + if (stop_count < 0 || (current->ptrace & PT_PTRACED)) + to_self = 1; + else if (stop_count == 0) + to_self = 0; + else + goto out; + + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, to_self, CLD_STOPPED); + read_unlock(&tasklist_lock); +out: schedule(); /* * Now we don't run again until continued. @@ -1773,7 +1763,8 @@ do_signal_stop(int signr) * stop is always done with the siglock held, * so this check has no races. */ - if (t->state < TASK_STOPPED) { + if (!t->exit_state && + !(t->state & (TASK_STOPPED|TASK_TRACED))) { stop_count++; signal_wake_up(t, 0); } @@ -1865,9 +1856,9 @@ relock: /* Let the debugger run. */ ptrace_stop(signr, signr, info); - /* We're back. Did the debugger cancel the sig? */ + /* We're back. Did the debugger cancel the sig or group_exit? 
*/ signr = current->exit_code; - if (signr == 0) + if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) continue; current->exit_code = 0; @@ -2228,8 +2219,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); try_to_freeze(); spin_lock_irq(&current->sighand->siglock); @@ -2270,26 +2260,13 @@ sys_kill(int pid, int sig) return kill_something_info(sig, &info, pid); } -/** - * sys_tgkill - send signal to one specific thread - * @tgid: the thread group ID of the thread - * @pid: the PID of the thread - * @sig: signal to be sent - * - * This syscall also checks the tgid and returns -ESRCH even if the PID - * exists but it's not belonging to the target process anymore. This - * method solves the problem of threads exiting and PIDs getting reused. - */ -asmlinkage long sys_tgkill(int tgid, int pid, int sig) +static int do_tkill(int tgid, int pid, int sig) { - struct siginfo info; int error; + struct siginfo info; struct task_struct *p; - /* This is only valid for single tasks */ - if (pid <= 0 || tgid <= 0) - return -EINVAL; - + error = -ESRCH; info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; @@ -2298,8 +2275,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) read_lock(&tasklist_lock); p = find_task_by_pid(pid); - error = -ESRCH; - if (p && (p->tgid == tgid)) { + if (p && (tgid <= 0 || p->tgid == tgid)) { error = check_kill_permission(sig, &info, p); /* * The null signal is a permissions and process existence @@ -2313,47 +2289,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) } } read_unlock(&tasklist_lock); + return error; } +/** + * sys_tgkill - send signal to one specific thread + * @tgid: the thread group ID of the thread + * @pid: the PID of the thread + * @sig: signal to be sent + * + * This syscall also checks the tgid and returns -ESRCH
even if the PID + * exists but it's not belonging to the target process anymore. This + * method solves the problem of threads exiting and PIDs getting reused. + */ +asmlinkage long sys_tgkill(int tgid, int pid, int sig) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + return do_tkill(tgid, pid, sig); +} + /* * Send a signal to only one task, even if it's a CLONE_THREAD task. */ asmlinkage long sys_tkill(int pid, int sig) { - struct siginfo info; - int error; - struct task_struct *p; - /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = current->tgid; - info.si_uid = current->uid; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - error = -ESRCH; - if (p) { - error = check_kill_permission(sig, &info, p); - /* - * The null signal is a permissions and process existence - * probe. No signal is actually delivered. - */ - if (!error && sig && p->sighand) { - spin_lock_irq(&p->sighand->siglock); - handle_stop_signal(sig, p); - error = specific_send_sig_info(sig, &info, p); - spin_unlock_irq(&p->sighand->siglock); - } - } - read_unlock(&tasklist_lock); - return error; + return do_tkill(0, pid, sig); } asmlinkage long diff --git a/kernel/softirq.c b/kernel/softirq.c index b4ab6af1dea8..f766b2fc48be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void) cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ - local_softirq_pending() = 0; + set_softirq_pending(0); local_irq_enable(); diff --git a/kernel/softlockup.c b/kernel/softlockup.c new file mode 100644 index 000000000000..75976209cea7 --- /dev/null +++ b/kernel/softlockup.c @@ -0,0 +1,151 @@ +/* + * Detect Soft Lockups + * + * started by Ingo Molnar, (C) 2005, Red Hat + * + * this code detects soft lockups: incidents in where on a CPU + * the kernel does not reschedule for 
10 seconds or more. + */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/notifier.h> +#include <linux/module.h> + +static DEFINE_SPINLOCK(print_lock); + +static DEFINE_PER_CPU(unsigned long, timestamp) = 0; +static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; +static DEFINE_PER_CPU(struct task_struct *, watchdog_task); + +static int did_panic = 0; +static int softlock_panic(struct notifier_block *this, unsigned long event, + void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = softlock_panic, +}; + +void touch_softlockup_watchdog(void) +{ + per_cpu(timestamp, raw_smp_processor_id()) = jiffies; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +/* + * This callback runs from the timer interrupt, and checks + * whether the watchdog thread has hung or not: + */ +void softlockup_tick(struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long timestamp = per_cpu(timestamp, this_cpu); + + if (per_cpu(print_timestamp, this_cpu) == timestamp) + return; + + /* Do not cause a second panic when there already was one */ + if (did_panic) + return; + + if (time_after(jiffies, timestamp + 10*HZ)) { + per_cpu(print_timestamp, this_cpu) = timestamp; + + spin_lock(&print_lock); + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", + this_cpu); + show_regs(regs); + spin_unlock(&print_lock); + } +} + +/* + * The watchdog thread - runs every second and touches the timestamp. 
+ */ +static int watchdog(void * __bind_cpu) +{ + struct sched_param param = { .sched_priority = 99 }; + int this_cpu = (long) __bind_cpu; + + printk("softlockup thread %d started up.\n", this_cpu); + + sched_setscheduler(current, SCHED_FIFO, &param); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Run briefly once per second - if this gets delayed for + * more than 10 seconds then the debug-printout triggers + * in softlockup_tick(): + */ + while (!kthread_should_stop()) { + msleep_interruptible(1000); + touch_softlockup_watchdog(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __devinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(per_cpu(watchdog_task, hotcpu)); + p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); + if (IS_ERR(p)) { + printk("watchdog for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(watchdog_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(watchdog_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind so it can run. Fall thru.
*/ + kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + p = per_cpu(watchdog_task, hotcpu); + per_cpu(watchdog_task, hotcpu) = NULL; + kthread_stop(p); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init void spawn_softlockup_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + notifier_chain_register(&panic_notifier_list, &panic_block); +} + diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0c3f9d8bbe17..0375fcd5921d 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -3,7 +3,10 @@ * * Author: Zwane Mwaikambo <zwane@fsmlabs.com> * - * Copyright (2004) Ingo Molnar + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. 
(UP-nondebug inlines them) */ #include <linux/config.h> @@ -17,12 +20,12 @@ * Generic declaration of the raw read_trylock() function, * architectures are supposed to optimize this: */ -int __lockfunc generic_raw_read_trylock(rwlock_t *lock) +int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) { - _raw_read_lock(lock); + __raw_read_lock(lock); return 1; } -EXPORT_SYMBOL(generic_raw_read_trylock); +EXPORT_SYMBOL(generic__raw_read_trylock); int __lockfunc _spin_trylock(spinlock_t *lock) { @@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock) } EXPORT_SYMBOL(_write_trylock); -#ifndef CONFIG_PREEMPT +#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) void __lockfunc _read_lock(rwlock_t *lock) { @@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) local_irq_save(flags); preempt_disable(); - _raw_spin_lock_flags(lock, flags); + _raw_spin_lock_flags(lock, &flags); return flags; } EXPORT_SYMBOL(_spin_lock_irqsave); diff --git a/kernel/sys.c b/kernel/sys.c index 000e81ad2c1d..2fa1ed18123c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -361,17 +361,35 @@ out_unlock: return retval; } +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. + */ void emergency_restart(void) { machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); -void kernel_restart(char *cmd) +/** + * kernel_restart - reboot the system + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. 
+ */ +void kernel_restart_prepare(char *cmd) { notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; device_shutdown(); +} +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); if (!cmd) { printk(KERN_EMERG "Restarting system.\n"); } else { @@ -382,6 +400,12 @@ void kernel_restart(char *cmd) } EXPORT_SYMBOL_GPL(kernel_restart); +/** + * kernel_kexec - reboot the system + * + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ void kernel_kexec(void) { #ifdef CONFIG_KEXEC @@ -390,9 +414,7 @@ void kernel_kexec(void) if (!image) { return; } - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_shutdown(); + kernel_restart_prepare(NULL); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); machine_kexec(image); @@ -400,23 +422,39 @@ void kernel_kexec(void) } EXPORT_SYMBOL_GPL(kernel_kexec); -void kernel_halt(void) +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt_prepare(void) { notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); system_state = SYSTEM_HALT; - device_suspend(PMSG_SUSPEND); device_shutdown(); +} +void kernel_halt(void) +{ + kernel_halt_prepare(); printk(KERN_EMERG "System halted.\n"); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); -void kernel_power_off(void) +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. 
+ */ +void kernel_power_off_prepare(void) { notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); system_state = SYSTEM_POWER_OFF; - device_suspend(PMSG_SUSPEND); device_shutdown(); +} +void kernel_power_off(void) +{ + kernel_power_off_prepare(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); } @@ -1713,7 +1751,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { long error; - int sig; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error) @@ -1721,19 +1758,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, switch (option) { case PR_SET_PDEATHSIG: - sig = arg2; - if (!valid_signal(sig)) { + if (!valid_signal(arg2)) { error = -EINVAL; break; } - current->pdeath_signal = sig; + current->pdeath_signal = arg2; break; case PR_GET_PDEATHSIG: error = put_user(current->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - if (current->mm->dumpable) - error = 1; + error = current->mm->dumpable; break; case PR_SET_DUMPABLE: if (arg2 < 0 || arg2 > 2) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e0bbee549ea..8e56e2495542 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/net.h> #include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> @@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -#ifdef CONFIG_NET -extern ctl_table net_table[]; -#endif static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; diff --git a/kernel/time.c b/kernel/time.c index dd5ae1162a8f..245d595a13cb 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) if (mtemp >= MINSEC) { ltemp = (time_offset / mtemp) << (SHIFT_USEC - SHIFT_UPDATE); - if 
(ltemp < 0) - time_freq -= -ltemp >> SHIFT_KH; - else - time_freq += ltemp >> SHIFT_KH; + time_freq += shift_right(ltemp, SHIFT_KH); } else /* calibration interval too short (p. 12) */ result = TIME_ERROR; } else { /* PLL mode */ if (mtemp < MAXSEC) { ltemp *= mtemp; - if (ltemp < 0) - time_freq -= -ltemp >> (time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC); - else - time_freq += ltemp >> (time_constant + + time_freq += shift_right(ltemp,(time_constant + time_constant + - SHIFT_KF - SHIFT_USEC); + SHIFT_KF - SHIFT_USEC)); } else /* calibration interval too long (p. 12) */ result = TIME_ERROR; } - if (time_freq > time_tolerance) - time_freq = time_tolerance; - else if (time_freq < -time_tolerance) - time_freq = -time_tolerance; + time_freq = min(time_freq, time_tolerance); + time_freq = max(time_freq, -time_tolerance); } /* STA_PLL || STA_PPSTIME */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) { @@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else { - if (time_offset < 0) - txc->offset = -(-time_offset >> SHIFT_UPDATE); - else - txc->offset = time_offset >> SHIFT_UPDATE; + txc->offset = shift_right(time_offset, SHIFT_UPDATE); } txc->freq = time_freq + pps_freq; txc->maxerror = time_maxerror; @@ -532,6 +519,7 @@ int do_settimeofday (struct timespec *tv) clock_was_set(); return 0; } +EXPORT_SYMBOL(do_settimeofday); void do_gettimeofday (struct timeval *tv) { @@ -570,6 +558,7 @@ void getnstimeofday(struct timespec *tv) tv->tv_sec = x.tv_sec; tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; } +EXPORT_SYMBOL_GPL(getnstimeofday); #endif #if (BITS_PER_LONG < 64) diff --git a/kernel/timer.c b/kernel/timer.c index f2a11887a726..fd74268d8663 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); #define time_interpolator_update(x) #endif +u64 jiffies_64 
__cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + /* * per-CPU timer vector definitions: */ @@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base, #endif } -static void check_timer_failed(struct timer_list *timer) -{ - static int whine_count; - if (whine_count < 16) { - whine_count++; - printk("Uninitialised timer!\n"); - printk("This is just a warning. Your computer is OK\n"); - printk("function=0x%p, data=0x%lx\n", - timer->function, timer->data); - dump_stack(); - } - /* - * Now fix it up - */ - timer->magic = TIMER_MAGIC; -} - -static inline void check_timer(struct timer_list *timer) -{ - if (timer->magic != TIMER_MAGIC) - check_timer_failed(timer); -} - - static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; - timer->magic = TIMER_MAGIC; } EXPORT_SYMBOL(init_timer); @@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) int ret = 0; BUG_ON(!timer->function); - check_timer(timer); base = lock_timer_base(timer, &flags); @@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu) unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); - - check_timer(timer); - spin_lock_irqsave(&base->t_base.lock, flags); timer->base = &base->t_base; internal_add_timer(base, timer); @@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); - check_timer(timer); - /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; - check_timer(timer); - if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -412,8 +383,6 @@ out: */ 
int del_timer_sync(struct timer_list *timer) { - check_timer(timer); - for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -632,134 +601,118 @@ long time_next_adjust; */ static void second_overflow(void) { - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if ( time_maxerror > NTP_PHASE_LIMIT ) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at - * the end of the day, the system clock is set back one - * second; if in leap-delete state, the system clock is - * set ahead one second. The microtime() routine or - * external clock driver will insure that reported time - * is always monotonic. The ugly divides should be - * replaced. - */ - switch (time_state) { - - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - /* The timer interpolator will make time change gradually instead - * of an immediate jump by one second. - */ - time_interpolator_update(-NSEC_PER_SEC); - time_state = TIME_OOP; - clock_was_set(); - printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); + long ltemp; + + /* Bump the maxerror field */ + time_maxerror += time_tolerance >> SHIFT_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; } - break; - - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - /* Use of time interpolator for a gradual change of time */ - time_interpolator_update(NSEC_PER_SEC); - time_state = TIME_WAIT; - clock_was_set(); - printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); + + /* + * Leap second processing. 
If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. The microtime() + * routine or external clock driver will insure that reported time is + * always monotonic. The ugly divides should be replaced. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (xtime.tv_sec % 86400 == 0) { + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + /* + * The timer interpolator will make time change + * gradually instead of an immediate jump by one second + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; + clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if ((xtime.tv_sec + 1) % 86400 == 0) { + xtime.tv_sec++; + wall_to_monotonic.tv_sec--; + /* + * Use of time interpolator for a gradual change of + * time + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; + clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; } - break; - - case TIME_OOP: - time_state = TIME_WAIT; - break; - - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In - * PLL mode, the offset is reduced by a fixed factor - * times the time constant. In FLL mode the offset is - * used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread - * the adjustment over not more than the number of - * seconds between updates. 
- */ - if (time_offset < 0) { - ltemp = -time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset += ltemp; - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } else { + + /* + * Compute the phase adjustment for the next second. In PLL mode, the + * offset is reduced by a fixed factor times the time constant. In FLL + * mode the offset is used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread the adjustment + * over not more than the number of seconds between updates. + */ ltemp = time_offset; if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + ltemp = shift_right(ltemp, SHIFT_KG + time_constant); + ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); + ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } - - /* - * Compute the frequency estimate and additional phase - * adjustment due to frequency error for the next - * second. When the PPS signal is engaged, gnaw on the - * watchdog counter and update the frequency computed by - * the pll and the PPS signal. - */ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; - if (ltemp < 0) - time_adj -= -ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - else - time_adj += ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + + /* + * Compute the frequency estimate and additional phase adjustment due + * to frequency error for the next second. 
When the PPS signal is + * engaged, gnaw on the watchdog counter and update the frequency + * computed by the pll and the PPS signal. + */ + pps_valid++; + if (pps_valid == PPS_VALID) { /* PPS signal lost */ + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; + time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 - /* Compensate for (HZ==100) != (1 << SHIFT_HZ). - * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 2) + (-time_adj >> 5); - else - time_adj += (time_adj >> 2) + (time_adj >> 5); + /* + * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to + * get 128.125; => only 0.125% error (p. 14) + */ + time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); +#endif +#if HZ == 250 + /* + * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif #if HZ == 1000 - /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). - * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 6) + (-time_adj >> 7); - else - time_adj += (time_adj >> 6) + (time_adj >> 7); + /* + * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and + * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) + */ + time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); #endif } @@ -768,23 +721,20 @@ static void update_wall_time_one_tick(void) { long time_adjust_step, delta_nsec; - if ( (time_adjust_step = time_adjust) != 0 ) { - /* We are doing an adjtime thing. - * - * Prepare time_adjust_step to be within bounds. - * Note that a positive time_adjust means we want the clock - * to run faster. 
- * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - if (time_adjust > tickadj) - time_adjust_step = tickadj; - else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; - - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; + if ((time_adjust_step = time_adjust) != 0 ) { + /* + * We are doing an adjtime thing. Prepare time_adjust_step to + * be within bounds. Note that a positive time_adjust means we + * want the clock to run faster. + * + * Limit the amount of the step to be in the range + * -tickadj .. +tickadj + */ + time_adjust_step = min(time_adjust_step, (long)tickadj); + time_adjust_step = max(time_adjust_step, (long)-tickadj); + + /* Reduce by this step the amount of time left */ + time_adjust -= time_adjust_step; } delta_nsec = tick_nsec + time_adjust_step * 1000; /* @@ -792,13 +742,8 @@ static void update_wall_time_one_tick(void) * advance the tick more. */ time_phase += time_adj; - if (time_phase <= -FINENSEC) { - long ltemp = -time_phase >> (SHIFT_SCALE - 10); - time_phase += ltemp << (SHIFT_SCALE - 10); - delta_nsec -= ltemp; - } - else if (time_phase >= FINENSEC) { - long ltemp = time_phase >> (SHIFT_SCALE - 10); + if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { + long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); time_phase -= ltemp << (SHIFT_SCALE - 10); delta_nsec += ltemp; } @@ -950,6 +895,7 @@ void do_timer(struct pt_regs *regs) { jiffies_64++; update_times(); + softlockup_tick(regs); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1023,7 +969,7 @@ asmlinkage long sys_getppid(void) parent = me->group_leader->real_parent; for (;;) { pid = parent->tgid; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) { struct task_struct *old = parent; @@ -1127,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - 
__builtin_return_address(0)); + "value %lx from %p\n", timeout, + __builtin_return_address(0)); current->state = TASK_RUNNING; goto out; } @@ -1136,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; - - add_timer(&timer); + setup_timer(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire); schedule(); del_singleshot_timer_sync(&timer); @@ -1150,9 +1092,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout) out: return timeout < 0 ? 0 : timeout; } - EXPORT_SYMBOL(schedule_timeout); +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { @@ -1169,8 +1128,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) if (!time_after(expire, now)) return 0; - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire - now); + expire = schedule_timeout_interruptible(expire - now); ret = 0; if (expire) { @@ -1198,8 +1156,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us return -EINVAL; expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + expire = schedule_timeout_interruptible(expire); ret = 0; if (expire) { @@ -1428,7 +1385,7 @@ static inline u64 
time_interpolator_get_cycles(unsigned int src) } } -static inline u64 time_interpolator_get_counter(void) +static inline u64 time_interpolator_get_counter(int writelock) { unsigned int src = time_interpolator->source; @@ -1442,6 +1399,15 @@ static inline u64 time_interpolator_get_counter(void) now = time_interpolator_get_cycles(src); if (lcycle && time_after(lcycle, now)) return lcycle; + + /* When holding the xtime write lock, there's no need + * to add the overhead of the cmpxchg. Readers are + * force to retry until the write lock is released. + */ + if (writelock) { + time_interpolator->last_cycle = now; + return now; + } /* Keep track of the last timer value returned. The use of cmpxchg here * will cause contention in an SMP environment. */ @@ -1455,7 +1421,7 @@ static inline u64 time_interpolator_get_counter(void) void time_interpolator_reset(void) { time_interpolator->offset = 0; - time_interpolator->last_counter = time_interpolator_get_counter(); + time_interpolator->last_counter = time_interpolator_get_counter(1); } #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) @@ -1467,7 +1433,7 @@ unsigned long time_interpolator_get_offset(void) return 0; return time_interpolator->offset + - GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); + GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); } #define INTERPOLATOR_ADJUST 65536 @@ -1482,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec) if (!time_interpolator) return; - /* The interpolator compensates for late ticks by accumulating - * the late time in time_interpolator->offset. A tick earlier than - * expected will lead to a reset of the offset and a corresponding - * jump of the clock forward. Again this only works if the - * interpolator clock is running slightly slower than the regular clock - * and the tuning logic insures that. 
- */ + /* + * The interpolator compensates for late ticks by accumulating the late + * time in time_interpolator->offset. A tick earlier than expected will + * lead to a reset of the offset and a corresponding jump of the clock + * forward. Again this only works if the interpolator clock is running + * slightly slower than the regular clock and the tuning logic insures + * that. + */ - counter = time_interpolator_get_counter(); - offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); + counter = time_interpolator_get_counter(1); + offset = time_interpolator->offset + + GET_TI_NSECS(counter, time_interpolator); if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) time_interpolator->offset = offset - delta_nsec; @@ -1588,10 +1556,8 @@ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; - while (timeout) { - set_current_state(TASK_UNINTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); @@ -1604,10 +1570,8 @@ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; - while (timeout && !signal_pending(current)) { - set_current_state(TASK_INTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 259cf55da3c9..7cee222231bc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -12,6 +12,8 @@ * Andrew Morton <andrewm@uow.edu.au> * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> + * + * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>. 
*/ #include <linux/module.h> @@ -57,7 +59,7 @@ struct cpu_workqueue_struct { * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct cpu_wq[NR_CPUS]; + struct cpu_workqueue_struct *cpu_wq; const char *name; struct list_head list; /* Empty if single thread */ }; @@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) if (unlikely(is_single_threaded(wq))) cpu = 0; BUG_ON(!list_empty(&work->entry)); - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); ret = 1; } put_cpu(); @@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data) if (unlikely(is_single_threaded(wq))) cpu = 0; - __queue_work(wq->cpu_wq + cpu, work); + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } int fastcall queue_delayed_work(struct workqueue_struct *wq, @@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) if (is_single_threaded(wq)) { /* Always use cpu 0's area. */ - flush_cpu_workqueue(wq->cpu_wq + 0); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0)); } else { int cpu; lock_cpu_hotplug(); for_each_online_cpu(cpu) - flush_cpu_workqueue(wq->cpu_wq + cpu); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); unlock_cpu_hotplug(); } } @@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; spin_lock_init(&cwq->lock); @@ -308,13 +310,11 @@ struct workqueue_struct *__create_workqueue(const char *name, struct workqueue_struct *wq; struct task_struct *p; - BUG_ON(strlen(name) > 10); - - wq = kmalloc(sizeof(*wq), GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; - memset(wq, 0, sizeof(*wq)); + wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); wq->name = name; /* We 
don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); @@ -356,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) unsigned long flags; struct task_struct *p; - cwq = wq->cpu_wq + cpu; + cwq = per_cpu_ptr(wq->cpu_wq, cpu); spin_lock_irqsave(&cwq->lock, flags); p = cwq->thread; cwq->thread = NULL; @@ -383,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock(&workqueue_lock); } unlock_cpu_hotplug(); + free_percpu(wq->cpu_wq); kfree(wq); } @@ -461,7 +462,7 @@ int current_is_keventd(void) BUG_ON(!keventd_wq); - cwq = keventd_wq->cpu_wq + cpu; + cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); if (current == cwq->thread) ret = 1; @@ -473,7 +474,7 @@ int current_is_keventd(void) /* Take the work from this (downed) CPU. */ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { - struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; + struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); LIST_HEAD(list); struct work_struct *work; @@ -484,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) printk("Taking work for %s\n", wq->name); work = list_entry(list.next,struct work_struct,entry); list_del(&work->entry); - __queue_work(wq->cpu_wq + smp_processor_id(), work); + __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); } spin_unlock_irq(&cwq->lock); } @@ -501,7 +502,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { - if (create_workqueue_thread(wq, hotcpu) < 0) { + if (!create_workqueue_thread(wq, hotcpu)) { printk("workqueue for %i failed\n", hotcpu); return NOTIFY_BAD; } @@ -511,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: /* Kick off worker threads. 
*/ list_for_each_entry(wq, &workqueues, list) { - kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); - wake_up_process(wq->cpu_wq[hotcpu].thread); + struct cpu_workqueue_struct *cwq; + + cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); + kthread_bind(cwq->thread, hotcpu); + wake_up_process(cwq->thread); } break; case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { /* Unbind so it can run. */ - kthread_bind(wq->cpu_wq[hotcpu].thread, + kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, smp_processor_id()); cleanup_workqueue_thread(wq, hotcpu); } |
