| author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-04-12 02:21:00 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-04-12 02:21:00 -0700 |
| commit | 0d61fc5ea78015def4d2fcf9b598ecfe25210cdd | |
| tree | a45aa0f67c6bbd200dc2a328e560042810307b1b | /kernel |
| parent | f2eb250f07ba4695c2474cb8b6edbf64b5457d65 | |
| parent | eb880e5457f8b4a61ff7fd36d47dd14fe51cb030 | |
Merge bk://bk.arm.linux.org.uk/linux-2.6-rmk
into ppc970.osdl.org:/home/torvalds/v2.6/linux
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/Makefile | 2 |
| -rw-r--r-- | kernel/acct.c | 8 |
| -rw-r--r-- | kernel/audit.c | 825 |
| -rw-r--r-- | kernel/auditsc.c | 922 |
| -rw-r--r-- | kernel/exit.c | 24 |
| -rw-r--r-- | kernel/fork.c | 38 |
| -rw-r--r-- | kernel/kmod.c | 2 |
| -rw-r--r-- | kernel/module.c | 4 |
| -rw-r--r-- | kernel/params.c | 7 |
| -rw-r--r-- | kernel/pid.c | 8 |
| -rw-r--r-- | kernel/posix-timers.c | 97 |
| -rw-r--r-- | kernel/power/Kconfig | 8 |
| -rw-r--r-- | kernel/power/disk.c | 8 |
| -rw-r--r-- | kernel/power/main.c | 7 |
| -rw-r--r-- | kernel/power/pmdisk.c | 5 |
| -rw-r--r-- | kernel/power/process.c | 21 |
| -rw-r--r-- | kernel/power/swsusp.c | 282 |
| -rw-r--r-- | kernel/printk.c | 3 |
| -rw-r--r-- | kernel/sched.c | 44 |
| -rw-r--r-- | kernel/signal.c | 8 |
| -rw-r--r-- | kernel/softirq.c | 70 |
| -rw-r--r-- | kernel/stop_machine.c | 4 |
| -rw-r--r-- | kernel/sys.c | 40 |
| -rw-r--r-- | kernel/sysctl.c | 28 |
| -rw-r--r-- | kernel/time.c | 7 |
| -rw-r--r-- | kernel/timer.c | 4 |
26 files changed, 2232 insertions, 244 deletions
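For orientation before the patch body: the new kernel/audit.c exposes a netlink interface, and a user-space daemon claims the audit stream by sending AUDIT_SET with the AUDIT_STATUS_PID mask bit (see the audit_receive_msg() handling below). Here is a minimal user-space sketch of that registration. The struct fields and message semantics are taken from the patch; the numeric values of NETLINK_AUDIT, AUDIT_SET, and AUDIT_STATUS_PID are assumptions (they live in this patch's linux/audit.h, which is not part of the hunks shown), and the kernel rejects AUDIT_SET without CAP_SYS_ADMIN, so this must run as root.

```c
/*
 * Sketch only: register the calling process as the audit daemon via
 * AUDIT_SET, mirroring audit_receive_msg() in kernel/audit.c below.
 * NETLINK_AUDIT, AUDIT_SET, and AUDIT_STATUS_PID values are assumed;
 * the audit_status fields match those used by the AUDIT_GET/SET code.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#define NETLINK_AUDIT     9       /* assumed protocol number */
#define AUDIT_SET         1001    /* assumed nlmsg_type */
#define AUDIT_STATUS_PID  0x0004  /* assumed mask bit */

struct audit_status {             /* fields as used by AUDIT_GET/AUDIT_SET */
	unsigned int mask, enabled, failure, pid;
	unsigned int rate_limit, backlog_limit, lost, backlog;
};

int main(void)
{
	struct sockaddr_nl kern = { .nl_family = AF_NETLINK }; /* nl_pid 0 = kernel */
	struct {
		struct nlmsghdr nlh;
		struct audit_status status;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.status));
	req.nlh.nlmsg_type  = AUDIT_SET;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.status.mask     = AUDIT_STATUS_PID;
	req.status.pid      = getpid(); /* kernel unicasts audit records here */

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&kern, sizeof(kern)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}
```

Once audit_pid is set this way, audit_log_drain() below unicasts records to that pid instead of falling back to printk; if the daemon dies (netlink_unicast returns -ECONNREFUSED), the kernel resets audit_pid to 0 and reverts to printk.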
diff --git a/kernel/Makefile b/kernel/Makefile index 3a6484838748..238c65f60d9e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,8 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index 9dbab88b2d31..555e1e3c349f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -347,7 +347,11 @@ static void do_acct_process(long exitcode, struct file *file) /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; - ac.ac_tty = current->tty ? old_encode_dev(tty_devnum(current->tty)) : 0; + + read_lock(&tasklist_lock); /* pin current->signal */ + ac.ac_tty = current->signal->tty ? + old_encode_dev(tty_devnum(current->signal->tty)) : 0; + read_unlock(&tasklist_lock); ac.ac_flag = 0; if (current->flags & PF_FORKNOEXEC) @@ -376,7 +380,7 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_minflt = encode_comp_t(current->min_flt); ac.ac_majflt = encode_comp_t(current->maj_flt); - ac.ac_swaps = encode_comp_t(current->nswap); + ac.ac_swaps = encode_comp_t(0); ac.ac_exitcode = exitcode; /* diff --git a/kernel/audit.c b/kernel/audit.c new file mode 100644 index 000000000000..765822b03b91 --- /dev/null +++ b/kernel/audit.c @@ -0,0 +1,825 @@ +/* audit.c -- Auditing support -*- linux-c -*- + * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. + * System-call specific features have moved to auditsc.c + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Written by Rickard E. (Rik) Faith <faith@redhat.com> + * + * Goals: 1) Integrate fully with SELinux. + * 2) Minimal run-time overhead: + * a) Minimal when syscall auditing is disabled (audit_enable=0). + * b) Small when syscall auditing is enabled and no audit record + * is generated (defer as much work as possible to record + * generation time): + * i) context is allocated, + * ii) names from getname are stored without a copy, and + * iii) inode information stored from path_lookup. + * 3) Ability to disable syscall auditing at boot time (audit=0). + * 4) Usable by other parts of the kernel (if audit_log* is called, + * then a syscall record will be generated automatically for the + * current syscall). + * 5) Netlink interface to user-space. + * 6) Support low-overhead kernel-based filtering to minimize the + * information that must be passed to user-space. 
+ * + * Example user-space utilities: http://people.redhat.com/faith/audit/ + */ + +#include <linux/init.h> +#include <asm/atomic.h> +#include <asm/types.h> +#include <linux/mm.h> +#include <linux/module.h> + +#include <linux/audit.h> + +#include <net/sock.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> + +/* No auditing will take place until audit_initialized != 0. + * (Initialization happens after skb_init is called.) */ +static int audit_initialized; + +/* No syscall auditing will take place unless audit_enabled != 0. */ +int audit_enabled; + +/* Default state when kernel boots without any parameters. */ +static int audit_default; + +/* If auditing cannot proceed, audit_failure selects what happens. */ +static int audit_failure = AUDIT_FAIL_PRINTK; + +/* If audit records are to be written to the netlink socket, audit_pid + * contains the (non-zero) pid. */ +static int audit_pid; + +/* If audit_limit is non-zero, limit the rate of sending audit records + * to that number per second. This prevents DoS attacks, but results in + * audit records being dropped. */ +static int audit_rate_limit; + +/* Number of outstanding audit_buffers allowed. */ +static int audit_backlog_limit = 64; +static atomic_t audit_backlog = ATOMIC_INIT(0); + +/* Records can be lost in several ways: + 0) [suppressed in audit_alloc] + 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] + 2) out of memory in audit_log_move [alloc_skb] + 3) suppressed due to audit_rate_limit + 4) suppressed due to audit_backlog_limit +*/ +static atomic_t audit_lost = ATOMIC_INIT(0); + +/* The netlink socket. */ +static struct sock *audit_sock; + +/* There are two lists of audit buffers. The txlist contains audit + * buffers that cannot be sent immediately to the netlink device because + * we are in an irq context (these are sent later in a tasklet). + * + * The second list is a list of pre-allocated audit buffers (if more + * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of + * being placed on the freelist). */ +static spinlock_t audit_txlist_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t audit_freelist_lock = SPIN_LOCK_UNLOCKED; +static int audit_freelist_count = 0; +static LIST_HEAD(audit_txlist); +static LIST_HEAD(audit_freelist); + +/* There are three lists of rules -- one to search at task creation + * time, one to search at syscall entry time, and another to search at + * syscall exit time. */ +static LIST_HEAD(audit_tsklist); +static LIST_HEAD(audit_entlist); +static LIST_HEAD(audit_extlist); + +/* The netlink socket is only to be read by 1 CPU, which lets us assume + * that list additions and deletions never happen simultaneiously in + * auditsc.c */ +static DECLARE_MUTEX(audit_netlink_sem); + +/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting + * audit records. Since printk uses a 1024 byte buffer, this buffer + * should be at least that large. */ +#define AUDIT_BUFSIZ 1024 + +/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the + * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ +#define AUDIT_MAXFREE (2*NR_CPUS) + +/* The audit_buffer is used when formatting an audit record. The caller + * locks briefly to get the record off the freelist or to allocate the + * buffer, and locks briefly to send the buffer to the netlink layer or + * to place it on a transmit queue. Multiple audit_buffers can be in + * use simultaneously. 
*/ +struct audit_buffer { + struct list_head list; + struct sk_buff_head sklist; /* formatted skbs ready to send */ + struct audit_context *ctx; /* NULL or associated context */ + int len; /* used area of tmp */ + char tmp[AUDIT_BUFSIZ]; + + /* Pointer to header and contents */ + struct nlmsghdr *nlh; + int total; + int type; + int pid; + int count; /* Times requeued */ +}; + +struct audit_entry { + struct list_head list; + struct audit_rule rule; +}; + +static void audit_panic(const char *message) +{ + switch (audit_failure) + { + case AUDIT_FAIL_SILENT: + break; + case AUDIT_FAIL_PRINTK: + printk(KERN_ERR "audit: %s\n", message); + break; + case AUDIT_FAIL_PANIC: + panic(message); + break; + } +} + +static inline int audit_rate_check(void) +{ + static unsigned long last_check = 0; + static int messages = 0; + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + unsigned long now; + unsigned long elapsed; + int retval = 0; + + if (!audit_rate_limit) return 1; + + spin_lock_irqsave(&lock, flags); + if (++messages < audit_rate_limit) { + retval = 1; + } else { + now = jiffies; + elapsed = now - last_check; + if (elapsed > HZ) { + last_check = now; + messages = 0; + retval = 1; + } + } + spin_unlock_irqrestore(&lock, flags); + + return retval; +} + +/* Emit at least 1 message per second, even if audit_rate_check is + * throttling. */ +void audit_log_lost(const char *message) +{ + static unsigned long last_msg = 0; + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + unsigned long now; + int print; + + atomic_inc(&audit_lost); + + print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); + + if (!print) { + spin_lock_irqsave(&lock, flags); + now = jiffies; + if (now - last_msg > HZ) { + print = 1; + last_msg = now; + } + spin_unlock_irqrestore(&lock, flags); + } + + if (print) { + printk(KERN_WARNING + "audit: audit_lost=%d audit_backlog=%d" + " audit_rate_limit=%d audit_backlog_limit=%d\n", + atomic_read(&audit_lost), + atomic_read(&audit_backlog), + audit_rate_limit, + audit_backlog_limit); + audit_panic(message); + } + +} + +int audit_set_rate_limit(int limit) +{ + int old = audit_rate_limit; + audit_rate_limit = limit; + audit_log(current->audit_context, "audit_rate_limit=%d old=%d", + audit_rate_limit, old); + return old; +} + +int audit_set_backlog_limit(int limit) +{ + int old = audit_backlog_limit; + audit_backlog_limit = limit; + audit_log(current->audit_context, "audit_backlog_limit=%d old=%d", + audit_backlog_limit, old); + return old; +} + +int audit_set_enabled(int state) +{ + int old = audit_enabled; + if (state != 0 && state != 1) + return -EINVAL; + audit_enabled = state; + audit_log(current->audit_context, "audit_enabled=%d old=%d", + audit_enabled, old); + return old; +} + +int audit_set_failure(int state) +{ + int old = audit_failure; + if (state != AUDIT_FAIL_SILENT + && state != AUDIT_FAIL_PRINTK + && state != AUDIT_FAIL_PANIC) + return -EINVAL; + audit_failure = state; + audit_log(current->audit_context, "audit_failure=%d old=%d", + audit_failure, old); + return old; +} + +#ifdef CONFIG_NET +void audit_send_reply(int pid, int seq, int type, int done, int multi, + void *payload, int size) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len = NLMSG_SPACE(size); + void *data; + int flags = multi ? NLM_F_MULTI : 0; + int t = done ? 
NLMSG_DONE : type; + + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + goto nlmsg_failure; + + nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh)); + nlh->nlmsg_flags = flags; + data = NLMSG_DATA(nlh); + memcpy(data, payload, size); + netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT); + return; + +nlmsg_failure: /* Used by NLMSG_PUT */ + if (skb) + kfree_skb(skb); +} + +static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + u32 uid, pid, seq; + void *data; + struct audit_status *status_get, status_set; + struct audit_login *login; + int err = 0; + struct audit_buffer *ab; + + pid = NETLINK_CREDS(skb)->pid; + uid = NETLINK_CREDS(skb)->uid; + seq = nlh->nlmsg_seq; + data = NLMSG_DATA(nlh); + + switch (nlh->nlmsg_type) { + case AUDIT_GET: + status_set.enabled = audit_enabled; + status_set.failure = audit_failure; + status_set.pid = audit_pid; + status_set.rate_limit = audit_rate_limit; + status_set.backlog_limit = audit_backlog_limit; + status_set.lost = atomic_read(&audit_lost); + status_set.backlog = atomic_read(&audit_backlog); + audit_send_reply(pid, seq, AUDIT_GET, 0, 0, + &status_set, sizeof(status_set)); + break; + case AUDIT_SET: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + status_get = (struct audit_status *)data; + if (status_get->mask & AUDIT_STATUS_ENABLED) { + err = audit_set_enabled(status_get->enabled); + if (err < 0) return err; + } + if (status_get->mask & AUDIT_STATUS_FAILURE) { + err = audit_set_failure(status_get->failure); + if (err < 0) return err; + } + if (status_get->mask & AUDIT_STATUS_PID) { + int old = audit_pid; + audit_pid = status_get->pid; + audit_log(current->audit_context, + "audit_pid=%d old=%d", audit_pid, old); + } + if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) + audit_set_rate_limit(status_get->rate_limit); + if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) + audit_set_backlog_limit(status_get->backlog_limit); + break; + case AUDIT_USER: + ab = audit_log_start(NULL); + if (!ab) + break; /* audit_panic has been called */ + audit_log_format(ab, + "user pid=%d uid=%d length=%d msg='%.1024s'", + pid, uid, + (int)(nlh->nlmsg_len + - ((char *)data - (char *)nlh)), + (char *)data); + ab->type = AUDIT_USER; + ab->pid = pid; + audit_log_end(ab); + break; + case AUDIT_LOGIN: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + login = (struct audit_login *)data; + ab = audit_log_start(NULL); + if (ab) { + audit_log_format(ab, "login pid=%d uid=%d loginuid=%d" + " length=%d msg='%.1024s'", + pid, uid, + login->loginuid, + login->msglen, + login->msg); + ab->type = AUDIT_LOGIN; + ab->pid = pid; + audit_log_end(ab); + } +#ifdef CONFIG_AUDITSYSCALL + err = audit_set_loginuid(current->audit_context, + login->loginuid); +#endif + break; + case AUDIT_LIST: + case AUDIT_ADD: + case AUDIT_DEL: +#ifdef CONFIG_AUDITSYSCALL + err = audit_receive_filter(nlh->nlmsg_type, pid, uid, seq, + data); +#else + err = -EOPNOTSUPP; +#endif + break; + default: + err = -EINVAL; + break; + } + + return err < 0 ? err : 0; +} + +/* Get message from skb (based on rtnetlink_rcv_skb). Each message is + * processed by audit_receive_msg. Malformed skbs with wrong length are + * discarded silently. 
*/ +static int audit_receive_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + u32 rlen; + + while (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if ((err = audit_receive_msg(skb, nlh))) { + netlink_ack(skb, nlh, -err); + } else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + return 0; +} + +/* Receive messages from netlink socket. */ +static void audit_receive(struct sock *sk, int length) +{ + struct sk_buff *skb; + + if (down_trylock(&audit_netlink_sem)) + return; + + /* FIXME: this must not cause starvation */ + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + if (audit_receive_skb(skb) && skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else + kfree_skb(skb); + } + up(&audit_netlink_sem); +} + +/* Move data from tmp buffer into an skb. This is an extra copy, and + * that is unfortunate. However, the copy will only occur when a record + * is being written to user space, which is already a high-overhead + * operation. (Elimination of the copy is possible, for example, by + * writing directly into a pre-allocated skb, at the cost of wasting + * memory. */ +static void audit_log_move(struct audit_buffer *ab) +{ + struct sk_buff *skb; + char *start; + int extra = ab->nlh ? 0 : NLMSG_SPACE(0); + + skb = skb_peek(&ab->sklist); + if (!skb || skb_tailroom(skb) <= ab->len + extra) { + skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); + if (!skb) { + ab->len = 0; /* Lose information in ab->tmp */ + audit_log_lost("out of memory in audit_log_move"); + return; + } + __skb_queue_tail(&ab->sklist, skb); + if (!ab->nlh) + ab->nlh = (struct nlmsghdr *)skb_put(skb, + NLMSG_SPACE(0)); + } + start = skb_put(skb, ab->len); + memcpy(start, ab->tmp, ab->len); + ab->len = 0; +} + +/* Iterate over the skbuff in the audit_buffer, sending their contents + * to user space. */ +static inline int audit_log_drain(struct audit_buffer *ab) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&ab->sklist))) { + int retval = 0; + + if (audit_pid) { + if (ab->nlh) { + ab->nlh->nlmsg_len = ab->total; + ab->nlh->nlmsg_type = ab->type; + ab->nlh->nlmsg_flags = 0; + ab->nlh->nlmsg_seq = 0; + ab->nlh->nlmsg_pid = ab->pid; + } + skb_get(skb); /* because netlink_* frees */ + retval = netlink_unicast(audit_sock, skb, audit_pid, + MSG_DONTWAIT); + } + if (retval == -EAGAIN && ab->count < 5) { + ++ab->count; + audit_log_end_irq(ab); + return 1; + } + if (retval < 0) { + if (retval == -ECONNREFUSED) { + printk(KERN_ERR + "audit: *NO* daemon at audit_pid=%d\n", + audit_pid); + audit_pid = 0; + } else + audit_log_lost("netlink socket too busy"); + } + if (!audit_pid) { /* No daemon */ + int offset = ab->nlh ? NLMSG_SPACE(0) : 0; + int len = skb->len - offset; + printk(KERN_ERR "%*.*s\n", + len, len, skb->data + offset); + } + kfree_skb(skb); + ab->nlh = NULL; + } + return 0; +} + +/* Initialize audit support at boot time. */ +int __init audit_init(void) +{ + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", + audit_default ? "enabled" : "disabled"); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + if (!audit_sock) + audit_panic("cannot initialize netlink socket"); + + audit_initialized = 1; + audit_enabled = audit_default; + audit_log(NULL, "initialized"); + return 0; +} + +#else +/* Without CONFIG_NET, we have no skbuffs. 
For now, print what we have + * in the buffer. */ +static void audit_log_move(struct audit_buffer *ab) +{ + printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp); + ab->len = 0; +} + +static inline int audit_log_drain(struct audit_buffer *ab) +{ + return 0; +} + +/* Initialize audit support at boot time. */ +int __init audit_init(void) +{ + printk(KERN_INFO "audit: initializing WITHOUT netlink support\n"); + audit_sock = NULL; + audit_pid = 0; + + audit_initialized = 1; + audit_enabled = audit_default; + audit_log(NULL, "initialized"); + return 0; +} +#endif + +__initcall(audit_init); + +/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ +static int __init audit_enable(char *str) +{ + audit_default = !!simple_strtol(str, NULL, 0); + printk(KERN_INFO "audit: %s%s\n", + audit_default ? "enabled" : "disabled", + audit_initialized ? "" : " (after initialization)"); + if (audit_initialized) + audit_enabled = audit_default; + return 0; +} + +__setup("audit=", audit_enable); + + +/* Obtain an audit buffer. This routine does locking to obtain the + * audit buffer, but then no locking is required for calls to + * audit_log_*format. If the tsk is a task that is currently in a + * syscall, then the syscall is marked as auditable and an audit record + * will be written at syscall exit. If there is no associated task, tsk + * should be NULL. */ +struct audit_buffer *audit_log_start(struct audit_context *ctx) +{ + struct audit_buffer *ab = NULL; + unsigned long flags; + struct timespec t; + int serial = 0; + + if (!audit_initialized) + return NULL; + + if (audit_backlog_limit + && atomic_read(&audit_backlog) > audit_backlog_limit) { + if (audit_rate_check()) + printk(KERN_WARNING + "audit: audit_backlog=%d > " + "audit_backlog_limit=%d\n", + atomic_read(&audit_backlog), + audit_backlog_limit); + audit_log_lost("backlog limit exceeded"); + return NULL; + } + + spin_lock_irqsave(&audit_freelist_lock, flags); + if (!list_empty(&audit_freelist)) { + ab = list_entry(audit_freelist.next, + struct audit_buffer, list); + list_del(&ab->list); + --audit_freelist_count; + } + spin_unlock_irqrestore(&audit_freelist_lock, flags); + + if (!ab) + ab = kmalloc(sizeof(*ab), GFP_ATOMIC); + if (!ab) + audit_log_lost("audit: out of memory in audit_log_start"); + if (!ab) + return NULL; + + atomic_inc(&audit_backlog); + skb_queue_head_init(&ab->sklist); + + ab->ctx = ctx; + ab->len = 0; + ab->nlh = NULL; + ab->total = 0; + ab->type = AUDIT_KERNEL; + ab->pid = 0; + ab->count = 0; + +#ifdef CONFIG_AUDITSYSCALL + if (ab->ctx) + audit_get_stamp(ab->ctx, &t, &serial); + else +#endif + t = CURRENT_TIME; + + audit_log_format(ab, "audit(%lu.%03lu:%u): ", + t.tv_sec, t.tv_nsec/1000000, serial); + return ab; +} + + +/* Format an audit message into the audit buffer. If there isn't enough + * room in the audit buffer, more room will be allocated and vsnprint + * will be called a second time. Currently, we assume that a printk + * can't format message larger than 1024 bytes, so we don't either. */ +static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, + va_list args) +{ + int len, avail; + + if (!ab) + return; + + avail = sizeof(ab->tmp) - ab->len; + if (avail <= 0) { + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + } + len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + if (len >= avail) { + /* The printk buffer is 1024 bytes long, so if we get + * here and AUDIT_BUFSIZ is at least 1024, then we can + * log everything that printk could have logged. 
*/ + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + } + ab->len += (len < avail) ? len : avail; + ab->total += (len < avail) ? len : avail; +} + +/* Format a message into the audit buffer. All the work is done in + * audit_log_vformat. */ +void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) +{ + va_list args; + + if (!ab) + return; + va_start(args, fmt); + audit_log_vformat(ab, fmt, args); + va_end(args); +} + +/* This is a helper-function to print the d_path without using a static + * buffer or allocating another buffer in addition to the one in + * audit_buffer. */ +void audit_log_d_path(struct audit_buffer *ab, const char *prefix, + struct dentry *dentry, struct vfsmount *vfsmnt) +{ + char *p; + int len, avail; + + if (prefix) audit_log_format(ab, " %s", prefix); + + if (ab->len > 128) + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail); + if (p == ERR_PTR(-ENAMETOOLONG)) { + /* FIXME: can we save some information here? */ + audit_log_format(ab, "<toolong>"); + } else { + /* path isn't at start of buffer */ + len = (ab->tmp + sizeof(ab->tmp) - 1) - p; + memmove(ab->tmp + ab->len, p, len); + ab->len += len; + ab->total += len; + } +} + +/* Remove queued messages from the audit_txlist and send them to userspace. */ +static void audit_tasklet_handler(unsigned long arg) +{ + LIST_HEAD(list); + struct audit_buffer *ab; + unsigned long flags; + + spin_lock_irqsave(&audit_txlist_lock, flags); + list_splice_init(&audit_txlist, &list); + spin_unlock_irqrestore(&audit_txlist_lock, flags); + + while (!list_empty(&list)) { + ab = list_entry(list.next, struct audit_buffer, list); + list_del(&ab->list); + audit_log_end_fast(ab); + } +} + +static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0); + +/* The netlink_* functions cannot be called inside an irq context, so + * the audit buffer is places on a queue and a tasklet is scheduled to + * remove them from the queue outside the irq context. May be called in + * any context. */ +void audit_log_end_irq(struct audit_buffer *ab) +{ + unsigned long flags; + + if (!ab) + return; + spin_lock_irqsave(&audit_txlist_lock, flags); + list_add_tail(&ab->list, &audit_txlist); + spin_unlock_irqrestore(&audit_txlist_lock, flags); + + tasklet_schedule(&audit_tasklet); +} + +/* Send the message in the audit buffer directly to user space. May not + * be called in an irq context. */ +void audit_log_end_fast(struct audit_buffer *ab) +{ + unsigned long flags; + + BUG_ON(in_irq()); + if (!ab) + return; + if (!audit_rate_check()) { + audit_log_lost("rate limit exceeded"); + } else { + audit_log_move(ab); + if (audit_log_drain(ab)) + return; + } + + atomic_dec(&audit_backlog); + spin_lock_irqsave(&audit_freelist_lock, flags); + if (++audit_freelist_count > AUDIT_MAXFREE) + kfree(ab); + else + list_add(&ab->list, &audit_freelist); + spin_unlock_irqrestore(&audit_freelist_lock, flags); +} + +/* Send or queue the message in the audit buffer, depending on the + * current context. (A convenience function that may be called in any + * context.) */ +void audit_log_end(struct audit_buffer *ab) +{ + if (in_irq()) + audit_log_end_irq(ab); + else + audit_log_end_fast(ab); +} + +/* Log an audit record. This is a convenience function that calls + * audit_log_start, audit_log_vformat, and audit_log_end. It may be + * called in any context. */ +void audit_log(struct audit_context *ctx, const char *fmt, ...) 
+{ + struct audit_buffer *ab; + va_list args; + + ab = audit_log_start(ctx); + if (ab) { + va_start(args, fmt); + audit_log_vformat(ab, fmt, args); + va_end(args); + audit_log_end(ab); + } +} + +EXPORT_SYMBOL_GPL(audit_set_rate_limit); +EXPORT_SYMBOL_GPL(audit_set_backlog_limit); +EXPORT_SYMBOL_GPL(audit_set_enabled); +EXPORT_SYMBOL_GPL(audit_set_failure); + +EXPORT_SYMBOL_GPL(audit_log_start); +EXPORT_SYMBOL_GPL(audit_log_format); +EXPORT_SYMBOL_GPL(audit_log_end_irq); +EXPORT_SYMBOL_GPL(audit_log_end_fast); +EXPORT_SYMBOL_GPL(audit_log_end); +EXPORT_SYMBOL_GPL(audit_log); +EXPORT_SYMBOL_GPL(audit_log_d_path); diff --git a/kernel/auditsc.c b/kernel/auditsc.c new file mode 100644 index 000000000000..342b57141fd9 --- /dev/null +++ b/kernel/auditsc.c @@ -0,0 +1,922 @@ +/* auditsc.c -- System-call auditing support -*- linux-c -*- + * Handles all system-call specific auditing features. + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Written by Rickard E. (Rik) Faith <faith@redhat.com> + * + * Many of the ideas implemented here are from Stephen C. Tweedie, + * especially the idea of avoiding a copy by using getname. + * + * The method for actual interception of syscall entry and exit (not in + * this file -- see entry.S) is based on a GPL'd patch written by + * okir@suse.de and Copyright 2003 SuSE Linux AG. + * + */ + +#include <linux/init.h> +#include <asm/atomic.h> +#include <asm/types.h> +#include <linux/mm.h> +#include <linux/module.h> + +#include <linux/audit.h> +#include <linux/personality.h> +#include <linux/time.h> +#include <asm/unistd.h> + +/* 0 = no checking + 1 = put_count checking + 2 = verbose put_count checking +*/ +#define AUDIT_DEBUG 0 + +/* No syscall auditing will take place unless audit_enabled != 0. */ +extern int audit_enabled; + +/* AUDIT_NAMES is the number of slots we reserve in the audit_context + * for saving names from getname(). */ +#define AUDIT_NAMES 20 + +/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the + * audit_context from being used for nameless inodes from + * path_lookup. */ +#define AUDIT_NAMES_RESERVED 7 + +/* At task start time, the audit_state is set in the audit_context using + a per-task filter. At syscall entry, the audit_state is augmented by + the syscall filter. */ +enum audit_state { + AUDIT_DISABLED, /* Do not create per-task audit_context. + * No syscall-specific audit records can + * be generated. */ + AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, + * but don't necessarily fill it in at + * syscall entry time (i.e., filter + * instead). */ + AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, + * and always fill it in at syscall + * entry time. 
This makes a full + * syscall record available if some + * other part of the kernel decides it + * should be recorded. */ + AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, + * always fill it in at syscall entry + * time, and always write out the audit + * record at syscall exit time. */ +}; + +/* When fs/namei.c:getname() is called, we store the pointer in name and + * we don't let putname() free it (instead we free all of the saved + * pointers at syscall exit time). + * + * Further, in fs/namei.c:path_lookup() we store the inode and device. */ +struct audit_names { + const char *name; + unsigned long ino; + dev_t rdev; +}; + +/* The per-task audit context. */ +struct audit_context { + int in_syscall; /* 1 if task is in a syscall */ + enum audit_state state; + unsigned int serial; /* serial number for record */ + struct timespec ctime; /* time of syscall entry */ + uid_t loginuid; /* login uid (identity) */ + int major; /* syscall number */ + unsigned long argv[4]; /* syscall arguments */ + int return_valid; /* return code is valid */ + int return_code;/* syscall return code */ + int auditable; /* 1 if record should be written */ + int name_count; + struct audit_names names[AUDIT_NAMES]; + struct audit_context *previous; /* For nested syscalls */ + + /* Save things to print about task_struct */ + pid_t pid; + uid_t uid, euid, suid, fsuid; + gid_t gid, egid, sgid, fsgid; + unsigned long personality; + +#if AUDIT_DEBUG + int put_count; + int ino_count; +#endif +}; + + /* Public API */ +/* There are three lists of rules -- one to search at task creation + * time, one to search at syscall entry time, and another to search at + * syscall exit time. */ +static LIST_HEAD(audit_tsklist); +static LIST_HEAD(audit_entlist); +static LIST_HEAD(audit_extlist); + +struct audit_entry { + struct list_head list; + struct rcu_head rcu; + struct audit_rule rule; +}; + +/* Check to see if two rules are identical. It is called from + * audit_del_rule during AUDIT_DEL. */ +static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) +{ + int i; + + if (a->flags != b->flags) + return 1; + + if (a->action != b->action) + return 1; + + if (a->field_count != b->field_count) + return 1; + + for (i = 0; i < a->field_count; i++) { + if (a->fields[i] != b->fields[i] + || a->values[i] != b->values[i]) + return 1; + } + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (a->mask[i] != b->mask[i]) + return 1; + + return 0; +} + +/* Note that audit_add_rule and audit_del_rule are called via + * audit_receive() in audit.c, and are protected by + * audit_netlink_sem. */ +static inline int audit_add_rule(struct audit_entry *entry, + struct list_head *list) +{ + if (entry->rule.flags & AUDIT_PREPEND) { + entry->rule.flags &= ~AUDIT_PREPEND; + list_add_rcu(&entry->list, list); + } else { + list_add_tail_rcu(&entry->list, list); + } + return 0; +} + +static void audit_free_rule(void *arg) +{ + kfree(arg); +} + +/* Note that audit_add_rule and audit_del_rule are called via + * audit_receive() in audit.c, and are protected by + * audit_netlink_sem. */ +static inline int audit_del_rule(struct audit_rule *rule, + struct list_head *list) +{ + struct audit_entry *e; + + /* Do not use the _rcu iterator here, since this is the only + * deletion routine. 
*/ + list_for_each_entry(e, list, list) { + if (!audit_compare_rule(rule, &e->rule)) { + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule, e); + return 0; + } + } + return -EFAULT; /* No matching rule */ +} + +#ifdef CONFIG_NET +/* Copy rule from user-space to kernel-space. Called during + * AUDIT_ADD. */ +static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +{ + int i; + + if (s->action != AUDIT_NEVER + && s->action != AUDIT_POSSIBLE + && s->action != AUDIT_ALWAYS) + return -1; + if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) + return -1; + + d->flags = s->flags; + d->action = s->action; + d->field_count = s->field_count; + for (i = 0; i < d->field_count; i++) { + d->fields[i] = s->fields[i]; + d->values[i] = s->values[i]; + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; + return 0; +} + +int audit_receive_filter(int type, int pid, int uid, int seq, void *data) +{ + u32 flags; + struct audit_entry *entry; + int err = 0; + + switch (type) { + case AUDIT_LIST: + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_sem held. */ + list_for_each_entry(entry, &audit_tsklist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + list_for_each_entry(entry, &audit_entlist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + list_for_each_entry(entry, &audit_extlist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + break; + case AUDIT_ADD: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) + return -ENOMEM; + if (audit_copy_rule(&entry->rule, data)) { + kfree(entry); + return -EINVAL; + } + flags = entry->rule.flags; + if (!err && (flags & AUDIT_PER_TASK)) + err = audit_add_rule(entry, &audit_tsklist); + if (!err && (flags & AUDIT_AT_ENTRY)) + err = audit_add_rule(entry, &audit_entlist); + if (!err && (flags & AUDIT_AT_EXIT)) + err = audit_add_rule(entry, &audit_extlist); + break; + case AUDIT_DEL: + flags =((struct audit_rule *)data)->flags; + if (!err && (flags & AUDIT_PER_TASK)) + err = audit_del_rule(data, &audit_tsklist); + if (!err && (flags & AUDIT_AT_ENTRY)) + err = audit_del_rule(data, &audit_entlist); + if (!err && (flags & AUDIT_AT_EXIT)) + err = audit_del_rule(data, &audit_extlist); + break; + default: + return -EINVAL; + } + + return err; +} +#endif + +/* Compare a task_struct with an audit_rule. Return 1 on match, 0 + * otherwise. 
*/ +static int audit_filter_rules(struct task_struct *tsk, + struct audit_rule *rule, + struct audit_context *ctx, + enum audit_state *state) +{ + int i, j; + + for (i = 0; i < rule->field_count; i++) { + u32 field = rule->fields[i] & ~AUDIT_NEGATE; + u32 value = rule->values[i]; + int result = 0; + + switch (field) { + case AUDIT_PID: + result = (tsk->pid == value); + break; + case AUDIT_UID: + result = (tsk->uid == value); + break; + case AUDIT_EUID: + result = (tsk->euid == value); + break; + case AUDIT_SUID: + result = (tsk->suid == value); + break; + case AUDIT_FSUID: + result = (tsk->fsuid == value); + break; + case AUDIT_GID: + result = (tsk->gid == value); + break; + case AUDIT_EGID: + result = (tsk->egid == value); + break; + case AUDIT_SGID: + result = (tsk->sgid == value); + break; + case AUDIT_FSGID: + result = (tsk->fsgid == value); + break; + case AUDIT_PERS: + result = (tsk->personality == value); + break; + + case AUDIT_EXIT: + if (ctx && ctx->return_valid) + result = (ctx->return_code == value); + break; + case AUDIT_SUCCESS: + if (ctx && ctx->return_valid) + result = (ctx->return_code >= 0); + break; + case AUDIT_DEVMAJOR: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (MAJOR(ctx->names[j].rdev)==value) { + ++result; + break; + } + } + } + break; + case AUDIT_DEVMINOR: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (MINOR(ctx->names[j].rdev)==value) { + ++result; + break; + } + } + } + break; + case AUDIT_INODE: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (MINOR(ctx->names[j].ino)==value) { + ++result; + break; + } + } + } + break; + case AUDIT_LOGINUID: + result = 0; + if (ctx) + result = (ctx->loginuid == value); + break; + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + if (ctx) + result = (ctx->argv[field-AUDIT_ARG0]==value); + break; + } + + if (rule->fields[i] & AUDIT_NEGATE) + result = !result; + if (!result) + return 0; + } + switch (rule->action) { + case AUDIT_NEVER: *state = AUDIT_DISABLED; break; + case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +/* At process creation time, we can determine if system-call auditing is + * completely disabled for this task. Since we only have the task + * structure at this point, we can only check uid and gid. + */ +static enum audit_state audit_filter_task(struct task_struct *tsk) +{ + struct audit_entry *e; + enum audit_state state; + + rcu_read_lock(); + list_for_each_entry_rcu(e, &audit_tsklist, list) { + if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { + rcu_read_unlock(); + return state; + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +/* At syscall entry and exit time, this filter is called if the + * audit_state is not low enough that auditing cannot take place, but is + * also not high enough that we already know we have to write and audit + * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). 
+ */ +static enum audit_state audit_filter_syscall(struct task_struct *tsk, + struct audit_context *ctx, + struct list_head *list) +{ + struct audit_entry *e; + enum audit_state state; + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + + rcu_read_lock(); + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit + && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + rcu_read_unlock(); + return state; + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +/* This should be called with task_lock() held. */ +static inline struct audit_context *audit_get_context(struct task_struct *tsk, + int return_valid, + int return_code) +{ + struct audit_context *context = tsk->audit_context; + + if (likely(!context)) + return NULL; + context->return_valid = return_valid; + context->return_code = return_code; + + if (context->in_syscall && !context->auditable) { + enum audit_state state; + state = audit_filter_syscall(tsk, context, &audit_extlist); + if (state == AUDIT_RECORD_CONTEXT) + context->auditable = 1; + } + + context->pid = tsk->pid; + context->uid = tsk->uid; + context->gid = tsk->gid; + context->euid = tsk->euid; + context->suid = tsk->suid; + context->fsuid = tsk->fsuid; + context->egid = tsk->egid; + context->sgid = tsk->sgid; + context->fsgid = tsk->fsgid; + context->personality = tsk->personality; + tsk->audit_context = NULL; + return context; +} + +static inline void audit_free_names(struct audit_context *context) +{ + int i; + +#if AUDIT_DEBUG == 2 + if (context->auditable + ||context->put_count + context->ino_count != context->name_count) { + printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" + " name_count=%d put_count=%d" + " ino_count=%d [NOT freeing]\n", + __LINE__, + context->serial, context->major, context->in_syscall, + context->name_count, context->put_count, + context->ino_count); + for (i = 0; i < context->name_count; i++) + printk(KERN_ERR "names[%d] = %p = %s\n", i, + context->names[i].name, + context->names[i].name); + dump_stack(); + return; + } +#endif +#if AUDIT_DEBUG + context->put_count = 0; + context->ino_count = 0; +#endif + + for (i = 0; i < context->name_count; i++) + if (context->names[i].name) + __putname(context->names[i].name); + context->name_count = 0; +} + +static inline void audit_zero_context(struct audit_context *context, + enum audit_state state) +{ + uid_t loginuid = context->loginuid; + + memset(context, 0, sizeof(*context)); + context->state = state; + context->loginuid = loginuid; +} + +static inline struct audit_context *audit_alloc_context(enum audit_state state) +{ + struct audit_context *context; + + if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + return NULL; + audit_zero_context(context, state); + return context; +} + +/* Filter on the task information and allocate a per-task audit context + * if necessary. Doing so turns on system call auditing for the + * specified task. This is called from copy_process, so no lock is + * needed. */ +int audit_alloc(struct task_struct *tsk) +{ + struct audit_context *context; + enum audit_state state; + + if (likely(!audit_enabled)) + return 0; /* Return if not auditing. 
*/ + + state = audit_filter_task(tsk); + if (likely(state == AUDIT_DISABLED)) + return 0; + + if (!(context = audit_alloc_context(state))) { + audit_log_lost("out of memory in audit_alloc"); + return -ENOMEM; + } + + /* Preserve login uid */ + context->loginuid = -1; + if (tsk->audit_context) + context->loginuid = tsk->audit_context->loginuid; + + tsk->audit_context = context; + set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + return 0; +} + +static inline void audit_free_context(struct audit_context *context) +{ + struct audit_context *previous; + int count = 0; + + do { + previous = context->previous; + if (previous || (count && count < 10)) { + ++count; + printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" + " freeing multiple contexts (%d)\n", + context->serial, context->major, + context->name_count, count); + } + audit_free_names(context); + kfree(context); + context = previous; + } while (context); + if (count >= 10) + printk(KERN_ERR "audit: freed %d contexts\n", count); +} + +static void audit_log_exit(struct audit_context *context) +{ + int i; + struct audit_buffer *ab; + + ab = audit_log_start(context); + if (!ab) + return; /* audit_panic has been called */ + audit_log_format(ab, "syscall=%d", context->major); + if (context->personality != PER_LINUX) + audit_log_format(ab, " per=%lx", context->personality); + if (context->return_valid) + audit_log_format(ab, " exit=%u", context->return_code); + audit_log_format(ab, + " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" + " pid=%d loginuid=%d uid=%d gid=%d" + " euid=%d suid=%d fsuid=%d" + " egid=%d sgid=%d fsgid=%d", + context->argv[0], + context->argv[1], + context->argv[2], + context->argv[3], + context->name_count, + context->pid, + context->loginuid, + context->uid, + context->gid, + context->euid, context->suid, context->fsuid, + context->egid, context->sgid, context->fsgid); + audit_log_end(ab); + for (i = 0; i < context->name_count; i++) { + ab = audit_log_start(context); + if (!ab) + continue; /* audit_panic has been called */ + audit_log_format(ab, "item=%d", i); + if (context->names[i].name) + audit_log_format(ab, " name=%s", + context->names[i].name); + if (context->names[i].ino != (unsigned long)-1) + audit_log_format(ab, " inode=%lu", + context->names[i].ino); + /* FIXME: should use format_dev_t, but ab structure is + * opaque. */ + if (context->names[i].rdev != -1) + audit_log_format(ab, " dev=%02x:%02x", + MAJOR(context->names[i].rdev), + MINOR(context->names[i].rdev)); + audit_log_end(ab); + } +} + +/* Free a per-task audit context. Called from copy_process and + * __put_task_struct. */ +void audit_free(struct task_struct *tsk) +{ + struct audit_context *context; + + task_lock(tsk); + context = audit_get_context(tsk, 0, 0); + task_unlock(tsk); + + if (likely(!context)) + return; + + /* Check for system calls that do not go through the exit + * function (e.g., exit_group), then free context block. */ + if (context->in_syscall && context->auditable) + audit_log_exit(context); + + audit_free_context(context); +} + +/* Compute a serial number for the audit record. Audit records are + * written to user-space as soon as they are generated, so a complete + * audit record may be written in several pieces. The timestamp of the + * record and this serial number are used by the user-space daemon to + * determine which pieces belong to the same audit record. The + * (timestamp,serial) tuple is unique for each syscall and is live from + * syscall entry to syscall exit. 
+ * + * Atomic values are only guaranteed to be 24-bit, so we count down. + * + * NOTE: Another possibility is to store the formatted records off the + * audit context (for those records that have a context), and emit them + * all at syscall exit. However, this could delay the reporting of + * significant errors until syscall exit (or never, if the system + * halts). */ +static inline unsigned int audit_serial(void) +{ + static atomic_t serial = ATOMIC_INIT(0xffffff); + unsigned int a, b; + + do { + a = atomic_read(&serial); + if (atomic_dec_and_test(&serial)) + atomic_set(&serial, 0xffffff); + b = atomic_read(&serial); + } while (b != a - 1); + + return 0xffffff - b; +} + +/* Fill in audit context at syscall entry. This only happens if the + * audit context was created when the task was created and the state or + * filters demand the audit context be built. If the state from the + * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, + * then the record will be written at syscall exit time (otherwise, it + * will only be written if another part of the kernel requests that it + * be written). */ +void audit_syscall_entry(struct task_struct *tsk, int major, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4) +{ + struct audit_context *context = tsk->audit_context; + enum audit_state state; + + BUG_ON(!context); + + /* This happens only on certain architectures that make system + * calls in kernel_thread via the entry.S interface, instead of + * with direct calls. (If you are porting to a new + * architecture, hitting this condition can indicate that you + * got the _exit/_leave calls backward in entry.S.) + * + * i386 no + * x86_64 no + * ppc64 yes (see arch/ppc64/kernel/misc.S) + * + * This also happens with vm86 emulation in a non-nested manner + * (entries without exits), so this case must be caught. + */ + if (context->in_syscall) { + struct audit_context *newctx; + +#if defined(__NR_vm86) && defined(__NR_vm86old) + /* vm86 mode should only be entered once */ + if (major == __NR_vm86 || major == __NR_vm86old) + return; +#endif +#if AUDIT_DEBUG + printk(KERN_ERR + "audit(:%d) pid=%d in syscall=%d;" + " entering syscall=%d\n", + context->serial, tsk->pid, context->major, major); +#endif + newctx = audit_alloc_context(context->state); + if (newctx) { + newctx->previous = context; + context = newctx; + tsk->audit_context = newctx; + } else { + /* If we can't alloc a new context, the best we + * can do is to leak memory (any pending putname + * will be lost). The only other alternative is + * to abandon auditing. */ + audit_zero_context(context, context->state); + } + } + BUG_ON(context->in_syscall || context->name_count); + + if (!audit_enabled) + return; + + context->major = major; + context->argv[0] = a1; + context->argv[1] = a2; + context->argv[2] = a3; + context->argv[3] = a4; + + state = context->state; + if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) + state = audit_filter_syscall(tsk, context, &audit_entlist); + if (likely(state == AUDIT_DISABLED)) + return; + + context->serial = audit_serial(); + context->ctime = CURRENT_TIME; + context->in_syscall = 1; + context->auditable = !!(state == AUDIT_RECORD_CONTEXT); +} + +/* Tear down after system call. If the audit context has been marked as + * auditable (either because of the AUDIT_RECORD_CONTEXT state from + * filtering, or because some other part of the kernel write an audit + * message), then write out the syscall information. 
In call cases, + * free the names stored from getname(). */ +void audit_syscall_exit(struct task_struct *tsk, int return_code) +{ + struct audit_context *context; + + get_task_struct(tsk); + task_lock(tsk); + context = audit_get_context(tsk, 1, return_code); + task_unlock(tsk); + + /* Not having a context here is ok, since the parent may have + * called __put_task_struct. */ + if (likely(!context)) + return; + + if (context->in_syscall && context->auditable) + audit_log_exit(context); + + context->in_syscall = 0; + context->auditable = 0; + if (context->previous) { + struct audit_context *new_context = context->previous; + context->previous = NULL; + audit_free_context(context); + tsk->audit_context = new_context; + } else { + audit_free_names(context); + audit_zero_context(context, context->state); + tsk->audit_context = context; + } + put_task_struct(tsk); +} + +/* Add a name to the list. Called from fs/namei.c:getname(). */ +void audit_getname(const char *name) +{ + struct audit_context *context = current->audit_context; + + BUG_ON(!context); + if (!context->in_syscall) { +#if AUDIT_DEBUG == 2 + printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", + __FILE__, __LINE__, context->serial, name); + dump_stack(); +#endif + return; + } + BUG_ON(context->name_count >= AUDIT_NAMES); + context->names[context->name_count].name = name; + context->names[context->name_count].ino = (unsigned long)-1; + context->names[context->name_count].rdev = -1; + ++context->name_count; +} + +/* Intercept a putname request. Called from + * include/linux/fs.h:putname(). If we have stored the name from + * getname in the audit context, then we delay the putname until syscall + * exit. */ +void audit_putname(const char *name) +{ + struct audit_context *context = current->audit_context; + + BUG_ON(!context); + if (!context->in_syscall) { +#if AUDIT_DEBUG == 2 + printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", + __FILE__, __LINE__, context->serial, name); + if (context->name_count) { + int i; + for (i = 0; i < context->name_count; i++) + printk(KERN_ERR "name[%d] = %p = %s\n", i, + context->names[i].name, + context->names[i].name); + } +#endif + __putname(name); + } +#if AUDIT_DEBUG + else { + ++context->put_count; + if (context->put_count > context->name_count) { + printk(KERN_ERR "%s:%d(:%d): major=%d" + " in_syscall=%d putname(%p) name_count=%d" + " put_count=%d\n", + __FILE__, __LINE__, + context->serial, context->major, + context->in_syscall, name, context->name_count, + context->put_count); + dump_stack(); + } + } +#endif +} + +/* Store the inode and device from a lookup. Called from + * fs/namei.c:path_lookup(). */ +void audit_inode(const char *name, unsigned long ino, dev_t rdev) +{ + int idx; + struct audit_context *context = current->audit_context; + + if (!context->in_syscall) + return; + if (context->name_count + && context->names[context->name_count-1].name + && context->names[context->name_count-1].name == name) + idx = context->name_count - 1; + else if (context->name_count > 1 + && context->names[context->name_count-2].name + && context->names[context->name_count-2].name == name) + idx = context->name_count - 2; + else { + /* FIXME: how much do we care about inodes that have no + * associated name? 
*/ + if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED) + return; + idx = context->name_count++; + context->names[idx].name = NULL; +#if AUDIT_DEBUG + ++context->ino_count; +#endif + } + context->names[idx].ino = ino; + context->names[idx].rdev = rdev; +} + +void audit_get_stamp(struct audit_context *ctx, + struct timespec *t, int *serial) +{ + if (ctx) { + t->tv_sec = ctx->ctime.tv_sec; + t->tv_nsec = ctx->ctime.tv_nsec; + *serial = ctx->serial; + ctx->auditable = 1; + } else { + *t = CURRENT_TIME; + *serial = 0; + } +} + +int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid) +{ + if (ctx) { + if (loginuid < 0) + return -EINVAL; + ctx->loginuid = loginuid; + } + return 0; +} + +EXPORT_SYMBOL_GPL(audit_alloc); +EXPORT_SYMBOL_GPL(audit_free); +EXPORT_SYMBOL_GPL(audit_syscall_entry); +EXPORT_SYMBOL_GPL(audit_syscall_exit); +EXPORT_SYMBOL_GPL(audit_getname); +EXPORT_SYMBOL_GPL(audit_putname); +EXPORT_SYMBOL_GPL(audit_inode); diff --git a/kernel/exit.c b/kernel/exit.c index 308f6959add6..0ec66729ead8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -92,7 +92,6 @@ repeat: p->parent->cstime += p->stime + p->cstime; p->parent->cmin_flt += p->min_flt + p->cmin_flt; p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt; - p->parent->cnswap += p->nswap + p->cnswap; p->parent->cnvcsw += p->nvcsw + p->cnvcsw; p->parent->cnivcsw += p->nivcsw + p->cnivcsw; sched_exit(p); @@ -136,13 +135,13 @@ int session_of_pgrp(int pgrp) read_lock(&tasklist_lock); for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) - if (p->session > 0) { - sid = p->session; + if (p->signal->session > 0) { + sid = p->signal->session; goto out; } p = find_task_by_pid(pgrp); if (p) - sid = p->session; + sid = p->signal->session; out: read_unlock(&tasklist_lock); @@ -170,7 +169,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) || p->real_parent->pid == 1) continue; if (process_group(p->real_parent) != pgrp - && p->real_parent->session == p->session) { + && p->real_parent->signal->session == p->signal->session) { ret = 0; break; } @@ -259,14 +258,14 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current; - if (curr->session != session) { + if (curr->signal->session != session) { detach_pid(curr, PIDTYPE_SID); - curr->session = session; + curr->signal->session = session; attach_pid(curr, PIDTYPE_SID, session); } if (process_group(curr) != pgrp) { detach_pid(curr, PIDTYPE_PGID); - curr->group_leader->__pgrp = pgrp; + curr->signal->pgrp = pgrp; attach_pid(curr, PIDTYPE_PGID, pgrp); } } @@ -341,7 +340,7 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - current->tty = NULL; + current->signal->tty = NULL; /* Block and flush all signals */ sigfillset(&blocked); @@ -564,7 +563,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) * outside, so the child pgrp is now orphaned. 
*/ if ((process_group(p) != process_group(father)) && - (p->session == father->session)) { + (p->signal->session == father->signal->session)) { int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { @@ -675,7 +674,7 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; if ((process_group(t) != process_group(tsk)) && - (t->session == tsk->session) && + (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); @@ -777,10 +776,9 @@ asmlinkage NORET_TYPE void do_exit(long code) __exit_files(tsk); __exit_fs(tsk); exit_namespace(tsk); - exit_itimers(tsk); exit_thread(); - if (tsk->leader) + if (tsk->signal->leader) disassociate_ctty(1); module_put(tsk->thread_info->exec_domain->module); diff --git a/kernel/fork.c b/kernel/fork.c index 3b17a249c50d..4f5b018777d8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -21,6 +21,7 @@ #include <linux/completion.h> #include <linux/namespace.h> #include <linux/personality.h> +#include <linux/sem.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/mman.h> @@ -31,6 +32,7 @@ #include <linux/futex.h> #include <linux/ptrace.h> #include <linux/mount.h> +#include <linux/audit.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -39,9 +41,6 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); -extern void exit_sem(struct task_struct *tsk); - /* The idle threads do not count.. * Protected by write_lock_irq(&tasklist_lock) */ @@ -85,6 +84,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + if (unlikely(tsk->audit_context)) + audit_free(tsk); security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); @@ -209,11 +210,14 @@ EXPORT_SYMBOL(autoremove_wake_function); void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef ARCH_MIN_TASKALIGN +#define ARCH_MIN_TASKALIGN 0 +#endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - sizeof(struct task_struct),0, - SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); + sizeof(struct task_struct),ARCH_MIN_TASKALIGN, + 0, NULL, NULL); if (!task_struct_cachep) panic("fork_init(): cannot create task_struct SLAB cache"); #endif @@ -322,7 +326,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) /* insert tmp into the share list, just after mpnt */ down(&file->f_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); + list_add(&tmp->shared, &mpnt->shared); up(&file->f_mapping->i_shared_sem); } @@ -512,7 +516,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->cmin_flt = tsk->cmaj_flt = 0; - tsk->nswap = tsk->cnswap = 0; tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0; tsk->mm = NULL; @@ -812,6 +815,13 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->group_stop_count = 0; sig->curr_target = NULL; init_sigpending(&sig->shared_pending); + INIT_LIST_HEAD(&sig->posix_timers); + + sig->tty = current->signal->tty; + sig->pgrp = process_group(current); + sig->session = current->signal->session; + sig->leader = 0; /* session leadership doesn't inherit */ + sig->tty_old_pgrp = 0; return 0; } @@ -923,7 +933,6 @@ struct 
@@ -923,7 +933,6 @@ struct task_struct *copy_process(unsigned long clone_flags,

 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-	INIT_LIST_HEAD(&p->posix_timers);
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
@@ -937,21 +946,22 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	init_timer(&p->real_timer);
 	p->real_timer.data = (unsigned long) p;

-	p->leader = 0;		/* session leadership doesn't inherit */
-	p->tty_old_pgrp = 0;
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = get_jiffies_64();
 	p->security = NULL;
 	p->io_context = NULL;
+	p->audit_context = NULL;

 	retval = -ENOMEM;
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup;
+	if ((retval = audit_alloc(p)))
+		goto bad_fork_cleanup_security;
 	/* copy all the process information */
 	if ((retval = copy_semundo(clone_flags, p)))
-		goto bad_fork_cleanup_security;
+		goto bad_fork_cleanup_audit;
 	if ((retval = copy_files(clone_flags, p)))
 		goto bad_fork_cleanup_semundo;
 	if ((retval = copy_fs(clone_flags, p)))
@@ -1057,7 +1067,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_TGID, p->tgid);
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->session);
+		attach_pid(p, PIDTYPE_SID, p->signal->session);
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	} else
@@ -1076,6 +1086,8 @@ bad_fork_cleanup_namespace:
 	exit_namespace(p);
 bad_fork_cleanup_mm:
 	exit_mm(p);
+	if (p->active_mm)
+		mmdrop(p->active_mm);
 bad_fork_cleanup_signal:
 	exit_signal(p);
 bad_fork_cleanup_sighand:
@@ -1086,6 +1098,8 @@ bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup_semundo:
 	exit_sem(p);
+bad_fork_cleanup_audit:
+	audit_free(p);
 bad_fork_cleanup_security:
 	security_task_free(p);
 bad_fork_cleanup:
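audit_alloc() slots into copy_process()'s goto-based unwind ladder: resources are acquired in a fixed order, each gets a cleanup label, and a failure at any step jumps to the label that releases everything acquired so far, in reverse. The shape of that idiom in miniature (hypothetical resources; standalone C):

#include <stdlib.h>

struct ctx { void *security, *audit, *files; };

/* Acquire in order; on failure, fall through the labels so each one
 * undoes exactly the steps that succeeded before it. */
static int setup_ctx(struct ctx *c)
{
	if (!(c->security = malloc(16)))
		goto bad;
	if (!(c->audit = malloc(16)))
		goto bad_security;	/* audit failed: undo security */
	if (!(c->files = malloc(16)))
		goto bad_audit;		/* files failed: undo audit, then security */
	return 0;

bad_audit:
	free(c->audit);
bad_security:
	free(c->security);
bad:
	return -1;
}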
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5261de82029b..0002fcd4c554 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 	};
 	DECLARE_WORK(work, __call_usermodehelper, &sub_info);

-	if (!system_running)
+	if (system_state != SYSTEM_RUNNING)
 		return -EBUSY;

 	if (path[0] == '\0')
diff --git a/kernel/module.c b/kernel/module.c
index 16587e133b1b..a472deef9bdf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -493,7 +493,6 @@ static inline int __try_stop_module(void *_sref)
 	}

 	/* Mark it as dying. */
-	sref->mod->waiter = current;
 	sref->mod->state = MODULE_STATE_GOING;
 	return 0;
 }
@@ -588,6 +587,9 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		}
 	}

+	/* Set this up before setting mod->state */
+	mod->waiter = current;
+
 	/* Stop the machine so refcounts can't move and disable module. */
 	ret = try_stop_module(mod, flags, &forced);
diff --git a/kernel/params.c b/kernel/params.c
index 4d9a71b743c5..59667bce9ce0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -96,6 +96,13 @@ static char *next_arg(char *args, char **param, char **val)
 		else {
 			args[equals] = '\0';
 			*val = args + equals + 1;
+
+			/* Don't include quotes in value. */
+			if (**val == '"') {
+				(*val)++;
+				if (args[i-1] == '"')
+					args[i-1] = '\0';
+			}
 		}
 	}

 	if (args[i]) {
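The quote handling above lets a boot parameter such as param="two words" arrive in the module code without its surrounding quotes. The same stripping logic in isolation (standalone C, not the kernel's next_arg() itself):

#include <stdio.h>
#include <string.h>

/* Strip one pair of surrounding double quotes from a value, in place.
 * Mirrors the intent of the hunk above; simplified illustration. */
static char *strip_quotes(char *val)
{
	size_t len = strlen(val);

	if (len >= 2 && val[0] == '"' && val[len - 1] == '"') {
		val[len - 1] = '\0';	/* drop closing quote */
		val++;			/* skip opening quote */
	}
	return val;
}

int main(void)
{
	char arg[] = "\"two words\"";
	printf("%s\n", strip_quotes(arg));	/* prints: two words */
	return 0;
}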
diff --git a/kernel/pid.c b/kernel/pid.c
index 4c85144759c5..6ed44f56ca45 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -253,14 +253,14 @@ void switch_exec_pids(task_t *leader, task_t *thread)

 	attach_pid(thread, PIDTYPE_PID, thread->pid);
 	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-	attach_pid(thread, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(thread, PIDTYPE_SID, thread->session);
+	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
 	list_add_tail(&thread->tasks, &init_task.tasks);

 	attach_pid(leader, PIDTYPE_PID, leader->pid);
 	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-	attach_pid(leader, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(leader, PIDTYPE_SID, leader->session);
+	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
+	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
 }

 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 082693e383cf..3de4d0ae9d26 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -317,12 +317,21 @@ static void timer_notify_task(struct k_itimer *timr)
 	if (timr->it_incr)
 		timr->sigq->info.si_sys_private = ++timr->it_requeue_pending;

-	if (timr->it_sigev_notify & SIGEV_THREAD_ID )
+	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+		if (unlikely(timr->it_process->flags & PF_EXITING)) {
+			timr->it_sigev_notify = SIGEV_SIGNAL;
+			put_task_struct(timr->it_process);
+			timr->it_process = timr->it_process->group_leader;
+			goto group;
+		}
 		ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
-	else
+	}
+	else {
+	group:
 		ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
+	}
 	if (ret) {
 		/*
 		 * signal was not sent because of sig_ignor
@@ -352,7 +361,7 @@ static void posix_timer_fn(unsigned long __data)

 static inline struct task_struct * good_sigevent(sigevent_t * event)
 {
-	struct task_struct *rtn = current;
+	struct task_struct *rtn = current->group_leader;

 	if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
 		(!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
@@ -395,11 +404,15 @@ static struct k_itimer * alloc_posix_timer(void)
 static void release_posix_timer(struct k_itimer *tmr)
 {
 	if (tmr->it_id != -1) {
-		spin_lock_irq(&idr_lock);
+		unsigned long flags;
+		spin_lock_irqsave(&idr_lock, flags);
 		idr_remove(&posix_timers_id, tmr->it_id);
-		spin_unlock_irq(&idr_lock);
+		spin_unlock_irqrestore(&idr_lock, flags);
 	}
 	sigqueue_free(tmr->sigq);
+	if (unlikely(tmr->it_process) &&
+	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+		put_task_struct(tmr->it_process);
 	kmem_cache_free(posix_timers_cache, tmr);
 }

@@ -414,6 +427,7 @@ sys_timer_create(clockid_t which_clock,
 	struct k_itimer *new_timer = NULL;
 	timer_t new_timer_id;
 	struct task_struct *process = 0;
+	unsigned long flags;
 	sigevent_t event;

 	if ((unsigned) which_clock >= MAX_CLOCKS ||
@@ -458,7 +472,7 @@ sys_timer_create(clockid_t which_clock,
 			 * We may be setting up this process for another
 			 * thread.  It may be exiting.  To catch this
 			 * case we check the PF_EXITING flag.  If
-			 * the flag is not set, the task_lock will catch
+			 * the flag is not set, the siglock will catch
 			 * him before it is too late (in exit_itimers).
 			 *
 			 * The exec case is a bit more involved but easy
@@ -469,13 +483,14 @@ sys_timer_create(clockid_t which_clock,
 			 * for us to die which means we can finish this
 			 * linkage with our last gasp. I.e. no code :)
 			 */
-			task_lock(process);
+			spin_lock_irqsave(&process->sighand->siglock, flags);
 			if (!(process->flags & PF_EXITING)) {
 				list_add(&new_timer->list,
-					 &process->posix_timers);
-				task_unlock(process);
+					 &process->signal->posix_timers);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
+				get_task_struct(process);
 			} else {
-				task_unlock(process);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
 				process = 0;
 			}
 		}
@@ -491,10 +506,10 @@ sys_timer_create(clockid_t which_clock,
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;
 		new_timer->it_sigev_signo = SIGALRM;
 		new_timer->it_sigev_value.sival_int = new_timer->it_id;
-		process = current;
-		task_lock(process);
-		list_add(&new_timer->list, &process->posix_timers);
-		task_unlock(process);
+		process = current->group_leader;
+		spin_lock_irqsave(&process->sighand->siglock, flags);
+		list_add(&new_timer->list, &process->signal->posix_timers);
+		spin_unlock_irqrestore(&process->sighand->siglock, flags);
 	}

 	new_timer->it_clock = which_clock;
@@ -925,14 +940,18 @@ retry_delete:
 #else
 	p_timer_del(&posix_clocks[timer->it_clock], timer);
 #endif
-	task_lock(timer->it_process);
+	spin_lock(&current->sighand->siglock);
 	list_del(&timer->list);
-	task_unlock(timer->it_process);
+	spin_unlock(&current->sighand->siglock);
 	/*
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
 		timer->it_process = NULL;
+	}
 	unlock_timer(timer, flags);
 	release_posix_timer(timer);
 	return 0;
@@ -942,24 +961,50 @@ retry_delete:
  */
 static inline void itimer_delete(struct k_itimer *timer)
 {
-	if (sys_timer_delete(timer->it_id))
-		BUG();
+	unsigned long flags;
+
+#ifdef CONFIG_SMP
+	int error;
+retry_delete:
+#endif
+	spin_lock_irqsave(&timer->it_lock, flags);
+
+#ifdef CONFIG_SMP
+	error = p_timer_del(&posix_clocks[timer->it_clock], timer);
+
+	if (error == TIMER_RETRY) {
+		unlock_timer(timer, flags);
+		goto retry_delete;
+	}
+#else
+	p_timer_del(&posix_clocks[timer->it_clock], timer);
+#endif
+	list_del(&timer->list);
+	/*
+	 * This keeps any tasks waiting on the spin lock from thinking
+	 * they got something (see the lock code above).
+	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
+		timer->it_process = NULL;
+	}
+	unlock_timer(timer, flags);
+	release_posix_timer(timer);
 }
+
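The common thread in these posix-timers hunks: whenever a k_itimer caches a task pointer for SIGEV_THREAD_ID delivery, it now takes a reference (get_task_struct() under the siglock) and every teardown path drops it with put_task_struct() exactly once before clearing the pointer. The pairing, modelled in portable C11; the types are stand-ins, not the kernel API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct task {
	atomic_int usage;	/* stands in for the task usage counter, starts at 1 */
	pthread_mutex_t siglock;
};

struct ktimer {
	struct task *target;	/* non-NULL only while we hold a reference */
};

static void timer_bind(struct ktimer *t, struct task *p)
{
	pthread_mutex_lock(&p->siglock);
	atomic_fetch_add(&p->usage, 1);	/* get_task_struct() analogue */
	t->target = p;
	pthread_mutex_unlock(&p->siglock);
}

static void timer_release(struct ktimer *t)
{
	if (t->target) {
		/* put_task_struct() analogue: drop exactly once per bind */
		if (atomic_fetch_sub(&t->target->usage, 1) == 1)
			free(t->target);
		t->target = NULL;
	}
}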
 /*
- * This is exported to exit and exec
+ * This is called by __exit_signal, only when there are no more
+ * references to the shared signal_struct.
  */
-void exit_itimers(struct task_struct *tsk)
+void exit_itimers(struct signal_struct *sig)
 {
 	struct k_itimer *tmr;

-	task_lock(tsk);
-	while (!list_empty(&tsk->posix_timers)) {
-		tmr = list_entry(tsk->posix_timers.next, struct k_itimer, list);
-		task_unlock(tsk);
+	while (!list_empty(&sig->posix_timers)) {
+		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
 		itimer_delete(tmr);
-		task_lock(tsk);
 	}
-	task_unlock(tsk);
 }

 /*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 033eea403f26..6bb62269f3eb 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -9,9 +9,9 @@ config PM

 	  Power Management is most important for battery powered laptop
 	  computers; if you have a laptop, check out the Linux Laptop home
-	  page on the WWW at
-	  <http://www.cs.utexas.edu/users/kharker/linux-laptop/> and the
-	  Battery Powered Linux mini-HOWTO, available from
+	  page on the WWW at <http://www.linux-on-laptops.com/> or
+	  Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
+	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.

 	  Note that, even if you say N here, Linux on the x86 architecture
@@ -44,7 +44,7 @@ config SOFTWARE_SUSPEND

 config PM_DISK
 	bool "Suspend-to-Disk Support"
-	depends on PM && SWAP
+	depends on PM && SWAP && X86 && !X86_64
 	---help---
 	  Suspend-to-disk is a power management state in which the contents
 	  of memory are stored on disk and the entire system is shut down or
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 2d4cf319b8e1..6abcf99b7ada 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -84,7 +84,6 @@ static void free_some_memory(void)
 	while (shrink_all_memory(10000))
 		printk(".");
 	printk("|\n");
-	blk_run_queues();
 }

@@ -285,11 +284,16 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 {
 	int error = 0;
 	int i;
+	int len;
+	char *p;
 	u32 mode = 0;

+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
 	down(&pm_sem);
 	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
-		if (!strcmp(buf,pm_disk_modes[i])) {
+		if (!strncmp(buf, pm_disk_modes[i], len)) {
 			mode = i;
 			break;
 		}
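disk_store() above (and state_store() below) stop comparing the raw sysfs buffer with strcmp(), because a command like `echo disk > /sys/power/state` delivers a trailing newline that made every comparison fail. The memchr()/strncmp() idiom in isolation; the mode table is hypothetical, and the extra strlen() check (an addition of this sketch, not of the patch) also rejects bare prefixes:

#include <stdio.h>
#include <string.h>

static const char *modes[] = { "firmware", "platform", "shutdown", NULL };

/* Match user input against a mode name, ignoring a trailing newline. */
static int parse_mode(const char *buf, size_t n)
{
	const char *p = memchr(buf, '\n', n);
	size_t len = p ? (size_t)(p - buf) : n;
	int i;

	for (i = 0; modes[i]; i++)
		if (strlen(modes[i]) == len && !strncmp(buf, modes[i], len))
			return i;
	return -1;
}

int main(void)
{
	printf("%d\n", parse_mode("platform\n", 9));	/* prints: 1 */
	return 0;
}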
diff --git a/kernel/power/main.c b/kernel/power/main.c
index fd212e7ecd9f..d582906fecc6 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -218,10 +218,15 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
 {
 	u32 state = PM_SUSPEND_STANDBY;
 	char ** s;
+	char *p;
 	int error;
+	int len;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;

 	for (s = &pm_states[state]; *s; s++, state++) {
-		if (!strcmp(buf,*s))
+		if (!strncmp(buf, *s, len))
 			break;
 	}
 	if (*s)
diff --git a/kernel/power/pmdisk.c b/kernel/power/pmdisk.c
index 1dc29b53a25e..22855abbdd6e 100644
--- a/kernel/power/pmdisk.c
+++ b/kernel/power/pmdisk.c
@@ -35,7 +35,7 @@

 #include "power.h"

-extern int pmdisk_arch_suspend(int resume);
+extern asmlinkage int pmdisk_arch_suspend(int resume);

 #define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
 #define ADDRESS(x)  __ADDRESS((x) << PAGE_SHIFT)
@@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsigned int num, int err)

 static void wait_io(void)
 {
-	blk_run_queues();
 	while(atomic_read(&io_done))
 		io_schedule();
 }
@@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_off, void * page)
 	if (rw == WRITE)
 		bio_set_pages_dirty(bio);
 	start_io();
-	submit_bio(rw,bio);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 	wait_io();
 Done:
 	bio_put(bio);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 15c1b340c2ed..8225457183ed 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,7 +30,8 @@ static inline int freezeable(struct task_struct * p)
 	if ((p == current) ||
 	    (p->flags & PF_IOTHREAD) ||
 	    (p->state == TASK_ZOMBIE) ||
-	    (p->state == TASK_DEAD))
+	    (p->state == TASK_DEAD) ||
+	    (p->state == TASK_STOPPED))
 		return 0;
 	return 1;
 }
@@ -38,21 +39,19 @@ static inline int freezeable(struct task_struct * p)
 /* Refrigerator is place where frozen processes are stored :-). */
 void refrigerator(unsigned long flag)
 {
-	/* You need correct to work with real-time processes.
-	   OTOH, this way one process may see (via /proc/) some other
-	   process in stopped state (and thereby discovered we were
-	   suspended. We probably do not care.
-	 */
+	/* Hmm, should we be allowed to suspend when there are realtime
+	   processes around? */
 	long save;
 	save = current->state;
-	current->state = TASK_STOPPED;
+	current->state = TASK_UNINTERRUPTIBLE;
 	pr_debug("%s entered refrigerator\n", current->comm);
 	printk("=");
 	current->flags &= ~PF_FREEZE;
-	if (flag)
-		flush_signals(current); /* We have signaled a kernel thread, which isn't normal behaviour
-					   and that may lead to 100%CPU sucking because those threads
-					   just don't manage signals. */
+
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* We sent fake signal, clean it up */
+	spin_unlock_irq(&current->sighand->siglock);
+
 	current->flags |= PF_FROZEN;
 	while (current->flags & PF_FROZEN)
 		schedule();
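The refrigerator is a two-flag handshake: the suspend path sets PF_FREEZE on each task and pokes it with a fake signal; the task clears PF_FREEZE, sets PF_FROZEN, and sleeps (now uninterruptibly) until resume clears PF_FROZEN. The same handshake modelled with a mutex and condition variable standing in for the scheduler; this is an analogy, not kernel code:

#include <pthread.h>

enum { FLAG_FREEZE = 1, FLAG_FROZEN = 2 };

static int flags;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* Called by the task being frozen (the freezer sets FLAG_FREEZE first). */
void refrigerator(void)
{
	pthread_mutex_lock(&lock);
	flags &= ~FLAG_FREEZE;		/* acknowledge the request... */
	flags |= FLAG_FROZEN;		/* ...and report ourselves frozen */
	pthread_cond_broadcast(&cond);
	while (flags & FLAG_FROZEN)	/* sleep until thawed */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

/* Called by the resume path to release every frozen task. */
void thaw(void)
{
	pthread_mutex_lock(&lock);
	flags &= ~FLAG_FROZEN;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}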
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 20134ab8e0b2..8f78d6807576 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,11 @@
 /*
- * linux/kernel/suspend.c
+ * linux/kernel/power/swsusp.c
  *
  * This file is to realize architecture-independent
  * machine suspend feature using pretty near only high-level routines
  *
  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
  *
  * This file is released under the GPLv2.
  *
@@ -61,6 +61,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/console.h>
+#include <linux/highmem.h>

 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -74,11 +75,6 @@ unsigned char software_suspend_enabled = 0;
 #define NORESUME		1
 #define RESUME_SPECIFIED	2

-
-#define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
-#define ADDRESS(x)  __ADDRESS((x) << PAGE_SHIFT)
-#define ADDRESS2(x) __ADDRESS(__pa(x))	/* Needed for x86-64 where some pages are in memory twice */
-
 /* References to section boundaries */
 extern char __nosave_begin, __nosave_end;
@@ -105,6 +101,10 @@
    time of suspend, that must be freed. Second is "pagedir_nosave",
    allocated at time of resume, that travels through memory not to
    collide with anything.
+
+   Warning: this is even more evil than it seems. Pagedirs this file
+   talks about are completely different from page directories used by
+   MMU hardware.
  */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
 static suspend_pagedir_t *pagedir_save;
@@ -139,15 +139,15 @@ static const char name_resume[] = "Resume Machine: ";
 #define TEST_SWSUSP 0		/* Set to 1 to reboot instead of halt machine after suspension */

 #ifdef DEBUG_DEFAULT
-# define PRINTK(f, a...)	printk(f, ## a)
+# define PRINTK(f, a...)	printk(f, ## a)
 #else
-# define PRINTK(f, a...)
+# define PRINTK(f, a...)	do { } while(0)
 #endif

 #ifdef DEBUG_SLOW
 #define MDELAY(a) mdelay(a)
 #else
-#define MDELAY(a)
+#define MDELAY(a)	do { } while(0)
 #endif

@@ -225,6 +225,7 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
 static void read_swapfiles(void) /* This is called before saving image */
 {
 	int i, len;
+	static char buff[sizeof(resume_file)], *sname;

 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
@@ -243,8 +244,11 @@ static void read_swapfiles(void) /* This is called before saving image */
 				swapfile_used[i] = SWAPFILE_IGNORED;
 			} else {
 				/* we ignore all swap devices that are not the resume_file */
-				if (1) {
-// FIXME	if(resume_device == swap_info[i].swap_device) {
+				sname = d_path(swap_info[i].swap_file->f_dentry,
+					       swap_info[i].swap_file->f_vfsmnt,
+					       buff,
+					       sizeof(buff));
+				if (!strcmp(sname, resume_file)) {
 					swapfile_used[i] = SWAPFILE_SUSPEND;
 					root_swap = i;
 				} else {
@@ -346,7 +350,7 @@ static int write_suspend_image(void)
 		cur = (void *) buffer;

 		if (fill_suspend_header(&cur->sh))
-			panic("\nOut of memory while writing header");
+			BUG(); /* Not a BUG_ON(): we want fill_suspend_header to be called, always */

 		cur->link.next = prev;
@@ -362,73 +366,174 @@ static int write_suspend_image(void)
 	return 0;
 }

-/* if pagedir_p != NULL it also copies the counted pages */
-static int count_and_copy_data_pages(struct pbe *pagedir_p)
-{
-	int chunk_size;
-	int nr_copy_pages = 0;
-	int pfn;
+#ifdef CONFIG_HIGHMEM
+struct highmem_page {
+	char *data;
 	struct page *page;
-
-#ifdef CONFIG_DISCONTIGMEM
-	panic("Discontingmem not supported");
-#else
-	BUG_ON (max_pfn != num_physpages);
-#endif
-	for (pfn = 0; pfn < max_pfn; pfn++) {
+	struct highmem_page *next;
+};
+
+struct highmem_page *highmem_copy = NULL;
+
+static int save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		int chunk_size;
+
+		if (!(pfn%1000))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
-		if (PageHighMem(page))
-			panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			continue;
+		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
+		}
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			return -ENOMEM;
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data) {
+			kfree(save);
+			return -ENOMEM;
+		}
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+	return 0;
+}

-		if (!PageReserved(page)) {
-			if (PageNosave(page))
-				continue;
+static int save_highmem(void)
+{
+	struct zone *zone;
+	int res = 0;
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			res = save_highmem_zone(zone);
+		if (res)
+			return res;
+	}
+	return 0;
+}

-			if ((chunk_size=is_head_of_free_region(page))!=0) {
-				pfn += chunk_size - 1;
-				continue;
-			}
-		} else if (PageReserved(page)) {
-			BUG_ON (PageNosave(page));
+static int restore_highmem(void)
+{
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+	return 0;
+}
+#endif

-			/*
-			 * Just copy whole code segment. Hopefully it is not that big.
-			 */
-			if ((ADDRESS(pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) &&
-			    (ADDRESS(pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
-				PRINTK("[nosave %lx]", ADDRESS(pfn));
-				continue;
-			}
-			/* Hmm, perhaps copying all reserved pages is not too healthy as they may contain
-			   critical bios data? */
-		} else	BUG();
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}

-		nr_copy_pages++;
-		if (pagedir_p) {
-			pagedir_p->orig_address = ADDRESS(pfn);
-			copy_page((void *) pagedir_p->address, (void *) pagedir_p->orig_address);
-			pagedir_p++;
+/* if *pagedir_p != NULL it also copies the counted pages */
+static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
+{
+	unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
+	struct pbe *pbe = *pagedir_p;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%1000))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		BUG_ON(PageReserved(page) && PageNosave(page));
+		if (PageNosave(page))
+			continue;
+		if (PageReserved(page) && pfn_is_nosave(pfn)) {
+			PRINTK("[nosave pfn 0x%lx]", pfn);
+			continue;
+		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
 		}
+		nr_copy_pages++;
+		if (!pbe)
+			continue;
+		pbe->orig_address = (long) page_address(page);
+		copy_page((void *)pbe->address, (void *)pbe->orig_address);
+		pbe++;
 	}
+	*pagedir_p = pbe;
 	return nr_copy_pages;
 }

-static void free_suspend_pagedir(unsigned long this_pagedir)
+static int count_and_copy_data_pages(struct pbe *pagedir_p)
 {
-	struct page *page;
-	int pfn;
-	unsigned long this_pagedir_end = this_pagedir +
-		(PAGE_SIZE << pagedir_order);
+	int nr_copy_pages = 0;
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
+	}
+	return nr_copy_pages;
+}

-	for(pfn = 0; pfn < num_physpages; pfn++) {
+static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
+{
+	unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
+	pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
+	pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
+	pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
 		if (!TestClearPageNosave(page))
 			continue;
+		else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
+			continue;
+		__free_page(page);
+	}
+}

-		if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
-			continue;	/* old pagedir gets freed in one */
-
-		free_page(ADDRESS(pfn));
+static void free_suspend_pagedir(unsigned long this_pagedir)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			free_suspend_pagedir_zone(zone, this_pagedir);
 	}
 	free_pages(this_pagedir, pagedir_order);
 }
@@ -443,7 +548,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
 	pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));

 	p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
-	if(!pagedir)
+	if (!pagedir)
 		return NULL;

 	page = virt_to_page(pagedir);
@@ -452,7 +557,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)

 	while(nr_copy_pages--) {
 		p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
-		if(!p->address) {
+		if (!p->address) {
 			free_suspend_pagedir((unsigned long) pagedir);
 			return NULL;
 		}
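Every helper rewritten above walks memory the same way: for each zone, iterate zone-relative page frame numbers, skip holes with pfn_valid(), and leap over whole free regions at once instead of testing their pages one by one. That traversal skeleton, with the kernel predicates stubbed out so it compiles standalone:

/* Skeleton of the per-zone pfn walk used by count_and_copy_zone() and
 * free_suspend_pagedir_zone(). Types and predicates are simplified
 * stand-ins, not the kernel definitions. */
struct zone {
	unsigned long zone_start_pfn;
	unsigned long spanned_pages;
};

static int pfn_valid(unsigned long pfn) { (void)pfn; return 1; }	/* stub */
static unsigned long is_head_of_free_region(unsigned long pfn) { (void)pfn; return 0; }	/* stub */

static unsigned long walk_zone(const struct zone *zone)
{
	unsigned long zone_pfn, visited = 0;

	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
		unsigned long chunk;

		if (!pfn_valid(pfn))	/* hole in the memory map */
			continue;
		chunk = is_head_of_free_region(pfn);
		if (chunk) {		/* skip a whole free block at once */
			zone_pfn += chunk - 1;
			continue;
		}
		visited++;		/* a page worth saving */
	}
	return visited;
}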
@@ -492,10 +597,19 @@ static int suspend_prepare_image(void)
 	struct sysinfo i;
 	unsigned int nr_needed_pages = 0;

-	drain_local_pages();
-
 	pagedir_nosave = NULL;
-	printk( "/critical section: Counting pages to copy" );
+	printk( "/critical section: ");
+#ifdef CONFIG_HIGHMEM
+	printk( "handling highmem" );
+	if (save_highmem()) {
+		printk(KERN_CRIT "%sNot enough free pages for highmem\n", name_suspend);
+		return -ENOMEM;
+	}
+	printk(", ");
+#endif
+
+	printk("counting pages to copy" );
+	drain_local_pages();
 	nr_copy_pages = count_and_copy_data_pages(NULL);
 	nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;

@@ -504,23 +618,22 @@ static int suspend_prepare_image(void)
 		printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
 		       name_suspend, nr_needed_pages-nr_free_pages());
 		root_swap = 0xFFFF;
-		return 1;
+		return -ENOMEM;
 	}
 	si_swapinfo(&i);	/* FIXME: si_swapinfo(&i) returns all swap devices information.
 				   We should only consider resume_device. */
 	if (i.freeswap < nr_needed_pages)  {
 		printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
 		       name_suspend, nr_needed_pages-i.freeswap);
-		return 1;
+		return -ENOSPC;
 	}

 	PRINTK( "Alloc pagedir\n" );
 	pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
-	if(!pagedir_nosave) {
-		/* Shouldn't happen */
-		printk(KERN_CRIT "%sCouldn't allocate enough pages\n",name_suspend);
-		panic("Really should not happen");
-		return 1;
+	if (!pagedir_nosave) {
+		/* Pagedir is big, one-chunk allocation. It is easily possible for this allocation to fail */
+		printk(KERN_CRIT "%sCouldn't allocate continuous pagedir\n", name_suspend);
+		return -ENOMEM;
 	}
 	nr_copy_pages_check = nr_copy_pages;
 	pagedir_order_check = pagedir_order;
@@ -603,21 +716,25 @@ asmlinkage void do_magic_resume_2(void)

 	PRINTK( "Freeing prev allocated pagedir\n" );
 	free_suspend_pagedir((unsigned long) pagedir_save);
+
+#ifdef CONFIG_HIGHMEM
+	printk( "Restoring highmem\n" );
+	restore_highmem();
+#endif
+	printk("done, devices\n");
+
 	device_power_up();
 	spin_unlock_irq(&suspend_pagedir_lock);
 	device_resume();

-	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
-	release_console_sem();
-
+	/* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
 	PRINTK( "Fixing swap signatures... " );
 	mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
 	PRINTK( "ok\n" );

 #ifdef SUSPEND_CONSOLE
 	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
+	update_screen(fg_console);
 	release_console_sem();
 #endif
 }
@@ -707,11 +824,6 @@ int software_suspend(void)

 	free_some_memory();

-	/* No need to invalidate any vfsmnt list --
-	 * they will be valid after resume, anyway.
-	 */
-	blk_run_queues();
-
 	/* Save state of all device drivers, and stop them. */
 	if ((res = device_suspend(4))==0)
 		/* If stopping device drivers worked, we proceed basically into
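pfn_is_nosave() (introduced a few hunks up) replaces the old ADDRESS()/ADDRESS2() virtual-address comparisons with frame-number arithmetic: convert the __nosave section bounds to pfns once, round the end up to a full page, then test candidates against the half-open range. The same computation standalone; the section addresses and PAGE_SHIFT are assumed example values:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long nosave_begin_phys = 0x00200000;	/* assumed */
static unsigned long nosave_end_phys   = 0x00203400;	/* assumed */

static int pfn_is_nosave(unsigned long pfn)
{
	unsigned long begin_pfn = nosave_begin_phys >> PAGE_SHIFT;
	unsigned long end_pfn = PAGE_ALIGN(nosave_end_phys) >> PAGE_SHIFT;

	/* half-open range: the end bound is rounded up to a page boundary */
	return pfn >= begin_pfn && pfn < end_pfn;
}

int main(void)
{
	printf("%d %d\n", pfn_is_nosave(0x200), pfn_is_nosave(0x204));	/* 1 0 */
	return 0;
}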
diff --git a/kernel/printk.c b/kernel/printk.c
index a7be1f922f34..5f2b3c9bbd6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...)
 		log_level_unknown = 1;
 	}

-	if (!cpu_online(smp_processor_id()) && !system_running) {
+	if (!cpu_online(smp_processor_id()) &&
+	    system_state != SYSTEM_RUNNING) {
 		/*
 		 * Some console drivers may assume that per-cpu resources have
 		 * been allocated. So don't allow them to be called by this
diff --git a/kernel/sched.c b/kernel/sched.c
index d5f21712ffbb..c2d1c0317130 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)

+extern unsigned long __scheduling_functions_start_here;
+extern unsigned long __scheduling_functions_end_here;
+const unsigned long scheduling_functions_start_here =
+			(unsigned long)&__scheduling_functions_start_here;
+const unsigned long scheduling_functions_end_here =
+			(unsigned long)&__scheduling_functions_end_here;
+
 /*
  * Default context-switch locking:
  */
@@ -1587,12 +1594,10 @@ out:
 	rebalance_tick(rq, 0);
 }

-void scheduling_functions_start_here(void) { }
-
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	long *switch_count;
 	task_t *prev, *next;
@@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule);
 * off of preempt_enable.  Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
-asmlinkage void preempt_schedule(void)
+asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();

@@ -1842,7 +1847,6 @@ void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exc
 	__wake_up_common(q, mode, nr_exclusive, 0);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
-
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */

 void fastcall complete(struct completion *x)
@@ -1855,7 +1859,6 @@ void fastcall complete(struct completion *x)
 			 1, 0);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
-
 EXPORT_SYMBOL(complete);

 void fastcall complete_all(struct completion *x)
@@ -1868,8 +1871,9 @@ void fastcall complete_all(struct completion *x)
 			 0, 0);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
+EXPORT_SYMBOL(complete_all);

-void fastcall wait_for_completion(struct completion *x)
+void fastcall __sched wait_for_completion(struct completion *x)
 {
 	might_sleep();
 	spin_lock_irq(&x->wait.lock);
@@ -1889,7 +1893,6 @@ void fastcall wait_for_completion(struct completion *x)
 	x->done--;
 	spin_unlock_irq(&x->wait.lock);
 }
-
 EXPORT_SYMBOL(wait_for_completion);

 #define	SLEEP_ON_VAR				\
@@ -1907,7 +1910,7 @@ EXPORT_SYMBOL(wait_for_completion);
 	__remove_wait_queue(q, &wait);		\
 	spin_unlock_irqrestore(&q->lock, flags);

-void fastcall interruptible_sleep_on(wait_queue_head_t *q)
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR

@@ -1920,7 +1923,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q)

 EXPORT_SYMBOL(interruptible_sleep_on);

-long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR

@@ -1935,7 +1938,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)

 EXPORT_SYMBOL(interruptible_sleep_on_timeout);

-void fastcall sleep_on(wait_queue_head_t *q)
+void fastcall __sched sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR

@@ -1948,7 +1951,7 @@ void fastcall sleep_on(wait_queue_head_t *q)

 EXPORT_SYMBOL(sleep_on);

-long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR

@@ -1963,8 +1966,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)

 EXPORT_SYMBOL(sleep_on_timeout);

-void scheduling_functions_end_here(void) { }
-
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
@@ -2424,7 +2425,7 @@ asmlinkage long sys_sched_yield(void)
 	return 0;
 }

-void __cond_resched(void)
+void __sched __cond_resched(void)
 {
 	set_current_state(TASK_RUNNING);
 	schedule();
@@ -2438,7 +2439,7 @@ EXPORT_SYMBOL(__cond_resched);
 * this is a shortcut for kernel-space yielding - it marks the
 * thread runnable and calls sys_sched_yield().
 */
-void yield(void)
+void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
@@ -2453,7 +2454,7 @@ EXPORT_SYMBOL(yield);
 * But don't do that if it is a deliberate, throttling IO wait (this task
 * has set its backing_dev_info: the queue against which it should throttle)
 */
-void io_schedule(void)
+void __sched io_schedule(void)
 {
 	struct runqueue *rq = this_rq();

@@ -2464,7 +2465,7 @@ void io_schedule(void)

 EXPORT_SYMBOL(io_schedule);

-long io_schedule_timeout(long timeout)
+long __sched io_schedule_timeout(long timeout)
 {
 	struct runqueue *rq = this_rq();
 	long ret;
@@ -2982,7 +2983,8 @@ void __might_sleep(char *file, int line)
 #if defined(in_atomic)
 	static unsigned long prev_jiffy;	/* ratelimiting */

-	if ((in_atomic() || irqs_disabled()) && system_running) {
+	if ((in_atomic() || irqs_disabled()) &&
+	    system_state == SYSTEM_RUNNING) {
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
@@ -3009,7 +3011,7 @@ EXPORT_SYMBOL(__might_sleep);
 *
 * Called inside preempt_disable().
 */
-void __preempt_spin_lock(spinlock_t *lock)
+void __sched __preempt_spin_lock(spinlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_spin_lock(lock);
@@ -3025,7 +3027,7 @@ void __preempt_spin_lock(spinlock_t *lock)

 EXPORT_SYMBOL(__preempt_spin_lock);

-void __preempt_write_lock(rwlock_t *lock)
+void __sched __preempt_write_lock(rwlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_write_lock(lock);
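The __sched annotation retires the scheduling_functions_start_here()/end_here() marker functions: rather than relying on link order, every function a task can sleep in is collected into one text section, and the exported section bounds let the wchan code recognise scheduler frames. A sketch of how such an attribute is defined and consumed; the section name follows the hunks above, while the surrounding details are simplified assumptions:

/* Place sleeping functions into their own text section so that
 * get_wchan() can identify scheduler frames by address range.
 * Simplified sketch: in the real kernel the attribute lives in a
 * header and the bounds come from the arch linker script. */
#define __sched		__attribute__((__section__(".sched.text")))

extern unsigned long __scheduling_functions_start_here;
extern unsigned long __scheduling_functions_end_here;

static int in_sched_functions(unsigned long addr)
{
	return addr >= (unsigned long)&__scheduling_functions_start_here &&
	       addr < (unsigned long)&__scheduling_functions_end_here;
}

/* Any function that can block simply gets the tag: */
void __sched my_wait_for_event(void);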
diff --git a/kernel/signal.c b/kernel/signal.c
index 32992a71683b..c69671600bef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -352,6 +352,7 @@ void __exit_signal(struct task_struct *tsk)
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		exit_itimers(sig);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 		kmem_cache_free(signal_cachep, sig);
@@ -588,7 +589,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	error = -EPERM;
 	if ((!info || ((unsigned long)info != 1 &&
 			(unsigned long)info != 2 && SI_FROMUSER(info)))
-	    && ((sig != SIGCONT) || (current->session != t->session))
+	    && ((sig != SIGCONT) ||
+		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
 	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
 	    && !capable(CAP_KILL))
@@ -1103,7 +1105,7 @@ kill_sl_info(int sig, struct siginfo *info, pid_t sid)
 	retval = -ESRCH;
 	read_lock(&tasklist_lock);
 	for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) {
-		if (!p->leader)
+		if (!p->signal->leader)
 			continue;
 		err = group_send_sig_info(sig, info, p);
 		if (retval)
@@ -2047,6 +2049,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
 		err |= __put_user(from->si_stime, &to->si_stime);
 		break;
 	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
 		err |= __put_user(from->si_pid, &to->si_pid);
 		err |= __put_user(from->si_uid, &to->si_uid);
 		err |= __put_user(from->si_int, &to->si_int);
@@ -2553,4 +2556,3 @@ void __init signals_init(void)
 	if (!sigqueue_cachep)
 		panic("signals_init(): cannot create sigqueue SLAB cache");
 }
-
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 81c79736ff9e..58c915c202ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>

+#include <asm/irq.h>
 /*
    - No shared variables, all the data are CPU local.
    - If a softirq needs serialization, let it serialize itself
@@ -69,53 +70,66 @@ static inline void wakeup_softirqd(void)
  */
 #define MAX_SOFTIRQ_RESTART 10

-asmlinkage void do_softirq(void)
+asmlinkage void __do_softirq(void)
 {
-	int max_restart = MAX_SOFTIRQ_RESTART;
+	struct softirq_action *h;
 	__u32 pending;
-	unsigned long flags;
+	int max_restart = MAX_SOFTIRQ_RESTART;

-	if (in_interrupt())
-		return;
+	pending = local_softirq_pending();

-	local_irq_save(flags);
+	local_bh_disable();
+restart:
+	/* Reset the pending bitmask before enabling irqs */
+	local_softirq_pending() = 0;
+
+	local_irq_enable();
+
+	h = softirq_vec;
+
+	do {
+		if (pending & 1)
+			h->action(h);
+		h++;
+		pending >>= 1;
+	} while (pending);
+
+	local_irq_disable();

 	pending = local_softirq_pending();
+	if (pending && --max_restart)
+		goto restart;

-	if (pending) {
-		struct softirq_action *h;
+	if (pending)
+		wakeup_softirqd();

-		local_bh_disable();
-restart:
-		/* Reset the pending bitmask before enabling irqs */
-		local_softirq_pending() = 0;
+	__local_bh_enable();
+}

-		local_irq_enable();
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;

-		h = softirq_vec;
+	if (in_interrupt())
+		return;

-		do {
-			if (pending & 1)
-				h->action(h);
-			h++;
-			pending >>= 1;
-		} while (pending);
+	local_irq_save(flags);

-		local_irq_disable();
+	pending = local_softirq_pending();

-		pending = local_softirq_pending();
-		if (pending && --max_restart)
-			goto restart;
-		if (pending)
-			wakeup_softirqd();
-		__local_bh_enable();
-	}
+	if (pending)
+		__do_softirq();

 	local_irq_restore(flags);
 }

 EXPORT_SYMBOL(do_softirq);

+#endif
+
 void local_bh_enable(void)
 {
 	__local_bh_enable();
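Splitting __do_softirq() out of do_softirq() lets architectures that define __ARCH_HAS_DO_SOFTIRQ run the core loop on a separate irq stack while keeping the restart logic in one place. That core loop, reduced to standalone control flow; the handler table and pending word are stand-ins for the per-cpu kernel state:

#include <stdint.h>

#define MAX_SOFTIRQ_RESTART 10
#define NR_SOFTIRQS 32

typedef void (*softirq_action_fn)(void);

static softirq_action_fn softirq_vec[NR_SOFTIRQS];
static volatile uint32_t pending_mask;	/* stands in for local_softirq_pending() */

static void run_softirqs(void)
{
	int max_restart = MAX_SOFTIRQ_RESTART;
	uint32_t pending = pending_mask;

restart:
	pending_mask = 0;	/* reset before handlers may raise new work */

	for (int i = 0; pending; i++, pending >>= 1)
		if ((pending & 1) && softirq_vec[i])
			softirq_vec[i]();

	pending = pending_mask;
	if (pending && --max_restart)
		goto restart;
	/* in the kernel, leftover work is handed to ksoftirqd here */
}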
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a2cae39d322c..9610403ce2cf 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -149,10 +149,10 @@ static int do_stop(void *_smdata)
 	complete(&smdata->done);

 	/* Wait for kthread_stop */
-	__set_current_state(TASK_INTERRUPTIBLE);
+	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
 		schedule();
-		__set_current_state(TASK_INTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
 	return ret;
diff --git a/kernel/sys.c b/kernel/sys.c
index 33a14e13079e..4d414d925889 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -260,6 +260,17 @@ cond_syscall(sys_msgctl)
 cond_syscall(sys_shmget)
 cond_syscall(sys_shmdt)
 cond_syscall(sys_shmctl)
+cond_syscall(sys_mq_open)
+cond_syscall(sys_mq_unlink)
+cond_syscall(sys_mq_timedsend)
+cond_syscall(sys_mq_timedreceive)
+cond_syscall(sys_mq_notify)
+cond_syscall(sys_mq_getsetattr)
+cond_syscall(compat_sys_mq_open)
+cond_syscall(compat_sys_mq_timedsend)
+cond_syscall(compat_sys_mq_timedreceive)
+cond_syscall(compat_sys_mq_notify)
+cond_syscall(compat_sys_mq_getsetattr)

 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read)
@@ -436,7 +447,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system.\n");
 		machine_restart(NULL);
@@ -452,7 +463,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user

 	case LINUX_REBOOT_CMD_HALT:
 		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "System halted.\n");
 		machine_halt();
@@ -462,7 +473,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user

 	case LINUX_REBOOT_CMD_POWER_OFF:
 		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Power down.\n");
 		machine_power_off();
@@ -478,7 +489,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		buffer[sizeof(buffer) - 1] = '\0';

 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
 		machine_restart(buffer);
@@ -979,7 +990,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)

 	if (p->parent == current || p->real_parent == current) {
 		err = -EPERM;
-		if (p->session != current->session)
+		if (p->signal->session != current->signal->session)
 			goto out;
 		err = -EACCES;
 		if (p->did_exec)
@@ -991,7 +1002,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 	}

 	err = -EPERM;
-	if (p->leader)
+	if (p->signal->leader)
 		goto out;

 	if (pgid != pid) {
@@ -1000,7 +1011,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 		struct list_head *l;

 		for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid)
-			if (p->session == current->session)
+			if (p->signal->session == current->signal->session)
 				goto ok_pgid;
 		goto out;
 	}
@@ -1012,7 +1023,7 @@ ok_pgid:

 	if (process_group(p) != pgid) {
 		detach_pid(p, PIDTYPE_PGID);
-		p->group_leader->__pgrp = pgid;
+		p->signal->pgrp = pgid;
 		attach_pid(p, PIDTYPE_PGID, pgid);
 	}

@@ -1054,7 +1065,7 @@ asmlinkage long sys_getpgrp(void)
 asmlinkage long sys_getsid(pid_t pid)
 {
 	if (!pid) {
-		return current->session;
+		return current->signal->session;
 	} else {
 		int retval;
 		struct task_struct *p;
@@ -1066,7 +1077,7 @@ asmlinkage long sys_getsid(pid_t pid)
 		if(p) {
 			retval = security_task_getsid(p);
 			if (!retval)
-				retval = p->session;
+				retval = p->signal->session;
 		}
 		read_unlock(&tasklist_lock);
 		return retval;
@@ -1087,10 +1098,10 @@ asmlinkage long sys_setsid(void)
 	if (pid)
 		goto out;

-	current->leader = 1;
+	current->signal->leader = 1;
 	__set_special_pids(current->pid, current->pid);
-	current->tty = NULL;
-	current->tty_old_pgrp = 0;
+	current->signal->tty = NULL;
+	current->signal->tty_old_pgrp = 0;
 	err = process_group(current);
 out:
 	write_unlock_irq(&tasklist_lock);
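sys_setsid() now records session leadership and drops the controlling tty in the shared signal_struct, but the userspace contract is untouched: the classic daemonize sequence behaves exactly as before. For reference (standard POSIX calls only):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/* Classic daemon setup: the child becomes a session leader with no
 * controlling tty, exactly the state sys_setsid() records above in
 * signal->leader and signal->tty. */
int main(void)
{
	pid_t pid = fork();

	if (pid < 0)
		return 1;
	if (pid > 0)
		exit(0);		/* parent exits */

	if (setsid() < 0) {		/* child: new session, no tty */
		perror("setsid");
		return 1;
	}
	printf("new session id: %d\n", (int)getsid(0));
	return 0;
}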
@@ -1521,7 +1532,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw;
 			r.ru_minflt = p->min_flt;
 			r.ru_majflt = p->maj_flt;
-			r.ru_nswap = p->nswap;
 			break;
 		case RUSAGE_CHILDREN:
 			jiffies_to_timeval(p->cutime, &r.ru_utime);
@@ -1530,7 +1540,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->cnivcsw;
 			r.ru_minflt = p->cmin_flt;
 			r.ru_majflt = p->cmaj_flt;
-			r.ru_nswap = p->cnswap;
 			break;
 		default:
 			jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime);
@@ -1539,7 +1548,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw + p->cnivcsw;
 			r.ru_minflt = p->min_flt + p->cmin_flt;
 			r.ru_majflt = p->maj_flt + p->cmaj_flt;
-			r.ru_nswap = p->nswap + p->cnswap;
 			break;
 	}
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5f3123b0522..69e9123cdd0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -710,10 +710,12 @@ static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_HUGETLB_PAGES,
 		.procname	= "nr_hugepages",
-		.data		= &htlbpage_max,
-		.maxlen		= sizeof(int),
+		.data		= &max_huge_pages,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &hugetlb_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
 	},
 #endif
 	{
@@ -722,7 +724,7 @@ static ctl_table vm_table[] = {
 		.data		= &sysctl_lower_zone_protection,
 		.maxlen		= sizeof(sysctl_lower_zone_protection),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &lower_zone_protection_sysctl_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
@@ -744,6 +746,26 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= VM_LAPTOP_MODE,
+		.procname	= "laptop_mode",
+		.data		= &laptop_mode,
+		.maxlen		= sizeof(laptop_mode),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= VM_BLOCK_DUMP,
+		.procname	= "block_dump",
+		.data		= &block_dump,
+		.maxlen		= sizeof(block_dump),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };

diff --git a/kernel/time.c b/kernel/time.c
index 33a6fe086304..142a4bd5771e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -51,10 +51,11 @@ EXPORT_SYMBOL(sys_tz);
 asmlinkage long sys_time(int * tloc)
 {
 	int i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;

-	/* SMP: This is fairly trivial. We grab CURRENT_TIME and
-	   stuff it to user space. No side effects */
-	i = get_seconds();
 	if (tloc) {
 		if (put_user(i,tloc))
 			i = -EFAULT;
diff --git a/kernel/timer.c b/kernel/timer.c
index f53e0749b0d2..cbcb5522866d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data)
 *
 * In all cases the return value is guaranteed to be non-negative.
 */
-fastcall signed long schedule_timeout(signed long timeout)
+fastcall signed long __sched schedule_timeout(signed long timeout)
 {
 	struct timer_list timer;
 	unsigned long expire;
@@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void)
 	return current->pid;
 }

-static long nanosleep_restart(struct restart_block *restart)
+static long __sched nanosleep_restart(struct restart_block *restart)
 {
 	unsigned long expire = restart->arg0, now = jiffies;
 	struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
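schedule_timeout() and nanosleep_restart() joining the __sched section close the loop on the wchan change in kernel/sched.c above. As a reminder of the call contract: the task state must be set before calling, and the return value is the number of jiffies left unslept. A sketch of the long-standing in-kernel idiom, not a hunk from this patch:

/* Sketch: sleep for about two seconds inside the kernel.
 * schedule_timeout() expects the task state to be set first and
 * returns the remaining jiffies (0 after a full, uninterrupted sleep). */
#include <linux/sched.h>
#include <linux/jiffies.h>

static void wait_two_seconds(void)
{
	signed long remaining = 2 * HZ;

	set_current_state(TASK_INTERRUPTIBLE);
	while (remaining > 0) {
		remaining = schedule_timeout(remaining);
		if (signal_pending(current))
			break;			/* woken early by a signal */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
}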
