author    Linus Torvalds <torvalds@ppc970.osdl.org>  2004-04-12 02:21:00 -0700
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2004-04-12 02:21:00 -0700
commit    0d61fc5ea78015def4d2fcf9b598ecfe25210cdd
tree      a45aa0f67c6bbd200dc2a328e560042810307b1b /kernel
parent    f2eb250f07ba4695c2474cb8b6edbf64b5457d65
parent    eb880e5457f8b4a61ff7fd36d47dd14fe51cb030
Merge bk://bk.arm.linux.org.uk/linux-2.6-rmk
into ppc970.osdl.org:/home/torvalds/v2.6/linux
Diffstat (limited to 'kernel')
 kernel/Makefile        |   2
 kernel/acct.c          |   8
 kernel/audit.c         | 825
 kernel/auditsc.c       | 922
 kernel/exit.c          |  24
 kernel/fork.c          |  38
 kernel/kmod.c          |   2
 kernel/module.c        |   4
 kernel/params.c        |   7
 kernel/pid.c           |   8
 kernel/posix-timers.c  |  97
 kernel/power/Kconfig   |   8
 kernel/power/disk.c    |   8
 kernel/power/main.c    |   7
 kernel/power/pmdisk.c  |   5
 kernel/power/process.c |  21
 kernel/power/swsusp.c  | 282
 kernel/printk.c        |   3
 kernel/sched.c         |  44
 kernel/signal.c        |   8
 kernel/softirq.c       |  70
 kernel/stop_machine.c  |   4
 kernel/sys.c           |  40
 kernel/sysctl.c        |  28
 kernel/time.c          |   7
 kernel/timer.c         |   4
 26 files changed, 2232 insertions(+), 244 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 3a6484838748..238c65f60d9e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,8 @@ obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKCONFIG_PROC) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 9dbab88b2d31..555e1e3c349f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -347,7 +347,11 @@ static void do_acct_process(long exitcode, struct file *file)
/* we really need to bite the bullet and change layout */
ac.ac_uid = current->uid;
ac.ac_gid = current->gid;
- ac.ac_tty = current->tty ? old_encode_dev(tty_devnum(current->tty)) : 0;
+
+ read_lock(&tasklist_lock); /* pin current->signal */
+ ac.ac_tty = current->signal->tty ?
+ old_encode_dev(tty_devnum(current->signal->tty)) : 0;
+ read_unlock(&tasklist_lock);
ac.ac_flag = 0;
if (current->flags & PF_FORKNOEXEC)
@@ -376,7 +380,7 @@ static void do_acct_process(long exitcode, struct file *file)
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
ac.ac_minflt = encode_comp_t(current->min_flt);
ac.ac_majflt = encode_comp_t(current->maj_flt);
- ac.ac_swaps = encode_comp_t(current->nswap);
+ ac.ac_swaps = encode_comp_t(0);
ac.ac_exitcode = exitcode;
/*
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..765822b03b91
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,825 @@
+/* audit.c -- Auditing support -*- linux-c -*-
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with SELinux.
+ * 2) Minimal run-time overhead:
+ * a) Minimal when syscall auditing is disabled (audit_enable=0).
+ * b) Small when syscall auditing is enabled and no audit record
+ * is generated (defer as much work as possible to record
+ * generation time):
+ * i) context is allocated,
+ * ii) names from getname are stored without a copy, and
+ * iii) inode information stored from path_lookup.
+ * 3) Ability to disable syscall auditing at boot time (audit=0).
+ * 4) Usable by other parts of the kernel (if audit_log* is called,
+ * then a syscall record will be generated automatically for the
+ * current syscall).
+ * 5) Netlink interface to user-space.
+ * 6) Support low-overhead kernel-based filtering to minimize the
+ * information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/faith/audit/
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+/* No auditing will take place until audit_initialized != 0.
+ * (Initialization happens after skb_init is called.) */
+static int audit_initialized;
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+int audit_enabled;
+
+/* Default state when kernel boots without any parameters. */
+static int audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static int audit_failure = AUDIT_FAIL_PRINTK;
+
+/* If audit records are to be written to the netlink socket, audit_pid
+ * contains the (non-zero) pid. */
+static int audit_pid;
+
+/* If audit_rate_limit is non-zero, limit the rate of sending audit records
+ * to that number per second. This prevents DoS attacks, but results in
+ * audit records being dropped. */
+static int audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed. */
+static int audit_backlog_limit = 64;
+static atomic_t audit_backlog = ATOMIC_INIT(0);
+
+/* Records can be lost in several ways:
+ 0) [suppressed in audit_alloc]
+ 1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+ 2) out of memory in audit_log_move [alloc_skb]
+ 3) suppressed due to audit_rate_limit
+ 4) suppressed due to audit_backlog_limit
+*/
+static atomic_t audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+
+/* There are two lists of audit buffers. The txlist contains audit
+ * buffers that cannot be sent immediately to the netlink device because
+ * we are in an irq context (these are sent later in a tasklet).
+ *
+ * The second list is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static spinlock_t audit_txlist_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t audit_freelist_lock = SPIN_LOCK_UNLOCKED;
+static int audit_freelist_count = 0;
+static LIST_HEAD(audit_txlist);
+static LIST_HEAD(audit_freelist);
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+/* The netlink socket is only to be read by 1 CPU, which lets us assume
+ * that list additions and deletions never happen simultaneously in
+ * auditsc.c */
+static DECLARE_MUTEX(audit_netlink_sem);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records. Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record. The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue. Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+ struct list_head list;
+ struct sk_buff_head sklist; /* formatted skbs ready to send */
+ struct audit_context *ctx; /* NULL or associated context */
+ int len; /* used area of tmp */
+ char tmp[AUDIT_BUFSIZ];
+
+ /* Pointer to header and contents */
+ struct nlmsghdr *nlh;
+ int total;
+ int type;
+ int pid;
+ int count; /* Times requeued */
+};
+
+struct audit_entry {
+ struct list_head list;
+ struct audit_rule rule;
+};
+
+static void audit_panic(const char *message)
+{
+ switch (audit_failure)
+ {
+ case AUDIT_FAIL_SILENT:
+ break;
+ case AUDIT_FAIL_PRINTK:
+ printk(KERN_ERR "audit: %s\n", message);
+ break;
+ case AUDIT_FAIL_PANIC:
+ panic(message);
+ break;
+ }
+}
+
+static inline int audit_rate_check(void)
+{
+ static unsigned long last_check = 0;
+ static int messages = 0;
+ static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+ unsigned long flags;
+ unsigned long now;
+ unsigned long elapsed;
+ int retval = 0;
+
+ if (!audit_rate_limit) return 1;
+
+ spin_lock_irqsave(&lock, flags);
+ if (++messages < audit_rate_limit) {
+ retval = 1;
+ } else {
+ now = jiffies;
+ elapsed = now - last_check;
+ if (elapsed > HZ) {
+ last_check = now;
+ messages = 0;
+ retval = 1;
+ }
+ }
+ spin_unlock_irqrestore(&lock, flags);
+
+ return retval;
+}
+
+/* Emit at least 1 message per second, even if audit_rate_check is
+ * throttling. */
+void audit_log_lost(const char *message)
+{
+ static unsigned long last_msg = 0;
+ static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+ unsigned long flags;
+ unsigned long now;
+ int print;
+
+ atomic_inc(&audit_lost);
+
+ print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+ if (!print) {
+ spin_lock_irqsave(&lock, flags);
+ now = jiffies;
+ if (now - last_msg > HZ) {
+ print = 1;
+ last_msg = now;
+ }
+ spin_unlock_irqrestore(&lock, flags);
+ }
+
+ if (print) {
+ printk(KERN_WARNING
+ "audit: audit_lost=%d audit_backlog=%d"
+ " audit_rate_limit=%d audit_backlog_limit=%d\n",
+ atomic_read(&audit_lost),
+ atomic_read(&audit_backlog),
+ audit_rate_limit,
+ audit_backlog_limit);
+ audit_panic(message);
+ }
+
+}
+
+int audit_set_rate_limit(int limit)
+{
+ int old = audit_rate_limit;
+ audit_rate_limit = limit;
+ audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
+ audit_rate_limit, old);
+ return old;
+}
+
+int audit_set_backlog_limit(int limit)
+{
+ int old = audit_backlog_limit;
+ audit_backlog_limit = limit;
+ audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
+ audit_backlog_limit, old);
+ return old;
+}
+
+int audit_set_enabled(int state)
+{
+ int old = audit_enabled;
+ if (state != 0 && state != 1)
+ return -EINVAL;
+ audit_enabled = state;
+ audit_log(current->audit_context, "audit_enabled=%d old=%d",
+ audit_enabled, old);
+ return old;
+}
+
+int audit_set_failure(int state)
+{
+ int old = audit_failure;
+ if (state != AUDIT_FAIL_SILENT
+ && state != AUDIT_FAIL_PRINTK
+ && state != AUDIT_FAIL_PANIC)
+ return -EINVAL;
+ audit_failure = state;
+ audit_log(current->audit_context, "audit_failure=%d old=%d",
+ audit_failure, old);
+ return old;
+}
+
+#ifdef CONFIG_NET
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+ void *payload, int size)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ int len = NLMSG_SPACE(size);
+ void *data;
+ int flags = multi ? NLM_F_MULTI : 0;
+ int t = done ? NLMSG_DONE : type;
+
+ skb = alloc_skb(len, GFP_KERNEL);
+ if (!skb)
+ goto nlmsg_failure;
+
+ nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+ nlh->nlmsg_flags = flags;
+ data = NLMSG_DATA(nlh);
+ memcpy(data, payload, size);
+ netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+ return;
+
+nlmsg_failure: /* Used by NLMSG_PUT */
+ if (skb)
+ kfree_skb(skb);
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ u32 uid, pid, seq;
+ void *data;
+ struct audit_status *status_get, status_set;
+ struct audit_login *login;
+ int err = 0;
+ struct audit_buffer *ab;
+
+ pid = NETLINK_CREDS(skb)->pid;
+ uid = NETLINK_CREDS(skb)->uid;
+ seq = nlh->nlmsg_seq;
+ data = NLMSG_DATA(nlh);
+
+ switch (nlh->nlmsg_type) {
+ case AUDIT_GET:
+ status_set.enabled = audit_enabled;
+ status_set.failure = audit_failure;
+ status_set.pid = audit_pid;
+ status_set.rate_limit = audit_rate_limit;
+ status_set.backlog_limit = audit_backlog_limit;
+ status_set.lost = atomic_read(&audit_lost);
+ status_set.backlog = atomic_read(&audit_backlog);
+ audit_send_reply(pid, seq, AUDIT_GET, 0, 0,
+ &status_set, sizeof(status_set));
+ break;
+ case AUDIT_SET:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ status_get = (struct audit_status *)data;
+ if (status_get->mask & AUDIT_STATUS_ENABLED) {
+ err = audit_set_enabled(status_get->enabled);
+ if (err < 0) return err;
+ }
+ if (status_get->mask & AUDIT_STATUS_FAILURE) {
+ err = audit_set_failure(status_get->failure);
+ if (err < 0) return err;
+ }
+ if (status_get->mask & AUDIT_STATUS_PID) {
+ int old = audit_pid;
+ audit_pid = status_get->pid;
+ audit_log(current->audit_context,
+ "audit_pid=%d old=%d", audit_pid, old);
+ }
+ if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+ audit_set_rate_limit(status_get->rate_limit);
+ if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+ audit_set_backlog_limit(status_get->backlog_limit);
+ break;
+ case AUDIT_USER:
+ ab = audit_log_start(NULL);
+ if (!ab)
+ break; /* audit_panic has been called */
+ audit_log_format(ab,
+ "user pid=%d uid=%d length=%d msg='%.1024s'",
+ pid, uid,
+ (int)(nlh->nlmsg_len
+ - ((char *)data - (char *)nlh)),
+ (char *)data);
+ ab->type = AUDIT_USER;
+ ab->pid = pid;
+ audit_log_end(ab);
+ break;
+ case AUDIT_LOGIN:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ login = (struct audit_login *)data;
+ ab = audit_log_start(NULL);
+ if (ab) {
+ audit_log_format(ab, "login pid=%d uid=%d loginuid=%d"
+ " length=%d msg='%.1024s'",
+ pid, uid,
+ login->loginuid,
+ login->msglen,
+ login->msg);
+ ab->type = AUDIT_LOGIN;
+ ab->pid = pid;
+ audit_log_end(ab);
+ }
+#ifdef CONFIG_AUDITSYSCALL
+ err = audit_set_loginuid(current->audit_context,
+ login->loginuid);
+#endif
+ break;
+ case AUDIT_LIST:
+ case AUDIT_ADD:
+ case AUDIT_DEL:
+#ifdef CONFIG_AUDITSYSCALL
+ err = audit_receive_filter(nlh->nlmsg_type, pid, uid, seq,
+ data);
+#else
+ err = -EOPNOTSUPP;
+#endif
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err < 0 ? err : 0;
+}
+
+/* Get message from skb (based on rtnetlink_rcv_skb). Each message is
+ * processed by audit_receive_msg. Malformed skbs with wrong length are
+ * discarded silently. */
+static int audit_receive_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr *nlh;
+ u32 rlen;
+
+ while (skb->len >= NLMSG_SPACE(0)) {
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return 0;
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ if ((err = audit_receive_msg(skb, nlh))) {
+ netlink_ack(skb, nlh, -err);
+ } else if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ skb_pull(skb, rlen);
+ }
+ return 0;
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sock *sk, int length)
+{
+ struct sk_buff *skb;
+
+ if (down_trylock(&audit_netlink_sem))
+ return;
+
+ /* FIXME: this must not cause starvation */
+ while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+ if (audit_receive_skb(skb) && skb->len)
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ else
+ kfree_skb(skb);
+ }
+ up(&audit_netlink_sem);
+}
+
+/* Move data from tmp buffer into an skb. This is an extra copy, and
+ * that is unfortunate. However, the copy will only occur when a record
+ * is being written to user space, which is already a high-overhead
+ * operation. (Elimination of the copy is possible, for example, by
+ * writing directly into a pre-allocated skb, at the cost of wasting
+ * memory.) */
+static void audit_log_move(struct audit_buffer *ab)
+{
+ struct sk_buff *skb;
+ char *start;
+ int extra = ab->nlh ? 0 : NLMSG_SPACE(0);
+
+ skb = skb_peek(&ab->sklist);
+ if (!skb || skb_tailroom(skb) <= ab->len + extra) {
+ skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
+ if (!skb) {
+ ab->len = 0; /* Lose information in ab->tmp */
+ audit_log_lost("out of memory in audit_log_move");
+ return;
+ }
+ __skb_queue_tail(&ab->sklist, skb);
+ if (!ab->nlh)
+ ab->nlh = (struct nlmsghdr *)skb_put(skb,
+ NLMSG_SPACE(0));
+ }
+ start = skb_put(skb, ab->len);
+ memcpy(start, ab->tmp, ab->len);
+ ab->len = 0;
+}
+
+/* Iterate over the skbuffs in the audit_buffer, sending their contents
+ * to user space. */
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&ab->sklist))) {
+ int retval = 0;
+
+ if (audit_pid) {
+ if (ab->nlh) {
+ ab->nlh->nlmsg_len = ab->total;
+ ab->nlh->nlmsg_type = ab->type;
+ ab->nlh->nlmsg_flags = 0;
+ ab->nlh->nlmsg_seq = 0;
+ ab->nlh->nlmsg_pid = ab->pid;
+ }
+ skb_get(skb); /* because netlink_* frees */
+ retval = netlink_unicast(audit_sock, skb, audit_pid,
+ MSG_DONTWAIT);
+ }
+ if (retval == -EAGAIN && ab->count < 5) {
+ ++ab->count;
+ audit_log_end_irq(ab);
+ return 1;
+ }
+ if (retval < 0) {
+ if (retval == -ECONNREFUSED) {
+ printk(KERN_ERR
+ "audit: *NO* daemon at audit_pid=%d\n",
+ audit_pid);
+ audit_pid = 0;
+ } else
+ audit_log_lost("netlink socket too busy");
+ }
+ if (!audit_pid) { /* No daemon */
+ int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
+ int len = skb->len - offset;
+ printk(KERN_ERR "%*.*s\n",
+ len, len, skb->data + offset);
+ }
+ kfree_skb(skb);
+ ab->nlh = NULL;
+ }
+ return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+ printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+ if (!audit_sock)
+ audit_panic("cannot initialize netlink socket");
+
+ audit_initialized = 1;
+ audit_enabled = audit_default;
+ audit_log(NULL, "initialized");
+ return 0;
+}
+
+#else
+/* Without CONFIG_NET, we have no skbuffs. For now, print what we have
+ * in the buffer. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+ printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
+ ab->len = 0;
+}
+
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+ return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+ printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
+ audit_sock = NULL;
+ audit_pid = 0;
+
+ audit_initialized = 1;
+ audit_enabled = audit_default;
+ audit_log(NULL, "initialized");
+ return 0;
+}
+#endif
+
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+ audit_default = !!simple_strtol(str, NULL, 0);
+ printk(KERN_INFO "audit: %s%s\n",
+ audit_default ? "enabled" : "disabled",
+ audit_initialized ? "" : " (after initialization)");
+ if (audit_initialized)
+ audit_enabled = audit_default;
+ return 0;
+}
+
+__setup("audit=", audit_enable);
+
+
+/* Obtain an audit buffer. This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the tsk is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, tsk
+ * should be NULL. */
+struct audit_buffer *audit_log_start(struct audit_context *ctx)
+{
+ struct audit_buffer *ab = NULL;
+ unsigned long flags;
+ struct timespec t;
+ int serial = 0;
+
+ if (!audit_initialized)
+ return NULL;
+
+ if (audit_backlog_limit
+ && atomic_read(&audit_backlog) > audit_backlog_limit) {
+ if (audit_rate_check())
+ printk(KERN_WARNING
+ "audit: audit_backlog=%d > "
+ "audit_backlog_limit=%d\n",
+ atomic_read(&audit_backlog),
+ audit_backlog_limit);
+ audit_log_lost("backlog limit exceeded");
+ return NULL;
+ }
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (!list_empty(&audit_freelist)) {
+ ab = list_entry(audit_freelist.next,
+ struct audit_buffer, list);
+ list_del(&ab->list);
+ --audit_freelist_count;
+ }
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+ if (!ab)
+ ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+ if (!ab)
+ audit_log_lost("audit: out of memory in audit_log_start");
+ if (!ab)
+ return NULL;
+
+ atomic_inc(&audit_backlog);
+ skb_queue_head_init(&ab->sklist);
+
+ ab->ctx = ctx;
+ ab->len = 0;
+ ab->nlh = NULL;
+ ab->total = 0;
+ ab->type = AUDIT_KERNEL;
+ ab->pid = 0;
+ ab->count = 0;
+
+#ifdef CONFIG_AUDITSYSCALL
+ if (ab->ctx)
+ audit_get_stamp(ab->ctx, &t, &serial);
+ else
+#endif
+ t = CURRENT_TIME;
+
+ audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+ t.tv_sec, t.tv_nsec/1000000, serial);
+ return ab;
+}
+
+
+/* Format an audit message into the audit buffer. If there isn't enough
+ * room in the audit buffer, more room will be allocated and vsnprint
+ * will be called a second time. Currently, we assume that a printk
+ * can't format a message larger than 1024 bytes, so we don't either. */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+ va_list args)
+{
+ int len, avail;
+
+ if (!ab)
+ return;
+
+ avail = sizeof(ab->tmp) - ab->len;
+ if (avail <= 0) {
+ audit_log_move(ab);
+ avail = sizeof(ab->tmp) - ab->len;
+ }
+ len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+ if (len >= avail) {
+ /* The printk buffer is 1024 bytes long, so if we get
+ * here and AUDIT_BUFSIZ is at least 1024, then we can
+ * log everything that printk could have logged. */
+ audit_log_move(ab);
+ avail = sizeof(ab->tmp) - ab->len;
+ len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+ }
+ ab->len += (len < avail) ? len : avail;
+ ab->total += (len < avail) ? len : avail;
+}
+
+/* Format a message into the audit buffer. All the work is done in
+ * audit_log_vformat. */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+ va_list args;
+
+ if (!ab)
+ return;
+ va_start(args, fmt);
+ audit_log_vformat(ab, fmt, args);
+ va_end(args);
+}
+
+/* This is a helper function to print the d_path without using a static
+ * buffer or allocating another buffer in addition to the one in
+ * audit_buffer. */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+ struct dentry *dentry, struct vfsmount *vfsmnt)
+{
+ char *p;
+ int len, avail;
+
+ if (prefix) audit_log_format(ab, " %s", prefix);
+
+ if (ab->len > 128)
+ audit_log_move(ab);
+ avail = sizeof(ab->tmp) - ab->len;
+ p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
+ if (p == ERR_PTR(-ENAMETOOLONG)) {
+ /* FIXME: can we save some information here? */
+ audit_log_format(ab, "<toolong>");
+ } else {
+ /* path isn't at start of buffer */
+ len = (ab->tmp + sizeof(ab->tmp) - 1) - p;
+ memmove(ab->tmp + ab->len, p, len);
+ ab->len += len;
+ ab->total += len;
+ }
+}
+
+/* Remove queued messages from the audit_txlist and send them to userspace. */
+static void audit_tasklet_handler(unsigned long arg)
+{
+ LIST_HEAD(list);
+ struct audit_buffer *ab;
+ unsigned long flags;
+
+ spin_lock_irqsave(&audit_txlist_lock, flags);
+ list_splice_init(&audit_txlist, &list);
+ spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+ while (!list_empty(&list)) {
+ ab = list_entry(list.next, struct audit_buffer, list);
+ list_del(&ab->list);
+ audit_log_end_fast(ab);
+ }
+}
+
+static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
+
+/* The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is placed on a queue and a tasklet is scheduled to
+ * remove it from the queue outside the irq context.  May be called in
+ * any context. */
+void audit_log_end_irq(struct audit_buffer *ab)
+{
+ unsigned long flags;
+
+ if (!ab)
+ return;
+ spin_lock_irqsave(&audit_txlist_lock, flags);
+ list_add_tail(&ab->list, &audit_txlist);
+ spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+ tasklet_schedule(&audit_tasklet);
+}
+
+/* Send the message in the audit buffer directly to user space. May not
+ * be called in an irq context. */
+void audit_log_end_fast(struct audit_buffer *ab)
+{
+ unsigned long flags;
+
+ BUG_ON(in_irq());
+ if (!ab)
+ return;
+ if (!audit_rate_check()) {
+ audit_log_lost("rate limit exceeded");
+ } else {
+ audit_log_move(ab);
+ if (audit_log_drain(ab))
+ return;
+ }
+
+ atomic_dec(&audit_backlog);
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (++audit_freelist_count > AUDIT_MAXFREE)
+ kfree(ab);
+ else
+ list_add(&ab->list, &audit_freelist);
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+/* Send or queue the message in the audit buffer, depending on the
+ * current context. (A convenience function that may be called in any
+ * context.) */
+void audit_log_end(struct audit_buffer *ab)
+{
+ if (in_irq())
+ audit_log_end_irq(ab);
+ else
+ audit_log_end_fast(ab);
+}
+
+/* Log an audit record. This is a convenience function that calls
+ * audit_log_start, audit_log_vformat, and audit_log_end. It may be
+ * called in any context. */
+void audit_log(struct audit_context *ctx, const char *fmt, ...)
+{
+ struct audit_buffer *ab;
+ va_list args;
+
+ ab = audit_log_start(ctx);
+ if (ab) {
+ va_start(args, fmt);
+ audit_log_vformat(ab, fmt, args);
+ va_end(args);
+ audit_log_end(ab);
+ }
+}
+
+EXPORT_SYMBOL_GPL(audit_set_rate_limit);
+EXPORT_SYMBOL_GPL(audit_set_backlog_limit);
+EXPORT_SYMBOL_GPL(audit_set_enabled);
+EXPORT_SYMBOL_GPL(audit_set_failure);
+
+EXPORT_SYMBOL_GPL(audit_log_start);
+EXPORT_SYMBOL_GPL(audit_log_format);
+EXPORT_SYMBOL_GPL(audit_log_end_irq);
+EXPORT_SYMBOL_GPL(audit_log_end_fast);
+EXPORT_SYMBOL_GPL(audit_log_end);
+EXPORT_SYMBOL_GPL(audit_log);
+EXPORT_SYMBOL_GPL(audit_log_d_path);
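
For orientation (not part of the patch itself), a minimal sketch of how other
kernel code is expected to use the logging interface exported above; the
caller and the message fields are hypothetical.  A NULL return from
audit_log_start() means the record was dropped (backlog limit or allocation
failure) and audit_log_lost() has already accounted for it:

	/* Hypothetical caller: emit one audit record for the current task. */
	static void example_log_event(int result)
	{
		struct audit_buffer *ab;

		ab = audit_log_start(current->audit_context);
		if (!ab)
			return;		/* record lost, already counted */
		audit_log_format(ab, "example res=%d", result);
		audit_log_end(ab);	/* sends now, or via tasklet from irq */
	}

The audit_log() convenience wrapper collapses the three calls into one, and
passing a NULL context (as the AUDIT_USER case above does) logs a record that
is not tied to any syscall.
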
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000000..342b57141fd9
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,922 @@
+/* auditsc.c -- System-call auditing support -*- linux-c -*-
+ * Handles all system-call specific auditing features.
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Many of the ideas implemented here are from Stephen C. Tweedie,
+ * especially the idea of avoiding a copy by using getname.
+ *
+ * The method for actual interception of syscall entry and exit (not in
+ * this file -- see entry.S) is based on a GPL'd patch written by
+ * okir@suse.de and Copyright 2003 SuSE Linux AG.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+#include <linux/personality.h>
+#include <linux/time.h>
+#include <asm/unistd.h>
+
+/* 0 = no checking
+ 1 = put_count checking
+ 2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+extern int audit_enabled;
+
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). */
+#define AUDIT_NAMES 20
+
+/* AUDIT_NAMES_RESERVED is the number of slots in the audit_context
+ * we keep free for recording nameless inodes from path_lookup. */
+
+/* At task start time, the audit_state is set in the audit_context using
+ a per-task filter. At syscall entry, the audit_state is augmented by
+ the syscall filter. */
+enum audit_state {
+ AUDIT_DISABLED, /* Do not create per-task audit_context.
+ * No syscall-specific audit records can
+ * be generated. */
+ AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
+ * but don't necessarily fill it in at
+ * syscall entry time (i.e., filter
+ * instead). */
+ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ * and always fill it in at syscall
+ * entry time. This makes a full
+ * syscall record available if some
+ * other part of the kernel decides it
+ * should be recorded. */
+ AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ * always fill it in at syscall entry
+ * time, and always write out the audit
+ * record at syscall exit time. */
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+struct audit_names {
+ const char *name;
+ unsigned long ino;
+ dev_t rdev;
+};
+
+/* The per-task audit context. */
+struct audit_context {
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state;
+ unsigned int serial; /* serial number for record */
+ struct timespec ctime; /* time of syscall entry */
+ uid_t loginuid; /* login uid (identity) */
+ int major; /* syscall number */
+ unsigned long argv[4]; /* syscall arguments */
+ int return_valid; /* return code is valid */
+ int return_code;/* syscall return code */
+ int auditable; /* 1 if record should be written */
+ int name_count;
+ struct audit_names names[AUDIT_NAMES];
+ struct audit_context *previous; /* For nested syscalls */
+
+ /* Save things to print about task_struct */
+ pid_t pid;
+ uid_t uid, euid, suid, fsuid;
+ gid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+
+#if AUDIT_DEBUG
+ int put_count;
+ int ino_count;
+#endif
+};
+
+ /* Public API */
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+struct audit_entry {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct audit_rule rule;
+};
+
+/* Check to see if two rules are identical. It is called from
+ * audit_del_rule during AUDIT_DEL. */
+static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+{
+ int i;
+
+ if (a->flags != b->flags)
+ return 1;
+
+ if (a->action != b->action)
+ return 1;
+
+ if (a->field_count != b->field_count)
+ return 1;
+
+ for (i = 0; i < a->field_count; i++) {
+ if (a->fields[i] != b->fields[i]
+ || a->values[i] != b->values[i])
+ return 1;
+ }
+
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ if (a->mask[i] != b->mask[i])
+ return 1;
+
+ return 0;
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_add_rule(struct audit_entry *entry,
+ struct list_head *list)
+{
+ if (entry->rule.flags & AUDIT_PREPEND) {
+ entry->rule.flags &= ~AUDIT_PREPEND;
+ list_add_rcu(&entry->list, list);
+ } else {
+ list_add_tail_rcu(&entry->list, list);
+ }
+ return 0;
+}
+
+static void audit_free_rule(void *arg)
+{
+ kfree(arg);
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_del_rule(struct audit_rule *rule,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+
+ /* Do not use the _rcu iterator here, since this is the only
+ * deletion routine. */
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(rule, &e->rule)) {
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule, e);
+ return 0;
+ }
+ }
+ return -EFAULT; /* No matching rule */
+}
+
+#ifdef CONFIG_NET
+/* Copy rule from user-space to kernel-space. Called during
+ * AUDIT_ADD. */
+static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+ int i;
+
+ if (s->action != AUDIT_NEVER
+ && s->action != AUDIT_POSSIBLE
+ && s->action != AUDIT_ALWAYS)
+ return -1;
+ if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+ return -1;
+
+ d->flags = s->flags;
+ d->action = s->action;
+ d->field_count = s->field_count;
+ for (i = 0; i < d->field_count; i++) {
+ d->fields[i] = s->fields[i];
+ d->values[i] = s->values[i];
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+ return 0;
+}
+
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
+{
+ u32 flags;
+ struct audit_entry *entry;
+ int err = 0;
+
+ switch (type) {
+ case AUDIT_LIST:
+		/* The *_rcu iterators are not needed here because we are
+		   always called with audit_netlink_sem held. */
+ list_for_each_entry(entry, &audit_tsklist, list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
+ list_for_each_entry(entry, &audit_entlist, list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
+ list_for_each_entry(entry, &audit_extlist, list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
+ audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+ break;
+ case AUDIT_ADD:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+ return -ENOMEM;
+ if (audit_copy_rule(&entry->rule, data)) {
+ kfree(entry);
+ return -EINVAL;
+ }
+ flags = entry->rule.flags;
+ if (!err && (flags & AUDIT_PER_TASK))
+ err = audit_add_rule(entry, &audit_tsklist);
+ if (!err && (flags & AUDIT_AT_ENTRY))
+ err = audit_add_rule(entry, &audit_entlist);
+ if (!err && (flags & AUDIT_AT_EXIT))
+ err = audit_add_rule(entry, &audit_extlist);
+ break;
+ case AUDIT_DEL:
+		flags = ((struct audit_rule *)data)->flags;
+ if (!err && (flags & AUDIT_PER_TASK))
+ err = audit_del_rule(data, &audit_tsklist);
+ if (!err && (flags & AUDIT_AT_ENTRY))
+ err = audit_del_rule(data, &audit_entlist);
+ if (!err && (flags & AUDIT_AT_EXIT))
+ err = audit_del_rule(data, &audit_extlist);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return err;
+}
+#endif
+
+/* Compare a task_struct with an audit_rule. Return 1 on match, 0
+ * otherwise. */
+static int audit_filter_rules(struct task_struct *tsk,
+ struct audit_rule *rule,
+ struct audit_context *ctx,
+ enum audit_state *state)
+{
+ int i, j;
+
+ for (i = 0; i < rule->field_count; i++) {
+ u32 field = rule->fields[i] & ~AUDIT_NEGATE;
+ u32 value = rule->values[i];
+ int result = 0;
+
+ switch (field) {
+ case AUDIT_PID:
+ result = (tsk->pid == value);
+ break;
+ case AUDIT_UID:
+ result = (tsk->uid == value);
+ break;
+ case AUDIT_EUID:
+ result = (tsk->euid == value);
+ break;
+ case AUDIT_SUID:
+ result = (tsk->suid == value);
+ break;
+ case AUDIT_FSUID:
+ result = (tsk->fsuid == value);
+ break;
+ case AUDIT_GID:
+ result = (tsk->gid == value);
+ break;
+ case AUDIT_EGID:
+ result = (tsk->egid == value);
+ break;
+ case AUDIT_SGID:
+ result = (tsk->sgid == value);
+ break;
+ case AUDIT_FSGID:
+ result = (tsk->fsgid == value);
+ break;
+ case AUDIT_PERS:
+ result = (tsk->personality == value);
+ break;
+
+ case AUDIT_EXIT:
+ if (ctx && ctx->return_valid)
+ result = (ctx->return_code == value);
+ break;
+ case AUDIT_SUCCESS:
+ if (ctx && ctx->return_valid)
+ result = (ctx->return_code >= 0);
+ break;
+ case AUDIT_DEVMAJOR:
+ if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (MAJOR(ctx->names[j].rdev)==value) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_DEVMINOR:
+ if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+ if (MINOR(ctx->names[j].rdev)==value) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_INODE:
+ if (ctx) {
+ for (j = 0; j < ctx->name_count; j++) {
+					if (ctx->names[j].ino == value) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_LOGINUID:
+ result = 0;
+ if (ctx)
+ result = (ctx->loginuid == value);
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ if (ctx)
+ result = (ctx->argv[field-AUDIT_ARG0]==value);
+ break;
+ }
+
+ if (rule->fields[i] & AUDIT_NEGATE)
+ result = !result;
+ if (!result)
+ return 0;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+/* At process creation time, we can determine if system-call auditing is
+ * completely disabled for this task. Since we only have the task
+ * structure at this point, we can only check uid and gid.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_tsklist, list) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall entry and exit time, this filter is called if the
+ * audit_state is not low enough that auditing cannot take place, but is
+ * also not high enough that we already know we have to write an audit
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit
+ && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+/* This should be called with task_lock() held. */
+static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+ int return_valid,
+ int return_code)
+{
+ struct audit_context *context = tsk->audit_context;
+
+ if (likely(!context))
+ return NULL;
+ context->return_valid = return_valid;
+ context->return_code = return_code;
+
+ if (context->in_syscall && !context->auditable) {
+ enum audit_state state;
+ state = audit_filter_syscall(tsk, context, &audit_extlist);
+ if (state == AUDIT_RECORD_CONTEXT)
+ context->auditable = 1;
+ }
+
+ context->pid = tsk->pid;
+ context->uid = tsk->uid;
+ context->gid = tsk->gid;
+ context->euid = tsk->euid;
+ context->suid = tsk->suid;
+ context->fsuid = tsk->fsuid;
+ context->egid = tsk->egid;
+ context->sgid = tsk->sgid;
+ context->fsgid = tsk->fsgid;
+ context->personality = tsk->personality;
+ tsk->audit_context = NULL;
+ return context;
+}
+
+static inline void audit_free_names(struct audit_context *context)
+{
+ int i;
+
+#if AUDIT_DEBUG == 2
+ if (context->auditable
+ ||context->put_count + context->ino_count != context->name_count) {
+ printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+ " name_count=%d put_count=%d"
+ " ino_count=%d [NOT freeing]\n",
+ __LINE__,
+ context->serial, context->major, context->in_syscall,
+ context->name_count, context->put_count,
+ context->ino_count);
+ for (i = 0; i < context->name_count; i++)
+ printk(KERN_ERR "names[%d] = %p = %s\n", i,
+ context->names[i].name,
+ context->names[i].name);
+ dump_stack();
+ return;
+ }
+#endif
+#if AUDIT_DEBUG
+ context->put_count = 0;
+ context->ino_count = 0;
+#endif
+
+ for (i = 0; i < context->name_count; i++)
+ if (context->names[i].name)
+ __putname(context->names[i].name);
+ context->name_count = 0;
+}
+
+static inline void audit_zero_context(struct audit_context *context,
+ enum audit_state state)
+{
+ uid_t loginuid = context->loginuid;
+
+ memset(context, 0, sizeof(*context));
+ context->state = state;
+ context->loginuid = loginuid;
+}
+
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+ struct audit_context *context;
+
+ if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ return NULL;
+ audit_zero_context(context, state);
+ return context;
+}
+
+/* Filter on the task information and allocate a per-task audit context
+ * if necessary. Doing so turns on system call auditing for the
+ * specified task. This is called from copy_process, so no lock is
+ * needed. */
+int audit_alloc(struct task_struct *tsk)
+{
+ struct audit_context *context;
+ enum audit_state state;
+
+ if (likely(!audit_enabled))
+ return 0; /* Return if not auditing. */
+
+ state = audit_filter_task(tsk);
+ if (likely(state == AUDIT_DISABLED))
+ return 0;
+
+ if (!(context = audit_alloc_context(state))) {
+ audit_log_lost("out of memory in audit_alloc");
+ return -ENOMEM;
+ }
+
+ /* Preserve login uid */
+ context->loginuid = -1;
+ if (tsk->audit_context)
+ context->loginuid = tsk->audit_context->loginuid;
+
+ tsk->audit_context = context;
+ set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ return 0;
+}
+
+static inline void audit_free_context(struct audit_context *context)
+{
+ struct audit_context *previous;
+ int count = 0;
+
+ do {
+ previous = context->previous;
+ if (previous || (count && count < 10)) {
+ ++count;
+ printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
+ " freeing multiple contexts (%d)\n",
+ context->serial, context->major,
+ context->name_count, count);
+ }
+ audit_free_names(context);
+ kfree(context);
+ context = previous;
+ } while (context);
+ if (count >= 10)
+ printk(KERN_ERR "audit: freed %d contexts\n", count);
+}
+
+static void audit_log_exit(struct audit_context *context)
+{
+ int i;
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context);
+ if (!ab)
+ return; /* audit_panic has been called */
+ audit_log_format(ab, "syscall=%d", context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid)
+ audit_log_format(ab, " exit=%u", context->return_code);
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
+ " pid=%d loginuid=%d uid=%d gid=%d"
+ " euid=%d suid=%d fsuid=%d"
+ " egid=%d sgid=%d fsgid=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count,
+ context->pid,
+ context->loginuid,
+ context->uid,
+ context->gid,
+ context->euid, context->suid, context->fsuid,
+ context->egid, context->sgid, context->fsgid);
+ audit_log_end(ab);
+ for (i = 0; i < context->name_count; i++) {
+ ab = audit_log_start(context);
+ if (!ab)
+ continue; /* audit_panic has been called */
+ audit_log_format(ab, "item=%d", i);
+ if (context->names[i].name)
+ audit_log_format(ab, " name=%s",
+ context->names[i].name);
+ if (context->names[i].ino != (unsigned long)-1)
+ audit_log_format(ab, " inode=%lu",
+ context->names[i].ino);
+ /* FIXME: should use format_dev_t, but ab structure is
+ * opaque. */
+ if (context->names[i].rdev != -1)
+ audit_log_format(ab, " dev=%02x:%02x",
+ MAJOR(context->names[i].rdev),
+ MINOR(context->names[i].rdev));
+ audit_log_end(ab);
+ }
+}
+
+/* Free a per-task audit context. Called from copy_process and
+ * __put_task_struct. */
+void audit_free(struct task_struct *tsk)
+{
+ struct audit_context *context;
+
+ task_lock(tsk);
+ context = audit_get_context(tsk, 0, 0);
+ task_unlock(tsk);
+
+ if (likely(!context))
+ return;
+
+ /* Check for system calls that do not go through the exit
+ * function (e.g., exit_group), then free context block. */
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context);
+
+ audit_free_context(context);
+}
+
+/* Compute a serial number for the audit record. Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces. The timestamp of the
+ * record and this serial number are used by the user-space daemon to
+ * determine which pieces belong to the same audit record. The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit. However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+static inline unsigned int audit_serial(void)
+{
+ static atomic_t serial = ATOMIC_INIT(0xffffff);
+ unsigned int a, b;
+
+ do {
+ a = atomic_read(&serial);
+ if (atomic_dec_and_test(&serial))
+ atomic_set(&serial, 0xffffff);
+ b = atomic_read(&serial);
+ } while (b != a - 1);
+
+ return 0xffffff - b;
+}
+
+/* Fill in audit context at syscall entry. This only happens if the
+ * audit context was created when the task was created and the state or
+ * filters demand the audit context be built. If the state from the
+ * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * then the record will be written at syscall exit time (otherwise, it
+ * will only be written if another part of the kernel requests that it
+ * be written). */
+void audit_syscall_entry(struct task_struct *tsk, int major,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4)
+{
+ struct audit_context *context = tsk->audit_context;
+ enum audit_state state;
+
+ BUG_ON(!context);
+
+ /* This happens only on certain architectures that make system
+ * calls in kernel_thread via the entry.S interface, instead of
+ * with direct calls. (If you are porting to a new
+ * architecture, hitting this condition can indicate that you
+ * got the _exit/_leave calls backward in entry.S.)
+ *
+ * i386 no
+ * x86_64 no
+ * ppc64 yes (see arch/ppc64/kernel/misc.S)
+ *
+ * This also happens with vm86 emulation in a non-nested manner
+ * (entries without exits), so this case must be caught.
+ */
+ if (context->in_syscall) {
+ struct audit_context *newctx;
+
+#if defined(__NR_vm86) && defined(__NR_vm86old)
+ /* vm86 mode should only be entered once */
+ if (major == __NR_vm86 || major == __NR_vm86old)
+ return;
+#endif
+#if AUDIT_DEBUG
+ printk(KERN_ERR
+ "audit(:%d) pid=%d in syscall=%d;"
+ " entering syscall=%d\n",
+ context->serial, tsk->pid, context->major, major);
+#endif
+ newctx = audit_alloc_context(context->state);
+ if (newctx) {
+ newctx->previous = context;
+ context = newctx;
+ tsk->audit_context = newctx;
+ } else {
+ /* If we can't alloc a new context, the best we
+ * can do is to leak memory (any pending putname
+ * will be lost). The only other alternative is
+ * to abandon auditing. */
+ audit_zero_context(context, context->state);
+ }
+ }
+ BUG_ON(context->in_syscall || context->name_count);
+
+ if (!audit_enabled)
+ return;
+
+ context->major = major;
+ context->argv[0] = a1;
+ context->argv[1] = a2;
+ context->argv[2] = a3;
+ context->argv[3] = a4;
+
+ state = context->state;
+ if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+ state = audit_filter_syscall(tsk, context, &audit_entlist);
+ if (likely(state == AUDIT_DISABLED))
+ return;
+
+ context->serial = audit_serial();
+ context->ctime = CURRENT_TIME;
+ context->in_syscall = 1;
+ context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
+}
+
+/* Tear down after system call. If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel wrote an audit
+ * message), then write out the syscall information.  In all cases,
+ * free the names stored from getname(). */
+void audit_syscall_exit(struct task_struct *tsk, int return_code)
+{
+ struct audit_context *context;
+
+ get_task_struct(tsk);
+ task_lock(tsk);
+ context = audit_get_context(tsk, 1, return_code);
+ task_unlock(tsk);
+
+ /* Not having a context here is ok, since the parent may have
+ * called __put_task_struct. */
+ if (likely(!context))
+ return;
+
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context);
+
+ context->in_syscall = 0;
+ context->auditable = 0;
+ if (context->previous) {
+ struct audit_context *new_context = context->previous;
+ context->previous = NULL;
+ audit_free_context(context);
+ tsk->audit_context = new_context;
+ } else {
+ audit_free_names(context);
+ audit_zero_context(context, context->state);
+ tsk->audit_context = context;
+ }
+ put_task_struct(tsk);
+}
+
+/* Add a name to the list. Called from fs/namei.c:getname(). */
+void audit_getname(const char *name)
+{
+ struct audit_context *context = current->audit_context;
+
+ BUG_ON(!context);
+ if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+ printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+ __FILE__, __LINE__, context->serial, name);
+ dump_stack();
+#endif
+ return;
+ }
+ BUG_ON(context->name_count >= AUDIT_NAMES);
+ context->names[context->name_count].name = name;
+ context->names[context->name_count].ino = (unsigned long)-1;
+ context->names[context->name_count].rdev = -1;
+ ++context->name_count;
+}
+
+/* Intercept a putname request. Called from
+ * include/linux/fs.h:putname(). If we have stored the name from
+ * getname in the audit context, then we delay the putname until syscall
+ * exit. */
+void audit_putname(const char *name)
+{
+ struct audit_context *context = current->audit_context;
+
+ BUG_ON(!context);
+ if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+ printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+ __FILE__, __LINE__, context->serial, name);
+ if (context->name_count) {
+ int i;
+ for (i = 0; i < context->name_count; i++)
+ printk(KERN_ERR "name[%d] = %p = %s\n", i,
+ context->names[i].name,
+ context->names[i].name);
+ }
+#endif
+ __putname(name);
+ }
+#if AUDIT_DEBUG
+ else {
+ ++context->put_count;
+ if (context->put_count > context->name_count) {
+ printk(KERN_ERR "%s:%d(:%d): major=%d"
+ " in_syscall=%d putname(%p) name_count=%d"
+ " put_count=%d\n",
+ __FILE__, __LINE__,
+ context->serial, context->major,
+ context->in_syscall, name, context->name_count,
+ context->put_count);
+ dump_stack();
+ }
+ }
+#endif
+}
+
+/* Store the inode and device from a lookup. Called from
+ * fs/namei.c:path_lookup(). */
+void audit_inode(const char *name, unsigned long ino, dev_t rdev)
+{
+ int idx;
+ struct audit_context *context = current->audit_context;
+
+ if (!context->in_syscall)
+ return;
+ if (context->name_count
+ && context->names[context->name_count-1].name
+ && context->names[context->name_count-1].name == name)
+ idx = context->name_count - 1;
+ else if (context->name_count > 1
+ && context->names[context->name_count-2].name
+ && context->names[context->name_count-2].name == name)
+ idx = context->name_count - 2;
+ else {
+ /* FIXME: how much do we care about inodes that have no
+ * associated name? */
+ if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED)
+ return;
+ idx = context->name_count++;
+ context->names[idx].name = NULL;
+#if AUDIT_DEBUG
+ ++context->ino_count;
+#endif
+ }
+ context->names[idx].ino = ino;
+ context->names[idx].rdev = rdev;
+}
+
+void audit_get_stamp(struct audit_context *ctx,
+ struct timespec *t, int *serial)
+{
+ if (ctx) {
+ t->tv_sec = ctx->ctime.tv_sec;
+ t->tv_nsec = ctx->ctime.tv_nsec;
+ *serial = ctx->serial;
+ ctx->auditable = 1;
+ } else {
+ *t = CURRENT_TIME;
+ *serial = 0;
+ }
+}
+
+int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid)
+{
+ if (ctx) {
+		if ((int)loginuid < 0)
+ return -EINVAL;
+ ctx->loginuid = loginuid;
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(audit_alloc);
+EXPORT_SYMBOL_GPL(audit_free);
+EXPORT_SYMBOL_GPL(audit_syscall_entry);
+EXPORT_SYMBOL_GPL(audit_syscall_exit);
+EXPORT_SYMBOL_GPL(audit_getname);
+EXPORT_SYMBOL_GPL(audit_putname);
+EXPORT_SYMBOL_GPL(audit_inode);
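
For illustration (again, not part of the patch), a hedged sketch of a
userspace program installing a filter rule through the netlink interface that
audit_receive_filter() above services.  It assumes the struct audit_rule
layout and the AUDIT_* constants this patch adds to <linux/audit.h>; the uid
value is arbitrary, error handling is omitted, and CAP_SYS_ADMIN is required:

	#include <linux/audit.h>
	#include <linux/netlink.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		struct {
			struct nlmsghdr   nlh;
			struct audit_rule rule;
		} req;
		struct sockaddr_nl addr;
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

		memset(&req, 0, sizeof(req));
		req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.rule));
		req.nlh.nlmsg_type  = AUDIT_ADD;
		req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		req.rule.flags       = AUDIT_AT_EXIT;	/* goes on audit_extlist */
		req.rule.action      = AUDIT_ALWAYS;	/* AUDIT_RECORD_CONTEXT */
		req.rule.field_count = 1;
		req.rule.fields[0]   = AUDIT_UID;	/* match tasks with... */
		req.rule.values[0]   = 500;		/* ...uid == 500 */
		memset(req.rule.mask, 0xff, sizeof(req.rule.mask)); /* any syscall */

		memset(&addr, 0, sizeof(addr));
		addr.nl_family = AF_NETLINK;		/* nl_pid 0 = the kernel */
		sendto(fd, &req, req.nlh.nlmsg_len, 0,
		       (struct sockaddr *)&addr, sizeof(addr));
		close(fd);
		return 0;
	}

audit_copy_rule() rejects unknown actions and oversized field counts with
-EINVAL, and because NLM_F_ACK is set, audit_receive_skb() answers with a
netlink ACK carrying the result.
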
diff --git a/kernel/exit.c b/kernel/exit.c
index 308f6959add6..0ec66729ead8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -92,7 +92,6 @@ repeat:
p->parent->cstime += p->stime + p->cstime;
p->parent->cmin_flt += p->min_flt + p->cmin_flt;
p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
- p->parent->cnswap += p->nswap + p->cnswap;
p->parent->cnvcsw += p->nvcsw + p->cnvcsw;
p->parent->cnivcsw += p->nivcsw + p->cnivcsw;
sched_exit(p);
@@ -136,13 +135,13 @@ int session_of_pgrp(int pgrp)
read_lock(&tasklist_lock);
for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
- if (p->session > 0) {
- sid = p->session;
+ if (p->signal->session > 0) {
+ sid = p->signal->session;
goto out;
}
p = find_task_by_pid(pgrp);
if (p)
- sid = p->session;
+ sid = p->signal->session;
out:
read_unlock(&tasklist_lock);
@@ -170,7 +169,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
|| p->real_parent->pid == 1)
continue;
if (process_group(p->real_parent) != pgrp
- && p->real_parent->session == p->session) {
+ && p->real_parent->signal->session == p->signal->session) {
ret = 0;
break;
}
@@ -259,14 +258,14 @@ void __set_special_pids(pid_t session, pid_t pgrp)
{
struct task_struct *curr = current;
- if (curr->session != session) {
+ if (curr->signal->session != session) {
detach_pid(curr, PIDTYPE_SID);
- curr->session = session;
+ curr->signal->session = session;
attach_pid(curr, PIDTYPE_SID, session);
}
if (process_group(curr) != pgrp) {
detach_pid(curr, PIDTYPE_PGID);
- curr->group_leader->__pgrp = pgrp;
+ curr->signal->pgrp = pgrp;
attach_pid(curr, PIDTYPE_PGID, pgrp);
}
}
@@ -341,7 +340,7 @@ void daemonize(const char *name, ...)
exit_mm(current);
set_special_pids(1, 1);
- current->tty = NULL;
+ current->signal->tty = NULL;
/* Block and flush all signals */
sigfillset(&blocked);
@@ -564,7 +563,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
* outside, so the child pgrp is now orphaned.
*/
if ((process_group(p) != process_group(father)) &&
- (p->session == father->session)) {
+ (p->signal->session == father->signal->session)) {
int pgrp = process_group(p);
if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
@@ -675,7 +674,7 @@ static void exit_notify(struct task_struct *tsk)
t = tsk->real_parent;
if ((process_group(t) != process_group(tsk)) &&
- (t->session == tsk->session) &&
+ (t->signal->session == tsk->signal->session) &&
will_become_orphaned_pgrp(process_group(tsk), tsk) &&
has_stopped_jobs(process_group(tsk))) {
__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
@@ -777,10 +776,9 @@ asmlinkage NORET_TYPE void do_exit(long code)
__exit_files(tsk);
__exit_fs(tsk);
exit_namespace(tsk);
- exit_itimers(tsk);
exit_thread();
- if (tsk->leader)
+ if (tsk->signal->leader)
disassociate_ctty(1);
module_put(tsk->thread_info->exec_domain->module);
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b17a249c50d..4f5b018777d8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -21,6 +21,7 @@
#include <linux/completion.h>
#include <linux/namespace.h>
#include <linux/personality.h>
+#include <linux/sem.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
@@ -31,6 +32,7 @@
#include <linux/futex.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
+#include <linux/audit.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -39,9 +41,6 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
-extern void exit_sem(struct task_struct *tsk);
-
/* The idle threads do not count..
* Protected by write_lock_irq(&tasklist_lock)
*/
@@ -85,6 +84,8 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ if (unlikely(tsk->audit_context))
+ audit_free(tsk);
security_task_free(tsk);
free_uid(tsk->user);
put_group_info(tsk->group_info);
@@ -209,11 +210,14 @@ EXPORT_SYMBOL(autoremove_wake_function);
void __init fork_init(unsigned long mempages)
{
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN 0
+#endif
/* create a slab on which task_structs can be allocated */
task_struct_cachep =
kmem_cache_create("task_struct",
- sizeof(struct task_struct),0,
- SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+ sizeof(struct task_struct),ARCH_MIN_TASKALIGN,
+ 0, NULL, NULL);
if (!task_struct_cachep)
panic("fork_init(): cannot create task_struct SLAB cache");
#endif
@@ -322,7 +326,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
/* insert tmp into the share list, just after mpnt */
down(&file->f_mapping->i_shared_sem);
- list_add_tail(&tmp->shared, &mpnt->shared);
+ list_add(&tmp->shared, &mpnt->shared);
up(&file->f_mapping->i_shared_sem);
}
@@ -512,7 +516,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
tsk->min_flt = tsk->maj_flt = 0;
tsk->cmin_flt = tsk->cmaj_flt = 0;
- tsk->nswap = tsk->cnswap = 0;
tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0;
tsk->mm = NULL;
@@ -812,6 +815,13 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
sig->group_stop_count = 0;
sig->curr_target = NULL;
init_sigpending(&sig->shared_pending);
+ INIT_LIST_HEAD(&sig->posix_timers);
+
+ sig->tty = current->signal->tty;
+ sig->pgrp = process_group(current);
+ sig->session = current->signal->session;
+ sig->leader = 0; /* session leadership doesn't inherit */
+ sig->tty_old_pgrp = 0;
return 0;
}
@@ -923,7 +933,6 @@ struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
- INIT_LIST_HEAD(&p->posix_timers);
init_waitqueue_head(&p->wait_chldexit);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
@@ -937,21 +946,22 @@ struct task_struct *copy_process(unsigned long clone_flags,
init_timer(&p->real_timer);
p->real_timer.data = (unsigned long) p;
- p->leader = 0; /* session leadership doesn't inherit */
- p->tty_old_pgrp = 0;
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->lock_depth = -1; /* -1 = no lock */
p->start_time = get_jiffies_64();
p->security = NULL;
p->io_context = NULL;
+ p->audit_context = NULL;
retval = -ENOMEM;
if ((retval = security_task_alloc(p)))
goto bad_fork_cleanup;
+ if ((retval = audit_alloc(p)))
+ goto bad_fork_cleanup_security;
/* copy all the process information */
if ((retval = copy_semundo(clone_flags, p)))
- goto bad_fork_cleanup_security;
+ goto bad_fork_cleanup_audit;
if ((retval = copy_files(clone_flags, p)))
goto bad_fork_cleanup_semundo;
if ((retval = copy_fs(clone_flags, p)))
@@ -1057,7 +1067,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
if (thread_group_leader(p)) {
attach_pid(p, PIDTYPE_TGID, p->tgid);
attach_pid(p, PIDTYPE_PGID, process_group(p));
- attach_pid(p, PIDTYPE_SID, p->session);
+ attach_pid(p, PIDTYPE_SID, p->signal->session);
if (p->pid)
__get_cpu_var(process_counts)++;
} else
@@ -1076,6 +1086,8 @@ bad_fork_cleanup_namespace:
exit_namespace(p);
bad_fork_cleanup_mm:
exit_mm(p);
+ if (p->active_mm)
+ mmdrop(p->active_mm);
bad_fork_cleanup_signal:
exit_signal(p);
bad_fork_cleanup_sighand:
@@ -1086,6 +1098,8 @@ bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
+bad_fork_cleanup_audit:
+ audit_free(p);
bad_fork_cleanup_security:
security_task_free(p);
bad_fork_cleanup:
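The new bad_fork_cleanup_audit label follows the copy_process() unwinding convention: each setup step that fails jumps to the label that undoes everything allocated before it, so inserting audit_alloc() means inserting both a call site and a matching label between its neighbours. A condensed, hypothetical illustration of the pattern (not the literal kernel function):

/* Hypothetical condensation of the copy_process() error unwinding. */
static int setup_task(struct task_struct *p, unsigned long clone_flags)
{
	int retval;

	if ((retval = security_task_alloc(p)))
		goto fork_out;
	if ((retval = audit_alloc(p)))		/* newly inserted step...   */
		goto cleanup_security;
	if ((retval = copy_semundo(clone_flags, p)))
		goto cleanup_audit;		/* ...with a matching label */
	return 0;

cleanup_audit:
	audit_free(p);
cleanup_security:
	security_task_free(p);
fork_out:
	return retval;
}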
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5261de82029b..0002fcd4c554 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
};
DECLARE_WORK(work, __call_usermodehelper, &sub_info);
- if (!system_running)
+ if (system_state != SYSTEM_RUNNING)
return -EBUSY;
if (path[0] == '\0')
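This one-liner, like the printk.c, sched.c and sys.c hunks further down, retires the boolean system_running in favour of a system_state enum, so callers can tell "not booted yet" apart from "shutting down". A sketch of the shape, assuming roughly the enum this era's linux/kernel.h introduces:

/* Sketch, roughly as in include/linux/kernel.h of this period. */
enum system_states {
	SYSTEM_BOOTING,
	SYSTEM_RUNNING,
	SYSTEM_SHUTDOWN,
};
extern enum system_states system_state;

	/* callers now test for one specific phase: */
	if (system_state != SYSTEM_RUNNING)
		return -EBUSY;	/* refuse both early boot and shutdown */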
diff --git a/kernel/module.c b/kernel/module.c
index 16587e133b1b..a472deef9bdf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -493,7 +493,6 @@ static inline int __try_stop_module(void *_sref)
}
/* Mark it as dying. */
- sref->mod->waiter = current;
sref->mod->state = MODULE_STATE_GOING;
return 0;
}
@@ -588,6 +587,9 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
}
}
+ /* Set this up before setting mod->state */
+ mod->waiter = current;
+
/* Stop the machine so refcounts can't move and disable module. */
ret = try_stop_module(mod, flags, &forced);
diff --git a/kernel/params.c b/kernel/params.c
index 4d9a71b743c5..59667bce9ce0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -96,6 +96,13 @@ static char *next_arg(char *args, char **param, char **val)
else {
args[equals] = '\0';
*val = args + equals + 1;
+
+ /* Don't include quotes in value. */
+ if (**val == '"') {
+ (*val)++;
+ if (args[i-1] == '"')
+ args[i-1] = '\0';
+ }
}
if (args[i]) {
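The added lines trim one pair of double quotes around a parameter value in place, so foo="bar baz" parses to the value bar baz. The same operation as a standalone, hedged re-creation in plain C (not the kernel's next_arg()):

#include <string.h>

/* Strip one pair of surrounding double quotes, in place. */
static char *unquote(char *val)
{
	size_t len = strlen(val);

	if (len >= 2 && val[0] == '"' && val[len - 1] == '"') {
		val[len - 1] = '\0';
		val++;
	}
	return val;
}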
diff --git a/kernel/pid.c b/kernel/pid.c
index 4c85144759c5..6ed44f56ca45 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -253,14 +253,14 @@ void switch_exec_pids(task_t *leader, task_t *thread)
attach_pid(thread, PIDTYPE_PID, thread->pid);
attach_pid(thread, PIDTYPE_TGID, thread->tgid);
- attach_pid(thread, PIDTYPE_PGID, leader->__pgrp);
- attach_pid(thread, PIDTYPE_SID, thread->session);
+ attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+ attach_pid(thread, PIDTYPE_SID, thread->signal->session);
list_add_tail(&thread->tasks, &init_task.tasks);
attach_pid(leader, PIDTYPE_PID, leader->pid);
attach_pid(leader, PIDTYPE_TGID, leader->tgid);
- attach_pid(leader, PIDTYPE_PGID, leader->__pgrp);
- attach_pid(leader, PIDTYPE_SID, leader->session);
+ attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
+ attach_pid(leader, PIDTYPE_SID, leader->signal->session);
}
/*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 082693e383cf..3de4d0ae9d26 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -317,12 +317,21 @@ static void timer_notify_task(struct k_itimer *timr)
if (timr->it_incr)
timr->sigq->info.si_sys_private = ++timr->it_requeue_pending;
- if (timr->it_sigev_notify & SIGEV_THREAD_ID )
+ if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+ if (unlikely(timr->it_process->flags & PF_EXITING)) {
+ timr->it_sigev_notify = SIGEV_SIGNAL;
+ put_task_struct(timr->it_process);
+ timr->it_process = timr->it_process->group_leader;
+ goto group;
+ }
ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
timr->it_process);
- else
+ }
+ else {
+ group:
ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
timr->it_process);
+ }
if (ret) {
/*
* signal was not sent because of sig_ignored()
@@ -352,7 +361,7 @@ static void posix_timer_fn(unsigned long __data)
static inline struct task_struct * good_sigevent(sigevent_t * event)
{
- struct task_struct *rtn = current;
+ struct task_struct *rtn = current->group_leader;
if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
(!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
@@ -395,11 +404,15 @@ static struct k_itimer * alloc_posix_timer(void)
static void release_posix_timer(struct k_itimer *tmr)
{
if (tmr->it_id != -1) {
- spin_lock_irq(&idr_lock);
+ unsigned long flags;
+ spin_lock_irqsave(&idr_lock, flags);
idr_remove(&posix_timers_id, tmr->it_id);
- spin_unlock_irq(&idr_lock);
+ spin_unlock_irqrestore(&idr_lock, flags);
}
sigqueue_free(tmr->sigq);
+ if (unlikely(tmr->it_process) &&
+ tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ put_task_struct(tmr->it_process);
kmem_cache_free(posix_timers_cache, tmr);
}
@@ -414,6 +427,7 @@ sys_timer_create(clockid_t which_clock,
struct k_itimer *new_timer = NULL;
timer_t new_timer_id;
struct task_struct *process = 0;
+ unsigned long flags;
sigevent_t event;
if ((unsigned) which_clock >= MAX_CLOCKS ||
@@ -458,7 +472,7 @@ sys_timer_create(clockid_t which_clock,
* We may be setting up this process for another
* thread. It may be exiting. To catch this
* case we check the PF_EXITING flag. If
- * the flag is not set, the task_lock will catch
+ * the flag is not set, the siglock will catch
* him before it is too late (in exit_itimers).
*
* The exec case is a bit more involved but easy
@@ -469,13 +483,14 @@ sys_timer_create(clockid_t which_clock,
* for us to die which means we can finish this
* linkage with our last gasp. I.e. no code :)
*/
- task_lock(process);
+ spin_lock_irqsave(&process->sighand->siglock, flags);
if (!(process->flags & PF_EXITING)) {
list_add(&new_timer->list,
- &process->posix_timers);
- task_unlock(process);
+ &process->signal->posix_timers);
+ spin_unlock_irqrestore(&process->sighand->siglock, flags);
+ get_task_struct(process);
} else {
- task_unlock(process);
+ spin_unlock_irqrestore(&process->sighand->siglock, flags);
process = 0;
}
}
@@ -491,10 +506,10 @@ sys_timer_create(clockid_t which_clock,
new_timer->it_sigev_notify = SIGEV_SIGNAL;
new_timer->it_sigev_signo = SIGALRM;
new_timer->it_sigev_value.sival_int = new_timer->it_id;
- process = current;
- task_lock(process);
- list_add(&new_timer->list, &process->posix_timers);
- task_unlock(process);
+ process = current->group_leader;
+ spin_lock_irqsave(&process->sighand->siglock, flags);
+ list_add(&new_timer->list, &process->signal->posix_timers);
+ spin_unlock_irqrestore(&process->sighand->siglock, flags);
}
new_timer->it_clock = which_clock;
@@ -925,14 +940,18 @@ retry_delete:
#else
p_timer_del(&posix_clocks[timer->it_clock], timer);
#endif
- task_lock(timer->it_process);
+ spin_lock(&current->sighand->siglock);
list_del(&timer->list);
- task_unlock(timer->it_process);
+ spin_unlock(&current->sighand->siglock);
/*
* This keeps any tasks waiting on the spin lock from thinking
* they got something (see the lock code above).
*/
+ if (timer->it_process) {
+ if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ put_task_struct(timer->it_process);
timer->it_process = NULL;
+ }
unlock_timer(timer, flags);
release_posix_timer(timer);
return 0;
@@ -942,24 +961,50 @@ retry_delete:
*/
static inline void itimer_delete(struct k_itimer *timer)
{
- if (sys_timer_delete(timer->it_id))
- BUG();
+ unsigned long flags;
+
+#ifdef CONFIG_SMP
+ int error;
+retry_delete:
+#endif
+ spin_lock_irqsave(&timer->it_lock, flags);
+
+#ifdef CONFIG_SMP
+ error = p_timer_del(&posix_clocks[timer->it_clock], timer);
+
+ if (error == TIMER_RETRY) {
+ unlock_timer(timer, flags);
+ goto retry_delete;
+ }
+#else
+ p_timer_del(&posix_clocks[timer->it_clock], timer);
+#endif
+ list_del(&timer->list);
+ /*
+ * This keeps any tasks waiting on the spin lock from thinking
+ * they got something (see the lock code above).
+ */
+ if (timer->it_process) {
+ if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+ put_task_struct(timer->it_process);
+ timer->it_process = NULL;
+ }
+ unlock_timer(timer, flags);
+ release_posix_timer(timer);
}
+
/*
- * This is exported to exit and exec
+ * This is called by __exit_signal, only when there are no more
+ * references to the shared signal_struct.
*/
-void exit_itimers(struct task_struct *tsk)
+void exit_itimers(struct signal_struct *sig)
{
struct k_itimer *tmr;
- task_lock(tsk);
- while (!list_empty(&tsk->posix_timers)) {
- tmr = list_entry(tsk->posix_timers.next, struct k_itimer, list);
- task_unlock(tsk);
+ while (!list_empty(&sig->posix_timers)) {
+ tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
itimer_delete(tmr);
- task_lock(tsk);
}
- task_unlock(tsk);
}
/*
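Taken together, the posix-timers hunks establish two invariants: the timer list hangs off the shared signal_struct and is protected by sighand->siglock, and a timer holds a task_struct reference only in the (SIGEV_SIGNAL|SIGEV_THREAD_ID) case. A condensed sketch of the attach side under those assumptions (shapes follow the hunks above, not the literal function):

	/* Sketch: attach a new timer to its target process. */
	spin_lock_irqsave(&process->sighand->siglock, flags);
	if (!(process->flags & PF_EXITING)) {
		list_add(&new_timer->list, &process->signal->posix_timers);
		spin_unlock_irqrestore(&process->sighand->siglock, flags);
		get_task_struct(process);	/* timer now pins the task */
	} else {
		spin_unlock_irqrestore(&process->sighand->siglock, flags);
		process = NULL;			/* lost the race with exit */
	}

Testing PF_EXITING under siglock is what makes the race with exit_itimers() safe: as the signal.c hunk below shows, the exiting side holds the same lock while it tears the list down.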
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 033eea403f26..6bb62269f3eb 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -9,9 +9,9 @@ config PM
Power Management is most important for battery powered laptop
computers; if you have a laptop, check out the Linux Laptop home
- page on the WWW at
- <http://www.cs.utexas.edu/users/kharker/linux-laptop/> and the
- Battery Powered Linux mini-HOWTO, available from
+ page on the WWW at <http://www.linux-on-laptops.com/> or
+ Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
+ and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
Note that, even if you say N here, Linux on the x86 architecture
@@ -44,7 +44,7 @@ config SOFTWARE_SUSPEND
config PM_DISK
bool "Suspend-to-Disk Support"
- depends on PM && SWAP
+ depends on PM && SWAP && X86 && !X86_64
---help---
Suspend-to-disk is a power management state in which the contents
of memory are stored on disk and the entire system is shut down or
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 2d4cf319b8e1..6abcf99b7ada 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -84,7 +84,6 @@ static void free_some_memory(void)
while (shrink_all_memory(10000))
printk(".");
printk("|\n");
- blk_run_queues();
}
@@ -285,11 +284,16 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
{
int error = 0;
int i;
+ int len;
+ char *p;
u32 mode = 0;
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
down(&pm_sem);
for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
- if (!strcmp(buf,pm_disk_modes[i])) {
+ if (!strncmp(buf, pm_disk_modes[i], len)) {
mode = i;
break;
}
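disk_store() previously required the written string to match a mode name exactly, so the newline appended by a command like `echo disk > /sys/power/disk` defeated every comparison; the memchr()/strncmp() pair compares only up to the newline. The same trick as a standalone, hedged illustration (note that, like the hunk, it also accepts a strict prefix of a mode name):

#include <stddef.h>
#include <string.h>

/* Compare a possibly newline-terminated buffer against a mode name. */
static int mode_matches(const char *buf, size_t n, const char *mode)
{
	const char *p = memchr(buf, '\n', n);
	size_t len = p ? (size_t)(p - buf) : n;

	return strncmp(buf, mode, len) == 0;
}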
diff --git a/kernel/power/main.c b/kernel/power/main.c
index fd212e7ecd9f..d582906fecc6 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -218,10 +218,15 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
{
u32 state = PM_SUSPEND_STANDBY;
char ** s;
+ char *p;
int error;
+ int len;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
for (s = &pm_states[state]; *s; s++, state++) {
- if (!strcmp(buf,*s))
+ if (!strncmp(buf, *s, len))
break;
}
if (*s)
diff --git a/kernel/power/pmdisk.c b/kernel/power/pmdisk.c
index 1dc29b53a25e..22855abbdd6e 100644
--- a/kernel/power/pmdisk.c
+++ b/kernel/power/pmdisk.c
@@ -35,7 +35,7 @@
#include "power.h"
-extern int pmdisk_arch_suspend(int resume);
+extern asmlinkage int pmdisk_arch_suspend(int resume);
#define __ADDRESS(x) ((unsigned long) phys_to_virt(x))
#define ADDRESS(x) __ADDRESS((x) << PAGE_SHIFT)
@@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsigned int num, int err)
static void wait_io(void)
{
- blk_run_queues();
while(atomic_read(&io_done))
io_schedule();
}
@@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_off, void * page)
if (rw == WRITE)
bio_set_pages_dirty(bio);
start_io();
- submit_bio(rw,bio);
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
wait_io();
Done:
bio_put(bio);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 15c1b340c2ed..8225457183ed 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,7 +30,8 @@ static inline int freezeable(struct task_struct * p)
if ((p == current) ||
(p->flags & PF_IOTHREAD) ||
(p->state == TASK_ZOMBIE) ||
- (p->state == TASK_DEAD))
+ (p->state == TASK_DEAD) ||
+ (p->state == TASK_STOPPED))
return 0;
return 1;
}
@@ -38,21 +39,19 @@ static inline int freezeable(struct task_struct * p)
/* The refrigerator is the place where frozen processes are stored :-). */
void refrigerator(unsigned long flag)
{
- /* You need correct to work with real-time processes.
- OTOH, this way one process may see (via /proc/) some other
- process in stopped state (and thereby discovered we were
- suspended. We probably do not care.
- */
+ /* Hmm, should we be allowed to suspend when there are realtime
+ processes around? */
long save;
save = current->state;
- current->state = TASK_STOPPED;
+ current->state = TASK_UNINTERRUPTIBLE;
pr_debug("%s entered refrigerator\n", current->comm);
printk("=");
current->flags &= ~PF_FREEZE;
- if (flag)
- flush_signals(current); /* We have signaled a kernel thread, which isn't normal behaviour
- and that may lead to 100%CPU sucking because those threads
- just don't manage signals. */
+
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* We sent fake signal, clean it up */
+ spin_unlock_irq(&current->sighand->siglock);
+
current->flags |= PF_FROZEN;
while (current->flags & PF_FROZEN)
schedule();
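The refrigerator now parks tasks in TASK_UNINTERRUPTIBLE and relies purely on flag handshaking: the freezer sets PF_FREEZE and sends a fake signal, the target swaps PF_FREEZE for PF_FROZEN and sleeps, and the thaw path clears PF_FROZEN and wakes it. Schematically, with the thaw side written as a hedged sketch rather than literal code from this file:

	/* target task, inside refrigerator(), as in the hunk above: */
	current->state = TASK_UNINTERRUPTIBLE;
	current->flags &= ~PF_FREEZE;		/* acknowledge the request */
	current->flags |= PF_FROZEN;
	while (current->flags & PF_FROZEN)	/* parked until thawed */
		schedule();

	/* thaw side (sketch): clear the flag, then make it runnable */
	p->flags &= ~PF_FROZEN;
	wake_up_process(p);

Clearing the fake signal under siglock with recalc_sigpending() matters because kernel threads never dequeue signals; without it they would keep TIF_SIGPENDING set after thawing and burn CPU.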
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 20134ab8e0b2..8f78d6807576 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,11 @@
/*
- * linux/kernel/suspend.c
+ * linux/kernel/power/swsusp.c
*
* This file implements the architecture-independent
* machine suspend feature using almost exclusively high-level routines
*
* Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
*
* This file is released under the GPLv2.
*
@@ -61,6 +61,7 @@
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/console.h>
+#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -74,11 +75,6 @@ unsigned char software_suspend_enabled = 0;
#define NORESUME 1
#define RESUME_SPECIFIED 2
-
-#define __ADDRESS(x) ((unsigned long) phys_to_virt(x))
-#define ADDRESS(x) __ADDRESS((x) << PAGE_SHIFT)
-#define ADDRESS2(x) __ADDRESS(__pa(x)) /* Needed for x86-64 where some pages are in memory twice */
-
/* References to section boundaries */
extern char __nosave_begin, __nosave_end;
@@ -105,6 +101,10 @@ unsigned int nr_copy_pages __nosavedata = 0;
time of suspend, that must be freed. Second is "pagedir_nosave",
allocated at time of resume, that travels through memory not to
collide with anything.
+
+ Warning: this is even more evil than it seems. Pagedirs this file
+ talks about are completely different from page directories used by
+ MMU hardware.
*/
suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
static suspend_pagedir_t *pagedir_save;
@@ -139,15 +139,15 @@ static const char name_resume[] = "Resume Machine: ";
#define TEST_SWSUSP 0 /* Set to 1 to reboot instead of halt machine after suspension */
#ifdef DEBUG_DEFAULT
-# define PRINTK(f, a...) printk(f, ## a)
+# define PRINTK(f, a...) printk(f, ## a)
#else
-# define PRINTK(f, a...)
+# define PRINTK(f, a...) do { } while(0)
#endif
#ifdef DEBUG_SLOW
#define MDELAY(a) mdelay(a)
#else
-#define MDELAY(a)
+#define MDELAY(a) do { } while(0)
#endif
/*
@@ -225,6 +225,7 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
static void read_swapfiles(void) /* This is called before saving image */
{
int i, len;
+ static char buff[sizeof(resume_file)], *sname;
len=strlen(resume_file);
root_swap = 0xFFFF;
@@ -243,8 +244,11 @@ static void read_swapfiles(void) /* This is called before saving image */
swapfile_used[i] = SWAPFILE_IGNORED;
} else {
/* we ignore all swap devices that are not the resume_file */
- if (1) {
-// FIXME if(resume_device == swap_info[i].swap_device) {
+ sname = d_path(swap_info[i].swap_file->f_dentry,
+ swap_info[i].swap_file->f_vfsmnt,
+ buff,
+ sizeof(buff));
+ if (!strcmp(sname, resume_file)) {
swapfile_used[i] = SWAPFILE_SUSPEND;
root_swap = i;
} else {
@@ -346,7 +350,7 @@ static int write_suspend_image(void)
cur = (void *) buffer;
if (fill_suspend_header(&cur->sh))
- panic("\nOut of memory while writing header");
+ BUG(); /* Not a BUG_ON(): we want fill_suspend_header to be called, always */
cur->link.next = prev;
@@ -362,73 +366,174 @@ static int write_suspend_image(void)
return 0;
}
-/* if pagedir_p != NULL it also copies the counted pages */
-static int count_and_copy_data_pages(struct pbe *pagedir_p)
-{
- int chunk_size;
- int nr_copy_pages = 0;
- int pfn;
+#ifdef CONFIG_HIGHMEM
+struct highmem_page {
+ char *data;
struct page *page;
-
-#ifdef CONFIG_DISCONTIGMEM
- panic("Discontingmem not supported");
-#else
- BUG_ON (max_pfn != num_physpages);
-#endif
- for (pfn = 0; pfn < max_pfn; pfn++) {
+ struct highmem_page *next;
+};
+
+struct highmem_page *highmem_copy = NULL;
+
+static int save_highmem_zone(struct zone *zone)
+{
+ unsigned long zone_pfn;
+ for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+ struct page *page;
+ struct highmem_page *save;
+ void *kaddr;
+ unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+ int chunk_size;
+
+ if (!(pfn%1000))
+ printk(".");
+ if (!pfn_valid(pfn))
+ continue;
page = pfn_to_page(pfn);
- if (PageHighMem(page))
- panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
+ /*
+ * This condition results from rvmalloc() sans vmalloc_32()
+ * and architectural memory reservations. This should be
+ * corrected eventually when the cases giving rise to this
+ * are better understood.
+ */
+ if (PageReserved(page)) {
+ printk("highmem reserved page?!\n");
+ continue;
+ }
+ if ((chunk_size = is_head_of_free_region(page))) {
+ pfn += chunk_size - 1;
+ zone_pfn += chunk_size - 1;
+ continue;
+ }
+ save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+ if (!save)
+ return -ENOMEM;
+ save->next = highmem_copy;
+ save->page = page;
+ save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+ if (!save->data) {
+ kfree(save);
+ return -ENOMEM;
+ }
+ kaddr = kmap_atomic(page, KM_USER0);
+ memcpy(save->data, kaddr, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ highmem_copy = save;
+ }
+ return 0;
+}
- if (!PageReserved(page)) {
- if (PageNosave(page))
- continue;
+static int save_highmem(void)
+{
+ struct zone *zone;
+ int res = 0;
+ for_each_zone(zone) {
+ if (is_highmem(zone))
+ res = save_highmem_zone(zone);
+ if (res)
+ return res;
+ }
+ return 0;
+}
- if ((chunk_size=is_head_of_free_region(page))!=0) {
- pfn += chunk_size - 1;
- continue;
- }
- } else if (PageReserved(page)) {
- BUG_ON (PageNosave(page));
+static int restore_highmem(void)
+{
+ while (highmem_copy) {
+ struct highmem_page *save = highmem_copy;
+ void *kaddr;
+ highmem_copy = save->next;
+
+ kaddr = kmap_atomic(save->page, KM_USER0);
+ memcpy(kaddr, save->data, PAGE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+ free_page((long) save->data);
+ kfree(save);
+ }
+ return 0;
+}
+#endif
- /*
- * Just copy whole code segment. Hopefully it is not that big.
- */
- if ((ADDRESS(pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) &&
- (ADDRESS(pfn) < (unsigned long) ADDRESS2(&__nosave_end))) {
- PRINTK("[nosave %lx]", ADDRESS(pfn));
- continue;
- }
- /* Hmm, perhaps copying all reserved pages is not too healthy as they may contain
- critical bios data? */
- } else BUG();
+static int pfn_is_nosave(unsigned long pfn)
+{
+ unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+ unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+ return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
- nr_copy_pages++;
- if (pagedir_p) {
- pagedir_p->orig_address = ADDRESS(pfn);
- copy_page((void *) pagedir_p->address, (void *) pagedir_p->orig_address);
- pagedir_p++;
+/* if *pagedir_p != NULL it also copies the counted pages */
+static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
+{
+ unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
+ struct pbe *pbe = *pagedir_p;
+ for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+ struct page *page;
+ unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+ if (!(pfn%1000))
+ printk(".");
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ BUG_ON(PageReserved(page) && PageNosave(page));
+ if (PageNosave(page))
+ continue;
+ if (PageReserved(page) && pfn_is_nosave(pfn)) {
+ PRINTK("[nosave pfn 0x%lx]", pfn);
+ continue;
+ }
+ if ((chunk_size = is_head_of_free_region(page))) {
+ pfn += chunk_size - 1;
+ zone_pfn += chunk_size - 1;
+ continue;
}
+ nr_copy_pages++;
+ if (!pbe)
+ continue;
+ pbe->orig_address = (long) page_address(page);
+ copy_page((void *)pbe->address, (void *)pbe->orig_address);
+ pbe++;
}
+ *pagedir_p = pbe;
return nr_copy_pages;
}
-static void free_suspend_pagedir(unsigned long this_pagedir)
+static int count_and_copy_data_pages(struct pbe *pagedir_p)
{
- struct page *page;
- int pfn;
- unsigned long this_pagedir_end = this_pagedir +
- (PAGE_SIZE << pagedir_order);
+ int nr_copy_pages = 0;
+ struct zone *zone;
+ for_each_zone(zone) {
+ if (!is_highmem(zone))
+ nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
+ }
+ return nr_copy_pages;
+}
- for(pfn = 0; pfn < num_physpages; pfn++) {
+static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
+{
+ unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
+ pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
+ pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
+ pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
+ for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+ struct page *page;
+ unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+ if (!pfn_valid(pfn))
+ continue;
page = pfn_to_page(pfn);
if (!TestClearPageNosave(page))
continue;
+ else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
+ continue;
+ __free_page(page);
+ }
+}
- if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
- continue; /* old pagedir gets freed in one */
-
- free_page(ADDRESS(pfn));
+static void free_suspend_pagedir(unsigned long this_pagedir)
+{
+ struct zone *zone;
+ for_each_zone(zone) {
+ if (!is_highmem(zone))
+ free_suspend_pagedir_zone(zone, this_pagedir);
}
free_pages(this_pagedir, pagedir_order);
}
@@ -443,7 +548,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
- if(!pagedir)
+ if (!pagedir)
return NULL;
page = virt_to_page(pagedir);
@@ -452,7 +557,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
while(nr_copy_pages--) {
p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
- if(!p->address) {
+ if (!p->address) {
free_suspend_pagedir((unsigned long) pagedir);
return NULL;
}
@@ -492,10 +597,19 @@ static int suspend_prepare_image(void)
struct sysinfo i;
unsigned int nr_needed_pages = 0;
- drain_local_pages();
-
pagedir_nosave = NULL;
- printk( "/critical section: Counting pages to copy" );
+ printk( "/critical section: ");
+#ifdef CONFIG_HIGHMEM
+ printk( "handling highmem" );
+ if (save_highmem()) {
+ printk(KERN_CRIT "%sNot enough free pages for highmem\n", name_suspend);
+ return -ENOMEM;
+ }
+ printk(", ");
+#endif
+
+ printk("counting pages to copy" );
+ drain_local_pages();
nr_copy_pages = count_and_copy_data_pages(NULL);
nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
@@ -504,23 +618,22 @@ static int suspend_prepare_image(void)
printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
name_suspend, nr_needed_pages-nr_free_pages());
root_swap = 0xFFFF;
- return 1;
+ return -ENOMEM;
}
si_swapinfo(&i); /* FIXME: si_swapinfo(&i) returns all swap devices information.
We should only consider resume_device. */
if (i.freeswap < nr_needed_pages) {
printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
name_suspend, nr_needed_pages-i.freeswap);
- return 1;
+ return -ENOSPC;
}
PRINTK( "Alloc pagedir\n" );
pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
- if(!pagedir_nosave) {
- /* Shouldn't happen */
- printk(KERN_CRIT "%sCouldn't allocate enough pages\n",name_suspend);
- panic("Really should not happen");
- return 1;
+ if (!pagedir_nosave) {
+ /* Pagedir is big, one-chunk allocation. It is easily possible for this allocation to fail */
+ printk(KERN_CRIT "%sCouldn't allocate continuous pagedir\n", name_suspend);
+ return -ENOMEM;
}
nr_copy_pages_check = nr_copy_pages;
pagedir_order_check = pagedir_order;
@@ -603,21 +716,25 @@ asmlinkage void do_magic_resume_2(void)
PRINTK( "Freeing prev allocated pagedir\n" );
free_suspend_pagedir((unsigned long) pagedir_save);
+
+#ifdef CONFIG_HIGHMEM
+ printk( "Restoring highmem\n" );
+ restore_highmem();
+#endif
+ printk("done, devices\n");
+
device_power_up();
spin_unlock_irq(&suspend_pagedir_lock);
device_resume();
- acquire_console_sem();
- update_screen(fg_console); /* Hmm, is this the problem? */
- release_console_sem();
-
+ /* Fixme: this is too late; we should do this ASAP to avoid the "infinite reboots" problem */
PRINTK( "Fixing swap signatures... " );
mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
PRINTK( "ok\n" );
#ifdef SUSPEND_CONSOLE
acquire_console_sem();
- update_screen(fg_console); /* Hmm, is this the problem? */
+ update_screen(fg_console);
release_console_sem();
#endif
}
@@ -707,11 +824,6 @@ int software_suspend(void)
free_some_memory();
- /* No need to invalidate any vfsmnt list --
- * they will be valid after resume, anyway.
- */
- blk_run_queues();
-
/* Save state of all device drivers, and stop them. */
if ((res = device_suspend(4))==0)
/* If stopping device drivers worked, we proceed basically into
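pfn_is_nosave() above replaces the old ADDRESS()/ADDRESS2() virtual-address comparisons with pure page-frame arithmetic, which stays correct even on configurations where the kernel maps some pages twice. A self-contained illustration of the rounding, using made-up physical addresses:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(a)	(((a) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* hypothetical physical addresses standing in for the section */
	unsigned long nosave_begin = 0x321004;	/* ~ __pa(&__nosave_begin) */
	unsigned long nosave_end   = 0x323800;	/* ~ __pa(&__nosave_end)   */
	unsigned long first = nosave_begin >> PAGE_SHIFT;		/* down: 0x321 */
	unsigned long last  = PAGE_ALIGN(nosave_end) >> PAGE_SHIFT;	/* up:   0x324 */

	printf("pfns skipped while copying: [0x%lx, 0x%lx)\n", first, last);
	return 0;
}

Rounding the start down and the end up skips the partial pages at both boundaries, so the whole nosave section is always covered.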
diff --git a/kernel/printk.c b/kernel/printk.c
index a7be1f922f34..5f2b3c9bbd6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...)
log_level_unknown = 1;
}
- if (!cpu_online(smp_processor_id()) && !system_running) {
+ if (!cpu_online(smp_processor_id()) &&
+ system_state != SYSTEM_RUNNING) {
/*
* Some console drivers may assume that per-cpu resources have
* been allocated. So don't allow them to be called by this
diff --git a/kernel/sched.c b/kernel/sched.c
index d5f21712ffbb..c2d1c0317130 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+extern unsigned long __scheduling_functions_start_here;
+extern unsigned long __scheduling_functions_end_here;
+const unsigned long scheduling_functions_start_here =
+ (unsigned long)&__scheduling_functions_start_here;
+const unsigned long scheduling_functions_end_here =
+ (unsigned long)&__scheduling_functions_end_here;
+
/*
* Default context-switch locking:
*/
@@ -1587,12 +1594,10 @@ out:
rebalance_tick(rq, 0);
}
-void scheduling_functions_start_here(void) { }
-
/*
* schedule() is the main scheduler function.
*/
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
{
long *switch_count;
task_t *prev, *next;
@@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule);
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
-asmlinkage void preempt_schedule(void)
+asmlinkage void __sched preempt_schedule(void)
{
struct thread_info *ti = current_thread_info();
@@ -1842,7 +1847,6 @@ void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exc
__wake_up_common(q, mode, nr_exclusive, 0);
spin_unlock_irqrestore(&q->lock, flags);
}
-
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
void fastcall complete(struct completion *x)
@@ -1855,7 +1859,6 @@ void fastcall complete(struct completion *x)
1, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
-
EXPORT_SYMBOL(complete);
void fastcall complete_all(struct completion *x)
@@ -1868,8 +1871,9 @@ void fastcall complete_all(struct completion *x)
0, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
+EXPORT_SYMBOL(complete_all);
-void fastcall wait_for_completion(struct completion *x)
+void fastcall __sched wait_for_completion(struct completion *x)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
@@ -1889,7 +1893,6 @@ void fastcall wait_for_completion(struct completion *x)
x->done--;
spin_unlock_irq(&x->wait.lock);
}
-
EXPORT_SYMBOL(wait_for_completion);
#define SLEEP_ON_VAR \
@@ -1907,7 +1910,7 @@ EXPORT_SYMBOL(wait_for_completion);
__remove_wait_queue(q, &wait); \
spin_unlock_irqrestore(&q->lock, flags);
-void fastcall interruptible_sleep_on(wait_queue_head_t *q)
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
{
SLEEP_ON_VAR
@@ -1920,7 +1923,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q)
EXPORT_SYMBOL(interruptible_sleep_on);
-long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
@@ -1935,7 +1938,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-void fastcall sleep_on(wait_queue_head_t *q)
+void fastcall __sched sleep_on(wait_queue_head_t *q)
{
SLEEP_ON_VAR
@@ -1948,7 +1951,7 @@ void fastcall sleep_on(wait_queue_head_t *q)
EXPORT_SYMBOL(sleep_on);
-long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
@@ -1963,8 +1966,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
EXPORT_SYMBOL(sleep_on_timeout);
-void scheduling_functions_end_here(void) { }
-
void set_user_nice(task_t *p, long nice)
{
unsigned long flags;
@@ -2424,7 +2425,7 @@ asmlinkage long sys_sched_yield(void)
return 0;
}
-void __cond_resched(void)
+void __sched __cond_resched(void)
{
set_current_state(TASK_RUNNING);
schedule();
@@ -2438,7 +2439,7 @@ EXPORT_SYMBOL(__cond_resched);
* this is a shortcut for kernel-space yielding - it marks the
* thread runnable and calls sys_sched_yield().
*/
-void yield(void)
+void __sched yield(void)
{
set_current_state(TASK_RUNNING);
sys_sched_yield();
@@ -2453,7 +2454,7 @@ EXPORT_SYMBOL(yield);
* But don't do that if it is a deliberate, throttling IO wait (this task
* has set its backing_dev_info: the queue against which it should throttle)
*/
-void io_schedule(void)
+void __sched io_schedule(void)
{
struct runqueue *rq = this_rq();
@@ -2464,7 +2465,7 @@ void io_schedule(void)
EXPORT_SYMBOL(io_schedule);
-long io_schedule_timeout(long timeout)
+long __sched io_schedule_timeout(long timeout)
{
struct runqueue *rq = this_rq();
long ret;
@@ -2982,7 +2983,8 @@ void __might_sleep(char *file, int line)
#if defined(in_atomic)
static unsigned long prev_jiffy; /* ratelimiting */
- if ((in_atomic() || irqs_disabled()) && system_running) {
+ if ((in_atomic() || irqs_disabled()) &&
+ system_state == SYSTEM_RUNNING) {
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
@@ -3009,7 +3011,7 @@ EXPORT_SYMBOL(__might_sleep);
*
* Called inside preempt_disable().
*/
-void __preempt_spin_lock(spinlock_t *lock)
+void __sched __preempt_spin_lock(spinlock_t *lock)
{
if (preempt_count() > 1) {
_raw_spin_lock(lock);
@@ -3025,7 +3027,7 @@ void __preempt_spin_lock(spinlock_t *lock)
EXPORT_SYMBOL(__preempt_spin_lock);
-void __preempt_write_lock(rwlock_t *lock)
+void __sched __preempt_write_lock(rwlock_t *lock)
{
if (preempt_count() > 1) {
_raw_write_lock(lock);
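The __sched annotation supersedes the old empty scheduling_functions_start_here()/end_here() marker functions: every annotated function is collected into one text section, and the linker exports the bounds that the wchan code compares against when deciding whether a sleeping task's return address is inside the scheduler. A sketch of the mechanism, hedged (definitions abbreviated from this era's headers and linker scripts):

/* roughly as defined for this series: */
#define __sched		__attribute__((__section__(".sched.text")))

/* the architecture linker script brackets the section:
 *	__scheduling_functions_start_here = .;
 *	*(.sched.text)
 *	__scheduling_functions_end_here = .;
 */

/* consumer sketch: skip scheduler frames when reporting wchan */
static int in_sched_functions(unsigned long addr)
{
	return addr >= scheduling_functions_start_here &&
	       addr <  scheduling_functions_end_here;
}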
diff --git a/kernel/signal.c b/kernel/signal.c
index 32992a71683b..c69671600bef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -352,6 +352,7 @@ void __exit_signal(struct task_struct *tsk)
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
tsk->signal = NULL;
+ exit_itimers(sig);
spin_unlock(&sighand->siglock);
flush_sigqueue(&sig->shared_pending);
kmem_cache_free(signal_cachep, sig);
@@ -588,7 +589,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
error = -EPERM;
if ((!info || ((unsigned long)info != 1 &&
(unsigned long)info != 2 && SI_FROMUSER(info)))
- && ((sig != SIGCONT) || (current->session != t->session))
+ && ((sig != SIGCONT) ||
+ (current->signal->session != t->signal->session))
&& (current->euid ^ t->suid) && (current->euid ^ t->uid)
&& (current->uid ^ t->suid) && (current->uid ^ t->uid)
&& !capable(CAP_KILL))
@@ -1103,7 +1105,7 @@ kill_sl_info(int sig, struct siginfo *info, pid_t sid)
retval = -ESRCH;
read_lock(&tasklist_lock);
for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) {
- if (!p->leader)
+ if (!p->signal->leader)
continue;
err = group_send_sig_info(sig, info, p);
if (retval)
@@ -2047,6 +2049,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
err |= __put_user(from->si_stime, &to->si_stime);
break;
case __SI_RT: /* This is not generated by the kernel as of now. */
+ case __SI_MESGQ: /* But this is */
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_int, &to->si_int);
@@ -2553,4 +2556,3 @@ void __init signals_init(void)
if (!sigqueue_cachep)
panic("signals_init(): cannot create sigqueue SLAB cache");
}
-
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 81c79736ff9e..58c915c202ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
#include <linux/cpu.h>
#include <linux/kthread.h>
+#include <asm/irq.h>
/*
- No shared variables, all the data are CPU local.
- If a softirq needs serialization, let it serialize itself
@@ -69,53 +70,66 @@ static inline void wakeup_softirqd(void)
*/
#define MAX_SOFTIRQ_RESTART 10
-asmlinkage void do_softirq(void)
+asmlinkage void __do_softirq(void)
{
- int max_restart = MAX_SOFTIRQ_RESTART;
+ struct softirq_action *h;
__u32 pending;
- unsigned long flags;
+ int max_restart = MAX_SOFTIRQ_RESTART;
- if (in_interrupt())
- return;
+ pending = local_softirq_pending();
- local_irq_save(flags);
+ local_bh_disable();
+restart:
+ /* Reset the pending bitmask before enabling irqs */
+ local_softirq_pending() = 0;
+
+ local_irq_enable();
+
+ h = softirq_vec;
+
+ do {
+ if (pending & 1)
+ h->action(h);
+ h++;
+ pending >>= 1;
+ } while (pending);
+
+ local_irq_disable();
pending = local_softirq_pending();
+ if (pending && --max_restart)
+ goto restart;
- if (pending) {
- struct softirq_action *h;
+ if (pending)
+ wakeup_softirqd();
- local_bh_disable();
-restart:
- /* Reset the pending bitmask before enabling irqs */
- local_softirq_pending() = 0;
+ __local_bh_enable();
+}
- local_irq_enable();
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+ __u32 pending;
+ unsigned long flags;
- h = softirq_vec;
+ if (in_interrupt())
+ return;
- do {
- if (pending & 1)
- h->action(h);
- h++;
- pending >>= 1;
- } while (pending);
+ local_irq_save(flags);
- local_irq_disable();
+ pending = local_softirq_pending();
- pending = local_softirq_pending();
- if (pending && --max_restart)
- goto restart;
- if (pending)
- wakeup_softirqd();
- __local_bh_enable();
- }
+ if (pending)
+ __do_softirq();
local_irq_restore(flags);
}
EXPORT_SYMBOL(do_softirq);
+#endif
+
void local_bh_enable(void)
{
__local_bh_enable();
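Factoring the restart loop out into __do_softirq() leaves do_softirq() as a thin guard that an architecture can replace: defining __ARCH_HAS_DO_SOFTIRQ lets an arch supply its own wrapper, typically to run the loop on a dedicated interrupt stack. A schematic of such an override; call_on_irq_stack() is a made-up helper name:

/* Hypothetical arch override, enabled by defining __ARCH_HAS_DO_SOFTIRQ. */
asmlinkage void do_softirq(void)
{
	unsigned long flags;

	if (in_interrupt())
		return;

	local_irq_save(flags);
	if (local_softirq_pending())
		call_on_irq_stack(__do_softirq);	/* made-up helper */
	local_irq_restore(flags);
}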
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a2cae39d322c..9610403ce2cf 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -149,10 +149,10 @@ static int do_stop(void *_smdata)
complete(&smdata->done);
/* Wait for kthread_stop */
- __set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
schedule();
- __set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return ret;
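Using set_current_state() instead of __set_current_state() before re-testing kthread_should_stop() matters on SMP: the former embeds a memory barrier, so the TASK_INTERRUPTIBLE store cannot be reordered past the flag read, closing a missed-wakeup window. The result is the canonical kernel sleep loop:

	/* canonical wait loop: the barrier in set_current_state() orders
	 * the state store before the condition test */
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);	/* no ordering needed here */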
diff --git a/kernel/sys.c b/kernel/sys.c
index 33a14e13079e..4d414d925889 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -260,6 +260,17 @@ cond_syscall(sys_msgctl)
cond_syscall(sys_shmget)
cond_syscall(sys_shmdt)
cond_syscall(sys_shmctl)
+cond_syscall(sys_mq_open)
+cond_syscall(sys_mq_unlink)
+cond_syscall(sys_mq_timedsend)
+cond_syscall(sys_mq_timedreceive)
+cond_syscall(sys_mq_notify)
+cond_syscall(sys_mq_getsetattr)
+cond_syscall(compat_sys_mq_open)
+cond_syscall(compat_sys_mq_timedsend)
+cond_syscall(compat_sys_mq_timedreceive)
+cond_syscall(compat_sys_mq_notify)
+cond_syscall(compat_sys_mq_getsetattr)
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read)
@@ -436,7 +447,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
switch (cmd) {
case LINUX_REBOOT_CMD_RESTART:
notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
- system_running = 0;
+ system_state = SYSTEM_SHUTDOWN;
device_shutdown();
printk(KERN_EMERG "Restarting system.\n");
machine_restart(NULL);
@@ -452,7 +463,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
case LINUX_REBOOT_CMD_HALT:
notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
- system_running = 0;
+ system_state = SYSTEM_SHUTDOWN;
device_shutdown();
printk(KERN_EMERG "System halted.\n");
machine_halt();
@@ -462,7 +473,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
case LINUX_REBOOT_CMD_POWER_OFF:
notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
- system_running = 0;
+ system_state = SYSTEM_SHUTDOWN;
device_shutdown();
printk(KERN_EMERG "Power down.\n");
machine_power_off();
@@ -478,7 +489,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
buffer[sizeof(buffer) - 1] = '\0';
notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
- system_running = 0;
+ system_state = SYSTEM_SHUTDOWN;
device_shutdown();
printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
machine_restart(buffer);
@@ -979,7 +990,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
if (p->parent == current || p->real_parent == current) {
err = -EPERM;
- if (p->session != current->session)
+ if (p->signal->session != current->signal->session)
goto out;
err = -EACCES;
if (p->did_exec)
@@ -991,7 +1002,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
}
err = -EPERM;
- if (p->leader)
+ if (p->signal->leader)
goto out;
if (pgid != pid) {
@@ -1000,7 +1011,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
struct list_head *l;
for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid)
- if (p->session == current->session)
+ if (p->signal->session == current->signal->session)
goto ok_pgid;
goto out;
}
@@ -1012,7 +1023,7 @@ ok_pgid:
if (process_group(p) != pgid) {
detach_pid(p, PIDTYPE_PGID);
- p->group_leader->__pgrp = pgid;
+ p->signal->pgrp = pgid;
attach_pid(p, PIDTYPE_PGID, pgid);
}
@@ -1054,7 +1065,7 @@ asmlinkage long sys_getpgrp(void)
asmlinkage long sys_getsid(pid_t pid)
{
if (!pid) {
- return current->session;
+ return current->signal->session;
} else {
int retval;
struct task_struct *p;
@@ -1066,7 +1077,7 @@ asmlinkage long sys_getsid(pid_t pid)
if(p) {
retval = security_task_getsid(p);
if (!retval)
- retval = p->session;
+ retval = p->signal->session;
}
read_unlock(&tasklist_lock);
return retval;
@@ -1087,10 +1098,10 @@ asmlinkage long sys_setsid(void)
if (pid)
goto out;
- current->leader = 1;
+ current->signal->leader = 1;
__set_special_pids(current->pid, current->pid);
- current->tty = NULL;
- current->tty_old_pgrp = 0;
+ current->signal->tty = NULL;
+ current->signal->tty_old_pgrp = 0;
err = process_group(current);
out:
write_unlock_irq(&tasklist_lock);
@@ -1521,7 +1532,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
r.ru_nivcsw = p->nivcsw;
r.ru_minflt = p->min_flt;
r.ru_majflt = p->maj_flt;
- r.ru_nswap = p->nswap;
break;
case RUSAGE_CHILDREN:
jiffies_to_timeval(p->cutime, &r.ru_utime);
@@ -1530,7 +1540,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
r.ru_nivcsw = p->cnivcsw;
r.ru_minflt = p->cmin_flt;
r.ru_majflt = p->cmaj_flt;
- r.ru_nswap = p->cnswap;
break;
default:
jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime);
@@ -1539,7 +1548,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
r.ru_nivcsw = p->nivcsw + p->cnivcsw;
r.ru_minflt = p->min_flt + p->cmin_flt;
r.ru_majflt = p->maj_flt + p->cmaj_flt;
- r.ru_nswap = p->nswap + p->cnswap;
break;
}
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
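The cond_syscall() block gives each optional syscall a weak stub, so the new POSIX message-queue entry points link to sys_ni_syscall() (returning -ENOSYS) whenever the real implementation is configured out. Its definition in this file is essentially:

/* roughly, from earlier in kernel/sys.c: a weak alias onto the
 * "not implemented" syscall */
#define cond_syscall(x) \
	asmlinkage long x(void) __attribute__((weak, alias("sys_ni_syscall")));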
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5f3123b0522..69e9123cdd0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -710,10 +710,12 @@ static ctl_table vm_table[] = {
{
.ctl_name = VM_HUGETLB_PAGES,
.procname = "nr_hugepages",
- .data = &htlbpage_max,
- .maxlen = sizeof(int),
+ .data = &max_huge_pages,
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &hugetlb_sysctl_handler,
+ .extra1 = (void *)&hugetlb_zero,
+ .extra2 = (void *)&hugetlb_infinity,
},
#endif
{
@@ -722,7 +724,7 @@ static ctl_table vm_table[] = {
.data = &sysctl_lower_zone_protection,
.maxlen = sizeof(sysctl_lower_zone_protection),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &lower_zone_protection_sysctl_handler,
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
@@ -744,6 +746,26 @@ static ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec
},
+ {
+ .ctl_name = VM_LAPTOP_MODE,
+ .procname = "laptop_mode",
+ .data = &laptop_mode,
+ .maxlen = sizeof(laptop_mode),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+ {
+ .ctl_name = VM_BLOCK_DUMP,
+ .procname = "block_dump",
+ .data = &block_dump,
+ .maxlen = sizeof(block_dump),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
{ .ctl_name = 0 }
};
diff --git a/kernel/time.c b/kernel/time.c
index 33a6fe086304..142a4bd5771e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -51,10 +51,11 @@ EXPORT_SYMBOL(sys_tz);
asmlinkage long sys_time(int * tloc)
{
int i;
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ i = tv.tv_sec;
- /* SMP: This is fairly trivial. We grab CURRENT_TIME and
- stuff it to user space. No side effects */
- i = get_seconds();
if (tloc) {
if (put_user(i,tloc))
i = -EFAULT;
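Basing time(2) on do_gettimeofday() rather than get_seconds() removes an observable inconsistency: xtime's seconds field only advances at timer ticks, while gettimeofday() interpolates between ticks, so time() could report a second older than one gettimeofday() had already returned. A small user-space check of the restored invariant (a demonstration under normal clock conditions, not a proof):

#include <stdio.h>
#include <sys/time.h>
#include <time.h>

/* With the fix, time() called after gettimeofday() can never return
 * an earlier second than gettimeofday() just reported. */
int main(void)
{
	struct timeval tv;
	time_t t;

	gettimeofday(&tv, NULL);
	t = time(NULL);		/* taken second, so t >= tv.tv_sec */
	printf("gettimeofday()=%ld.%06ld time()=%ld\n",
	       (long)tv.tv_sec, (long)tv.tv_usec, (long)t);
	return t >= tv.tv_sec ? 0 : 1;
}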
diff --git a/kernel/timer.c b/kernel/timer.c
index f53e0749b0d2..cbcb5522866d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data)
*
* In all cases the return value is guaranteed to be non-negative.
*/
-fastcall signed long schedule_timeout(signed long timeout)
+fastcall signed long __sched schedule_timeout(signed long timeout)
{
struct timer_list timer;
unsigned long expire;
@@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void)
return current->pid;
}
-static long nanosleep_restart(struct restart_block *restart)
+static long __sched nanosleep_restart(struct restart_block *restart)
{
unsigned long expire = restart->arg0, now = jiffies;
struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;