| author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-04-12 02:21:00 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2004-04-12 02:21:00 -0700 |
| commit | 0d61fc5ea78015def4d2fcf9b598ecfe25210cdd | |
| tree | a45aa0f67c6bbd200dc2a328e560042810307b1b | /kernel |
| parent | f2eb250f07ba4695c2474cb8b6edbf64b5457d65 | |
| parent | eb880e5457f8b4a61ff7fd36d47dd14fe51cb030 | |
Merge bk://bk.arm.linux.org.uk/linux-2.6-rmk
into ppc970.osdl.org:/home/torvalds/v2.6/linux
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/Makefile | 2 |
| -rw-r--r-- | kernel/acct.c | 8 |
| -rw-r--r-- | kernel/audit.c | 825 |
| -rw-r--r-- | kernel/auditsc.c | 922 |
| -rw-r--r-- | kernel/exit.c | 24 |
| -rw-r--r-- | kernel/fork.c | 38 |
| -rw-r--r-- | kernel/kmod.c | 2 |
| -rw-r--r-- | kernel/module.c | 4 |
| -rw-r--r-- | kernel/params.c | 7 |
| -rw-r--r-- | kernel/pid.c | 8 |
| -rw-r--r-- | kernel/posix-timers.c | 97 |
| -rw-r--r-- | kernel/power/Kconfig | 8 |
| -rw-r--r-- | kernel/power/disk.c | 8 |
| -rw-r--r-- | kernel/power/main.c | 7 |
| -rw-r--r-- | kernel/power/pmdisk.c | 5 |
| -rw-r--r-- | kernel/power/process.c | 21 |
| -rw-r--r-- | kernel/power/swsusp.c | 282 |
| -rw-r--r-- | kernel/printk.c | 3 |
| -rw-r--r-- | kernel/sched.c | 44 |
| -rw-r--r-- | kernel/signal.c | 8 |
| -rw-r--r-- | kernel/softirq.c | 70 |
| -rw-r--r-- | kernel/stop_machine.c | 4 |
| -rw-r--r-- | kernel/sys.c | 40 |
| -rw-r--r-- | kernel/sysctl.c | 28 |
| -rw-r--r-- | kernel/time.c | 7 |
| -rw-r--r-- | kernel/timer.c | 4 |
26 files changed, 2232 insertions, 244 deletions
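For orientation before the patch body: the new kernel/audit.c exposes a netlink interface, and a user-space daemon claims the audit stream by sending AUDIT_SET with the AUDIT_STATUS_PID mask bit (see the audit_receive_msg() handling below). Here is a minimal user-space sketch of that registration. The struct fields and message semantics are taken from the patch; the numeric values of NETLINK_AUDIT, AUDIT_SET, and AUDIT_STATUS_PID are assumptions (they live in this patch's linux/audit.h, which is not part of the hunks shown), and the kernel rejects AUDIT_SET without CAP_SYS_ADMIN, so this must run as root.

```c
/*
 * Sketch only: register the calling process as the audit daemon via
 * AUDIT_SET, mirroring audit_receive_msg() in kernel/audit.c below.
 * NETLINK_AUDIT, AUDIT_SET, and AUDIT_STATUS_PID values are assumed;
 * the audit_status fields match those used by the AUDIT_GET/SET code.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#define NETLINK_AUDIT     9       /* assumed protocol number */
#define AUDIT_SET         1001    /* assumed nlmsg_type */
#define AUDIT_STATUS_PID  0x0004  /* assumed mask bit */

struct audit_status {             /* fields as used by AUDIT_GET/AUDIT_SET */
	unsigned int mask, enabled, failure, pid;
	unsigned int rate_limit, backlog_limit, lost, backlog;
};

int main(void)
{
	struct sockaddr_nl kern = { .nl_family = AF_NETLINK }; /* nl_pid 0 = kernel */
	struct {
		struct nlmsghdr nlh;
		struct audit_status status;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.status));
	req.nlh.nlmsg_type  = AUDIT_SET;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.status.mask     = AUDIT_STATUS_PID;
	req.status.pid      = getpid(); /* kernel unicasts audit records here */

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&kern, sizeof(kern)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}
```

Once audit_pid is set this way, audit_log_drain() below unicasts records to that pid instead of falling back to printk; if the daemon dies (netlink_unicast returns -ECONNREFUSED), the kernel resets audit_pid to 0 and reverts to printk.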
diff --git a/kernel/Makefile b/kernel/Makefile index 3a6484838748..238c65f60d9e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,8 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index 9dbab88b2d31..555e1e3c349f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -347,7 +347,11 @@ static void do_acct_process(long exitcode, struct file *file) /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; - ac.ac_tty = current->tty ? old_encode_dev(tty_devnum(current->tty)) : 0; + + read_lock(&tasklist_lock); /* pin current->signal */ + ac.ac_tty = current->signal->tty ? + old_encode_dev(tty_devnum(current->signal->tty)) : 0; + read_unlock(&tasklist_lock); ac.ac_flag = 0; if (current->flags & PF_FORKNOEXEC) @@ -376,7 +380,7 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_minflt = encode_comp_t(current->min_flt); ac.ac_majflt = encode_comp_t(current->maj_flt); - ac.ac_swaps = encode_comp_t(current->nswap); + ac.ac_swaps = encode_comp_t(0); ac.ac_exitcode = exitcode; /* diff --git a/kernel/audit.c b/kernel/audit.c new file mode 100644 index 000000000000..765822b03b91 --- /dev/null +++ b/kernel/audit.c @@ -0,0 +1,825 @@ +/* audit.c -- Auditing support -*- linux-c -*- + * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. + * System-call specific features have moved to auditsc.c + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Written by Rickard E. (Rik) Faith <faith@redhat.com> + * + * Goals: 1) Integrate fully with SELinux. + * 2) Minimal run-time overhead: + * a) Minimal when syscall auditing is disabled (audit_enable=0). + * b) Small when syscall auditing is enabled and no audit record + * is generated (defer as much work as possible to record + * generation time): + * i) context is allocated, + * ii) names from getname are stored without a copy, and + * iii) inode information stored from path_lookup. + * 3) Ability to disable syscall auditing at boot time (audit=0). + * 4) Usable by other parts of the kernel (if audit_log* is called, + * then a syscall record will be generated automatically for the + * current syscall). + * 5) Netlink interface to user-space. + * 6) Support low-overhead kernel-based filtering to minimize the + * information that must be passed to user-space. 
+ * + * Example user-space utilities: http://people.redhat.com/faith/audit/ + */ + +#include <linux/init.h> +#include <asm/atomic.h> +#include <asm/types.h> +#include <linux/mm.h> +#include <linux/module.h> + +#include <linux/audit.h> + +#include <net/sock.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> + +/* No auditing will take place until audit_initialized != 0. + * (Initialization happens after skb_init is called.) */ +static int audit_initialized; + +/* No syscall auditing will take place unless audit_enabled != 0. */ +int audit_enabled; + +/* Default state when kernel boots without any parameters. */ +static int audit_default; + +/* If auditing cannot proceed, audit_failure selects what happens. */ +static int audit_failure = AUDIT_FAIL_PRINTK; + +/* If audit records are to be written to the netlink socket, audit_pid + * contains the (non-zero) pid. */ +static int audit_pid; + +/* If audit_limit is non-zero, limit the rate of sending audit records + * to that number per second. This prevents DoS attacks, but results in + * audit records being dropped. */ +static int audit_rate_limit; + +/* Number of outstanding audit_buffers allowed. */ +static int audit_backlog_limit = 64; +static atomic_t audit_backlog = ATOMIC_INIT(0); + +/* Records can be lost in several ways: + 0) [suppressed in audit_alloc] + 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] + 2) out of memory in audit_log_move [alloc_skb] + 3) suppressed due to audit_rate_limit + 4) suppressed due to audit_backlog_limit +*/ +static atomic_t audit_lost = ATOMIC_INIT(0); + +/* The netlink socket. */ +static struct sock *audit_sock; + +/* There are two lists of audit buffers. The txlist contains audit + * buffers that cannot be sent immediately to the netlink device because + * we are in an irq context (these are sent later in a tasklet). + * + * The second list is a list of pre-allocated audit buffers (if more + * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of + * being placed on the freelist). */ +static spinlock_t audit_txlist_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t audit_freelist_lock = SPIN_LOCK_UNLOCKED; +static int audit_freelist_count = 0; +static LIST_HEAD(audit_txlist); +static LIST_HEAD(audit_freelist); + +/* There are three lists of rules -- one to search at task creation + * time, one to search at syscall entry time, and another to search at + * syscall exit time. */ +static LIST_HEAD(audit_tsklist); +static LIST_HEAD(audit_entlist); +static LIST_HEAD(audit_extlist); + +/* The netlink socket is only to be read by 1 CPU, which lets us assume + * that list additions and deletions never happen simultaneiously in + * auditsc.c */ +static DECLARE_MUTEX(audit_netlink_sem); + +/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting + * audit records. Since printk uses a 1024 byte buffer, this buffer + * should be at least that large. */ +#define AUDIT_BUFSIZ 1024 + +/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the + * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ +#define AUDIT_MAXFREE (2*NR_CPUS) + +/* The audit_buffer is used when formatting an audit record. The caller + * locks briefly to get the record off the freelist or to allocate the + * buffer, and locks briefly to send the buffer to the netlink layer or + * to place it on a transmit queue. Multiple audit_buffers can be in + * use simultaneously. 
*/ +struct audit_buffer { + struct list_head list; + struct sk_buff_head sklist; /* formatted skbs ready to send */ + struct audit_context *ctx; /* NULL or associated context */ + int len; /* used area of tmp */ + char tmp[AUDIT_BUFSIZ]; + + /* Pointer to header and contents */ + struct nlmsghdr *nlh; + int total; + int type; + int pid; + int count; /* Times requeued */ +}; + +struct audit_entry { + struct list_head list; + struct audit_rule rule; +}; + +static void audit_panic(const char *message) +{ + switch (audit_failure) + { + case AUDIT_FAIL_SILENT: + break; + case AUDIT_FAIL_PRINTK: + printk(KERN_ERR "audit: %s\n", message); + break; + case AUDIT_FAIL_PANIC: + panic(message); + break; + } +} + +static inline int audit_rate_check(void) +{ + static unsigned long last_check = 0; + static int messages = 0; + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + unsigned long now; + unsigned long elapsed; + int retval = 0; + + if (!audit_rate_limit) return 1; + + spin_lock_irqsave(&lock, flags); + if (++messages < audit_rate_limit) { + retval = 1; + } else { + now = jiffies; + elapsed = now - last_check; + if (elapsed > HZ) { + last_check = now; + messages = 0; + retval = 1; + } + } + spin_unlock_irqrestore(&lock, flags); + + return retval; +} + +/* Emit at least 1 message per second, even if audit_rate_check is + * throttling. */ +void audit_log_lost(const char *message) +{ + static unsigned long last_msg = 0; + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + unsigned long now; + int print; + + atomic_inc(&audit_lost); + + print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit); + + if (!print) { + spin_lock_irqsave(&lock, flags); + now = jiffies; + if (now - last_msg > HZ) { + print = 1; + last_msg = now; + } + spin_unlock_irqrestore(&lock, flags); + } + + if (print) { + printk(KERN_WARNING + "audit: audit_lost=%d audit_backlog=%d" + " audit_rate_limit=%d audit_backlog_limit=%d\n", + atomic_read(&audit_lost), + atomic_read(&audit_backlog), + audit_rate_limit, + audit_backlog_limit); + audit_panic(message); + } + +} + +int audit_set_rate_limit(int limit) +{ + int old = audit_rate_limit; + audit_rate_limit = limit; + audit_log(current->audit_context, "audit_rate_limit=%d old=%d", + audit_rate_limit, old); + return old; +} + +int audit_set_backlog_limit(int limit) +{ + int old = audit_backlog_limit; + audit_backlog_limit = limit; + audit_log(current->audit_context, "audit_backlog_limit=%d old=%d", + audit_backlog_limit, old); + return old; +} + +int audit_set_enabled(int state) +{ + int old = audit_enabled; + if (state != 0 && state != 1) + return -EINVAL; + audit_enabled = state; + audit_log(current->audit_context, "audit_enabled=%d old=%d", + audit_enabled, old); + return old; +} + +int audit_set_failure(int state) +{ + int old = audit_failure; + if (state != AUDIT_FAIL_SILENT + && state != AUDIT_FAIL_PRINTK + && state != AUDIT_FAIL_PANIC) + return -EINVAL; + audit_failure = state; + audit_log(current->audit_context, "audit_failure=%d old=%d", + audit_failure, old); + return old; +} + +#ifdef CONFIG_NET +void audit_send_reply(int pid, int seq, int type, int done, int multi, + void *payload, int size) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len = NLMSG_SPACE(size); + void *data; + int flags = multi ? NLM_F_MULTI : 0; + int t = done ? 
NLMSG_DONE : type; + + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + goto nlmsg_failure; + + nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh)); + nlh->nlmsg_flags = flags; + data = NLMSG_DATA(nlh); + memcpy(data, payload, size); + netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT); + return; + +nlmsg_failure: /* Used by NLMSG_PUT */ + if (skb) + kfree_skb(skb); +} + +static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + u32 uid, pid, seq; + void *data; + struct audit_status *status_get, status_set; + struct audit_login *login; + int err = 0; + struct audit_buffer *ab; + + pid = NETLINK_CREDS(skb)->pid; + uid = NETLINK_CREDS(skb)->uid; + seq = nlh->nlmsg_seq; + data = NLMSG_DATA(nlh); + + switch (nlh->nlmsg_type) { + case AUDIT_GET: + status_set.enabled = audit_enabled; + status_set.failure = audit_failure; + status_set.pid = audit_pid; + status_set.rate_limit = audit_rate_limit; + status_set.backlog_limit = audit_backlog_limit; + status_set.lost = atomic_read(&audit_lost); + status_set.backlog = atomic_read(&audit_backlog); + audit_send_reply(pid, seq, AUDIT_GET, 0, 0, + &status_set, sizeof(status_set)); + break; + case AUDIT_SET: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + status_get = (struct audit_status *)data; + if (status_get->mask & AUDIT_STATUS_ENABLED) { + err = audit_set_enabled(status_get->enabled); + if (err < 0) return err; + } + if (status_get->mask & AUDIT_STATUS_FAILURE) { + err = audit_set_failure(status_get->failure); + if (err < 0) return err; + } + if (status_get->mask & AUDIT_STATUS_PID) { + int old = audit_pid; + audit_pid = status_get->pid; + audit_log(current->audit_context, + "audit_pid=%d old=%d", audit_pid, old); + } + if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) + audit_set_rate_limit(status_get->rate_limit); + if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) + audit_set_backlog_limit(status_get->backlog_limit); + break; + case AUDIT_USER: + ab = audit_log_start(NULL); + if (!ab) + break; /* audit_panic has been called */ + audit_log_format(ab, + "user pid=%d uid=%d length=%d msg='%.1024s'", + pid, uid, + (int)(nlh->nlmsg_len + - ((char *)data - (char *)nlh)), + (char *)data); + ab->type = AUDIT_USER; + ab->pid = pid; + audit_log_end(ab); + break; + case AUDIT_LOGIN: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + login = (struct audit_login *)data; + ab = audit_log_start(NULL); + if (ab) { + audit_log_format(ab, "login pid=%d uid=%d loginuid=%d" + " length=%d msg='%.1024s'", + pid, uid, + login->loginuid, + login->msglen, + login->msg); + ab->type = AUDIT_LOGIN; + ab->pid = pid; + audit_log_end(ab); + } +#ifdef CONFIG_AUDITSYSCALL + err = audit_set_loginuid(current->audit_context, + login->loginuid); +#endif + break; + case AUDIT_LIST: + case AUDIT_ADD: + case AUDIT_DEL: +#ifdef CONFIG_AUDITSYSCALL + err = audit_receive_filter(nlh->nlmsg_type, pid, uid, seq, + data); +#else + err = -EOPNOTSUPP; +#endif + break; + default: + err = -EINVAL; + break; + } + + return err < 0 ? err : 0; +} + +/* Get message from skb (based on rtnetlink_rcv_skb). Each message is + * processed by audit_receive_msg. Malformed skbs with wrong length are + * discarded silently. 
*/ +static int audit_receive_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + u32 rlen; + + while (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if ((err = audit_receive_msg(skb, nlh))) { + netlink_ack(skb, nlh, -err); + } else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + return 0; +} + +/* Receive messages from netlink socket. */ +static void audit_receive(struct sock *sk, int length) +{ + struct sk_buff *skb; + + if (down_trylock(&audit_netlink_sem)) + return; + + /* FIXME: this must not cause starvation */ + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + if (audit_receive_skb(skb) && skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else + kfree_skb(skb); + } + up(&audit_netlink_sem); +} + +/* Move data from tmp buffer into an skb. This is an extra copy, and + * that is unfortunate. However, the copy will only occur when a record + * is being written to user space, which is already a high-overhead + * operation. (Elimination of the copy is possible, for example, by + * writing directly into a pre-allocated skb, at the cost of wasting + * memory. */ +static void audit_log_move(struct audit_buffer *ab) +{ + struct sk_buff *skb; + char *start; + int extra = ab->nlh ? 0 : NLMSG_SPACE(0); + + skb = skb_peek(&ab->sklist); + if (!skb || skb_tailroom(skb) <= ab->len + extra) { + skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); + if (!skb) { + ab->len = 0; /* Lose information in ab->tmp */ + audit_log_lost("out of memory in audit_log_move"); + return; + } + __skb_queue_tail(&ab->sklist, skb); + if (!ab->nlh) + ab->nlh = (struct nlmsghdr *)skb_put(skb, + NLMSG_SPACE(0)); + } + start = skb_put(skb, ab->len); + memcpy(start, ab->tmp, ab->len); + ab->len = 0; +} + +/* Iterate over the skbuff in the audit_buffer, sending their contents + * to user space. */ +static inline int audit_log_drain(struct audit_buffer *ab) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&ab->sklist))) { + int retval = 0; + + if (audit_pid) { + if (ab->nlh) { + ab->nlh->nlmsg_len = ab->total; + ab->nlh->nlmsg_type = ab->type; + ab->nlh->nlmsg_flags = 0; + ab->nlh->nlmsg_seq = 0; + ab->nlh->nlmsg_pid = ab->pid; + } + skb_get(skb); /* because netlink_* frees */ + retval = netlink_unicast(audit_sock, skb, audit_pid, + MSG_DONTWAIT); + } + if (retval == -EAGAIN && ab->count < 5) { + ++ab->count; + audit_log_end_irq(ab); + return 1; + } + if (retval < 0) { + if (retval == -ECONNREFUSED) { + printk(KERN_ERR + "audit: *NO* daemon at audit_pid=%d\n", + audit_pid); + audit_pid = 0; + } else + audit_log_lost("netlink socket too busy"); + } + if (!audit_pid) { /* No daemon */ + int offset = ab->nlh ? NLMSG_SPACE(0) : 0; + int len = skb->len - offset; + printk(KERN_ERR "%*.*s\n", + len, len, skb->data + offset); + } + kfree_skb(skb); + ab->nlh = NULL; + } + return 0; +} + +/* Initialize audit support at boot time. */ +int __init audit_init(void) +{ + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", + audit_default ? "enabled" : "disabled"); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + if (!audit_sock) + audit_panic("cannot initialize netlink socket"); + + audit_initialized = 1; + audit_enabled = audit_default; + audit_log(NULL, "initialized"); + return 0; +} + +#else +/* Without CONFIG_NET, we have no skbuffs. 
For now, print what we have + * in the buffer. */ +static void audit_log_move(struct audit_buffer *ab) +{ + printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp); + ab->len = 0; +} + +static inline int audit_log_drain(struct audit_buffer *ab) +{ + return 0; +} + +/* Initialize audit support at boot time. */ +int __init audit_init(void) +{ + printk(KERN_INFO "audit: initializing WITHOUT netlink support\n"); + audit_sock = NULL; + audit_pid = 0; + + audit_initialized = 1; + audit_enabled = audit_default; + audit_log(NULL, "initialized"); + return 0; +} +#endif + +__initcall(audit_init); + +/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ +static int __init audit_enable(char *str) +{ + audit_default = !!simple_strtol(str, NULL, 0); + printk(KERN_INFO "audit: %s%s\n", + audit_default ? "enabled" : "disabled", + audit_initialized ? "" : " (after initialization)"); + if (audit_initialized) + audit_enabled = audit_default; + return 0; +} + +__setup("audit=", audit_enable); + + +/* Obtain an audit buffer. This routine does locking to obtain the + * audit buffer, but then no locking is required for calls to + * audit_log_*format. If the tsk is a task that is currently in a + * syscall, then the syscall is marked as auditable and an audit record + * will be written at syscall exit. If there is no associated task, tsk + * should be NULL. */ +struct audit_buffer *audit_log_start(struct audit_context *ctx) +{ + struct audit_buffer *ab = NULL; + unsigned long flags; + struct timespec t; + int serial = 0; + + if (!audit_initialized) + return NULL; + + if (audit_backlog_limit + && atomic_read(&audit_backlog) > audit_backlog_limit) { + if (audit_rate_check()) + printk(KERN_WARNING + "audit: audit_backlog=%d > " + "audit_backlog_limit=%d\n", + atomic_read(&audit_backlog), + audit_backlog_limit); + audit_log_lost("backlog limit exceeded"); + return NULL; + } + + spin_lock_irqsave(&audit_freelist_lock, flags); + if (!list_empty(&audit_freelist)) { + ab = list_entry(audit_freelist.next, + struct audit_buffer, list); + list_del(&ab->list); + --audit_freelist_count; + } + spin_unlock_irqrestore(&audit_freelist_lock, flags); + + if (!ab) + ab = kmalloc(sizeof(*ab), GFP_ATOMIC); + if (!ab) + audit_log_lost("audit: out of memory in audit_log_start"); + if (!ab) + return NULL; + + atomic_inc(&audit_backlog); + skb_queue_head_init(&ab->sklist); + + ab->ctx = ctx; + ab->len = 0; + ab->nlh = NULL; + ab->total = 0; + ab->type = AUDIT_KERNEL; + ab->pid = 0; + ab->count = 0; + +#ifdef CONFIG_AUDITSYSCALL + if (ab->ctx) + audit_get_stamp(ab->ctx, &t, &serial); + else +#endif + t = CURRENT_TIME; + + audit_log_format(ab, "audit(%lu.%03lu:%u): ", + t.tv_sec, t.tv_nsec/1000000, serial); + return ab; +} + + +/* Format an audit message into the audit buffer. If there isn't enough + * room in the audit buffer, more room will be allocated and vsnprint + * will be called a second time. Currently, we assume that a printk + * can't format message larger than 1024 bytes, so we don't either. */ +static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, + va_list args) +{ + int len, avail; + + if (!ab) + return; + + avail = sizeof(ab->tmp) - ab->len; + if (avail <= 0) { + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + } + len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + if (len >= avail) { + /* The printk buffer is 1024 bytes long, so if we get + * here and AUDIT_BUFSIZ is at least 1024, then we can + * log everything that printk could have logged. 
*/ + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + } + ab->len += (len < avail) ? len : avail; + ab->total += (len < avail) ? len : avail; +} + +/* Format a message into the audit buffer. All the work is done in + * audit_log_vformat. */ +void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) +{ + va_list args; + + if (!ab) + return; + va_start(args, fmt); + audit_log_vformat(ab, fmt, args); + va_end(args); +} + +/* This is a helper-function to print the d_path without using a static + * buffer or allocating another buffer in addition to the one in + * audit_buffer. */ +void audit_log_d_path(struct audit_buffer *ab, const char *prefix, + struct dentry *dentry, struct vfsmount *vfsmnt) +{ + char *p; + int len, avail; + + if (prefix) audit_log_format(ab, " %s", prefix); + + if (ab->len > 128) + audit_log_move(ab); + avail = sizeof(ab->tmp) - ab->len; + p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail); + if (p == ERR_PTR(-ENAMETOOLONG)) { + /* FIXME: can we save some information here? */ + audit_log_format(ab, "<toolong>"); + } else { + /* path isn't at start of buffer */ + len = (ab->tmp + sizeof(ab->tmp) - 1) - p; + memmove(ab->tmp + ab->len, p, len); + ab->len += len; + ab->total += len; + } +} + +/* Remove queued messages from the audit_txlist and send them to userspace. */ +static void audit_tasklet_handler(unsigned long arg) +{ + LIST_HEAD(list); + struct audit_buffer *ab; + unsigned long flags; + + spin_lock_irqsave(&audit_txlist_lock, flags); + list_splice_init(&audit_txlist, &list); + spin_unlock_irqrestore(&audit_txlist_lock, flags); + + while (!list_empty(&list)) { + ab = list_entry(list.next, struct audit_buffer, list); + list_del(&ab->list); + audit_log_end_fast(ab); + } +} + +static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0); + +/* The netlink_* functions cannot be called inside an irq context, so + * the audit buffer is places on a queue and a tasklet is scheduled to + * remove them from the queue outside the irq context. May be called in + * any context. */ +void audit_log_end_irq(struct audit_buffer *ab) +{ + unsigned long flags; + + if (!ab) + return; + spin_lock_irqsave(&audit_txlist_lock, flags); + list_add_tail(&ab->list, &audit_txlist); + spin_unlock_irqrestore(&audit_txlist_lock, flags); + + tasklet_schedule(&audit_tasklet); +} + +/* Send the message in the audit buffer directly to user space. May not + * be called in an irq context. */ +void audit_log_end_fast(struct audit_buffer *ab) +{ + unsigned long flags; + + BUG_ON(in_irq()); + if (!ab) + return; + if (!audit_rate_check()) { + audit_log_lost("rate limit exceeded"); + } else { + audit_log_move(ab); + if (audit_log_drain(ab)) + return; + } + + atomic_dec(&audit_backlog); + spin_lock_irqsave(&audit_freelist_lock, flags); + if (++audit_freelist_count > AUDIT_MAXFREE) + kfree(ab); + else + list_add(&ab->list, &audit_freelist); + spin_unlock_irqrestore(&audit_freelist_lock, flags); +} + +/* Send or queue the message in the audit buffer, depending on the + * current context. (A convenience function that may be called in any + * context.) */ +void audit_log_end(struct audit_buffer *ab) +{ + if (in_irq()) + audit_log_end_irq(ab); + else + audit_log_end_fast(ab); +} + +/* Log an audit record. This is a convenience function that calls + * audit_log_start, audit_log_vformat, and audit_log_end. It may be + * called in any context. */ +void audit_log(struct audit_context *ctx, const char *fmt, ...) 
+{ + struct audit_buffer *ab; + va_list args; + + ab = audit_log_start(ctx); + if (ab) { + va_start(args, fmt); + audit_log_vformat(ab, fmt, args); + va_end(args); + audit_log_end(ab); + } +} + +EXPORT_SYMBOL_GPL(audit_set_rate_limit); +EXPORT_SYMBOL_GPL(audit_set_backlog_limit); +EXPORT_SYMBOL_GPL(audit_set_enabled); +EXPORT_SYMBOL_GPL(audit_set_failure); + +EXPORT_SYMBOL_GPL(audit_log_start); +EXPORT_SYMBOL_GPL(audit_log_format); +EXPORT_SYMBOL_GPL(audit_log_end_irq); +EXPORT_SYMBOL_GPL(audit_log_end_fast); +EXPORT_SYMBOL_GPL(audit_log_end); +EXPORT_SYMBOL_GPL(audit_log); +EXPORT_SYMBOL_GPL(audit_log_d_path); diff --git a/kernel/auditsc.c b/kernel/auditsc.c new file mode 100644 index 000000000000..342b57141fd9 --- /dev/null +++ b/kernel/auditsc.c @@ -0,0 +1,922 @@ +/* auditsc.c -- System-call auditing support -*- linux-c -*- + * Handles all system-call specific auditing features. + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Written by Rickard E. (Rik) Faith <faith@redhat.com> + * + * Many of the ideas implemented here are from Stephen C. Tweedie, + * especially the idea of avoiding a copy by using getname. + * + * The method for actual interception of syscall entry and exit (not in + * this file -- see entry.S) is based on a GPL'd patch written by + * okir@suse.de and Copyright 2003 SuSE Linux AG. + * + */ + +#include <linux/init.h> +#include <asm/atomic.h> +#include <asm/types.h> +#include <linux/mm.h> +#include <linux/module.h> + +#include <linux/audit.h> +#include <linux/personality.h> +#include <linux/time.h> +#include <asm/unistd.h> + +/* 0 = no checking + 1 = put_count checking + 2 = verbose put_count checking +*/ +#define AUDIT_DEBUG 0 + +/* No syscall auditing will take place unless audit_enabled != 0. */ +extern int audit_enabled; + +/* AUDIT_NAMES is the number of slots we reserve in the audit_context + * for saving names from getname(). */ +#define AUDIT_NAMES 20 + +/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the + * audit_context from being used for nameless inodes from + * path_lookup. */ +#define AUDIT_NAMES_RESERVED 7 + +/* At task start time, the audit_state is set in the audit_context using + a per-task filter. At syscall entry, the audit_state is augmented by + the syscall filter. */ +enum audit_state { + AUDIT_DISABLED, /* Do not create per-task audit_context. + * No syscall-specific audit records can + * be generated. */ + AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, + * but don't necessarily fill it in at + * syscall entry time (i.e., filter + * instead). */ + AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, + * and always fill it in at syscall + * entry time. 
This makes a full + * syscall record available if some + * other part of the kernel decides it + * should be recorded. */ + AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, + * always fill it in at syscall entry + * time, and always write out the audit + * record at syscall exit time. */ +}; + +/* When fs/namei.c:getname() is called, we store the pointer in name and + * we don't let putname() free it (instead we free all of the saved + * pointers at syscall exit time). + * + * Further, in fs/namei.c:path_lookup() we store the inode and device. */ +struct audit_names { + const char *name; + unsigned long ino; + dev_t rdev; +}; + +/* The per-task audit context. */ +struct audit_context { + int in_syscall; /* 1 if task is in a syscall */ + enum audit_state state; + unsigned int serial; /* serial number for record */ + struct timespec ctime; /* time of syscall entry */ + uid_t loginuid; /* login uid (identity) */ + int major; /* syscall number */ + unsigned long argv[4]; /* syscall arguments */ + int return_valid; /* return code is valid */ + int return_code;/* syscall return code */ + int auditable; /* 1 if record should be written */ + int name_count; + struct audit_names names[AUDIT_NAMES]; + struct audit_context *previous; /* For nested syscalls */ + + /* Save things to print about task_struct */ + pid_t pid; + uid_t uid, euid, suid, fsuid; + gid_t gid, egid, sgid, fsgid; + unsigned long personality; + +#if AUDIT_DEBUG + int put_count; + int ino_count; +#endif +}; + + /* Public API */ +/* There are three lists of rules -- one to search at task creation + * time, one to search at syscall entry time, and another to search at + * syscall exit time. */ +static LIST_HEAD(audit_tsklist); +static LIST_HEAD(audit_entlist); +static LIST_HEAD(audit_extlist); + +struct audit_entry { + struct list_head list; + struct rcu_head rcu; + struct audit_rule rule; +}; + +/* Check to see if two rules are identical. It is called from + * audit_del_rule during AUDIT_DEL. */ +static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) +{ + int i; + + if (a->flags != b->flags) + return 1; + + if (a->action != b->action) + return 1; + + if (a->field_count != b->field_count) + return 1; + + for (i = 0; i < a->field_count; i++) { + if (a->fields[i] != b->fields[i] + || a->values[i] != b->values[i]) + return 1; + } + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (a->mask[i] != b->mask[i]) + return 1; + + return 0; +} + +/* Note that audit_add_rule and audit_del_rule are called via + * audit_receive() in audit.c, and are protected by + * audit_netlink_sem. */ +static inline int audit_add_rule(struct audit_entry *entry, + struct list_head *list) +{ + if (entry->rule.flags & AUDIT_PREPEND) { + entry->rule.flags &= ~AUDIT_PREPEND; + list_add_rcu(&entry->list, list); + } else { + list_add_tail_rcu(&entry->list, list); + } + return 0; +} + +static void audit_free_rule(void *arg) +{ + kfree(arg); +} + +/* Note that audit_add_rule and audit_del_rule are called via + * audit_receive() in audit.c, and are protected by + * audit_netlink_sem. */ +static inline int audit_del_rule(struct audit_rule *rule, + struct list_head *list) +{ + struct audit_entry *e; + + /* Do not use the _rcu iterator here, since this is the only + * deletion routine. 
*/ + list_for_each_entry(e, list, list) { + if (!audit_compare_rule(rule, &e->rule)) { + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule, e); + return 0; + } + } + return -EFAULT; /* No matching rule */ +} + +#ifdef CONFIG_NET +/* Copy rule from user-space to kernel-space. Called during + * AUDIT_ADD. */ +static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +{ + int i; + + if (s->action != AUDIT_NEVER + && s->action != AUDIT_POSSIBLE + && s->action != AUDIT_ALWAYS) + return -1; + if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) + return -1; + + d->flags = s->flags; + d->action = s->action; + d->field_count = s->field_count; + for (i = 0; i < d->field_count; i++) { + d->fields[i] = s->fields[i]; + d->values[i] = s->values[i]; + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; + return 0; +} + +int audit_receive_filter(int type, int pid, int uid, int seq, void *data) +{ + u32 flags; + struct audit_entry *entry; + int err = 0; + + switch (type) { + case AUDIT_LIST: + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_sem held. */ + list_for_each_entry(entry, &audit_tsklist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + list_for_each_entry(entry, &audit_entlist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + list_for_each_entry(entry, &audit_extlist, list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + break; + case AUDIT_ADD: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) + return -ENOMEM; + if (audit_copy_rule(&entry->rule, data)) { + kfree(entry); + return -EINVAL; + } + flags = entry->rule.flags; + if (!err && (flags & AUDIT_PER_TASK)) + err = audit_add_rule(entry, &audit_tsklist); + if (!err && (flags & AUDIT_AT_ENTRY)) + err = audit_add_rule(entry, &audit_entlist); + if (!err && (flags & AUDIT_AT_EXIT)) + err = audit_add_rule(entry, &audit_extlist); + break; + case AUDIT_DEL: + flags =((struct audit_rule *)data)->flags; + if (!err && (flags & AUDIT_PER_TASK)) + err = audit_del_rule(data, &audit_tsklist); + if (!err && (flags & AUDIT_AT_ENTRY)) + err = audit_del_rule(data, &audit_entlist); + if (!err && (flags & AUDIT_AT_EXIT)) + err = audit_del_rule(data, &audit_extlist); + break; + default: + return -EINVAL; + } + + return err; +} +#endif + +/* Compare a task_struct with an audit_rule. Return 1 on match, 0 + * otherwise. 
*/ +static int audit_filter_rules(struct task_struct *tsk, + struct audit_rule *rule, + struct audit_context *ctx, + enum audit_state *state) +{ + int i, j; + + for (i = 0; i < rule->field_count; i++) { + u32 field = rule->fields[i] & ~AUDIT_NEGATE; + u32 value = rule->values[i]; + int result = 0; + + switch (field) { + case AUDIT_PID: + result = (tsk->pid == value); + break; + case AUDIT_UID: + result = (tsk->uid == value); + break; + case AUDIT_EUID: + result = (tsk->euid == value); + break; + case AUDIT_SUID: + result = (tsk->suid == value); + break; + case AUDIT_FSUID: + result = (tsk->fsuid == value); + break; + case AUDIT_GID: + result = (tsk->gid == value); + break; + case AUDIT_EGID: + result = (tsk->egid == value); + break; + case AUDIT_SGID: + result = (tsk->sgid == value); + break; + case AUDIT_FSGID: + result = (tsk->fsgid == value); + break; + case AUDIT_PERS: + result = (tsk->personality == value); + break; + + case AUDIT_EXIT: + if (ctx && ctx->return_valid) + result = (ctx->return_code == value); + break; + case AUDIT_SUCCESS: + if (ctx && ctx->return_valid) + result = (ctx->return_code >= 0); + break; + case AUDIT_DEVMAJOR: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (MAJOR(ctx->names[j].rdev)==value) { + ++result; + break; + } + } + } + break; + case AUDIT_DEVMINOR: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (MINOR(ctx->names[j].rdev)==value) { + ++result; + break; + } + } + } + break; + case AUDIT_INODE: + if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (MINOR(ctx->names[j].ino)==value) { + ++result; + break; + } + } + } + break; + case AUDIT_LOGINUID: + result = 0; + if (ctx) + result = (ctx->loginuid == value); + break; + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + if (ctx) + result = (ctx->argv[field-AUDIT_ARG0]==value); + break; + } + + if (rule->fields[i] & AUDIT_NEGATE) + result = !result; + if (!result) + return 0; + } + switch (rule->action) { + case AUDIT_NEVER: *state = AUDIT_DISABLED; break; + case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +/* At process creation time, we can determine if system-call auditing is + * completely disabled for this task. Since we only have the task + * structure at this point, we can only check uid and gid. + */ +static enum audit_state audit_filter_task(struct task_struct *tsk) +{ + struct audit_entry *e; + enum audit_state state; + + rcu_read_lock(); + list_for_each_entry_rcu(e, &audit_tsklist, list) { + if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { + rcu_read_unlock(); + return state; + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +/* At syscall entry and exit time, this filter is called if the + * audit_state is not low enough that auditing cannot take place, but is + * also not high enough that we already know we have to write and audit + * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). 
+ */ +static enum audit_state audit_filter_syscall(struct task_struct *tsk, + struct audit_context *ctx, + struct list_head *list) +{ + struct audit_entry *e; + enum audit_state state; + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + + rcu_read_lock(); + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit + && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + rcu_read_unlock(); + return state; + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +/* This should be called with task_lock() held. */ +static inline struct audit_context *audit_get_context(struct task_struct *tsk, + int return_valid, + int return_code) +{ + struct audit_context *context = tsk->audit_context; + + if (likely(!context)) + return NULL; + context->return_valid = return_valid; + context->return_code = return_code; + + if (context->in_syscall && !context->auditable) { + enum audit_state state; + state = audit_filter_syscall(tsk, context, &audit_extlist); + if (state == AUDIT_RECORD_CONTEXT) + context->auditable = 1; + } + + context->pid = tsk->pid; + context->uid = tsk->uid; + context->gid = tsk->gid; + context->euid = tsk->euid; + context->suid = tsk->suid; + context->fsuid = tsk->fsuid; + context->egid = tsk->egid; + context->sgid = tsk->sgid; + context->fsgid = tsk->fsgid; + context->personality = tsk->personality; + tsk->audit_context = NULL; + return context; +} + +static inline void audit_free_names(struct audit_context *context) +{ + int i; + +#if AUDIT_DEBUG == 2 + if (context->auditable + ||context->put_count + context->ino_count != context->name_count) { + printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" + " name_count=%d put_count=%d" + " ino_count=%d [NOT freeing]\n", + __LINE__, + context->serial, context->major, context->in_syscall, + context->name_count, context->put_count, + context->ino_count); + for (i = 0; i < context->name_count; i++) + printk(KERN_ERR "names[%d] = %p = %s\n", i, + context->names[i].name, + context->names[i].name); + dump_stack(); + return; + } +#endif +#if AUDIT_DEBUG + context->put_count = 0; + context->ino_count = 0; +#endif + + for (i = 0; i < context->name_count; i++) + if (context->names[i].name) + __putname(context->names[i].name); + context->name_count = 0; +} + +static inline void audit_zero_context(struct audit_context *context, + enum audit_state state) +{ + uid_t loginuid = context->loginuid; + + memset(context, 0, sizeof(*context)); + context->state = state; + context->loginuid = loginuid; +} + +static inline struct audit_context *audit_alloc_context(enum audit_state state) +{ + struct audit_context *context; + + if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + return NULL; + audit_zero_context(context, state); + return context; +} + +/* Filter on the task information and allocate a per-task audit context + * if necessary. Doing so turns on system call auditing for the + * specified task. This is called from copy_process, so no lock is + * needed. */ +int audit_alloc(struct task_struct *tsk) +{ + struct audit_context *context; + enum audit_state state; + + if (likely(!audit_enabled)) + return 0; /* Return if not auditing. 
*/ + + state = audit_filter_task(tsk); + if (likely(state == AUDIT_DISABLED)) + return 0; + + if (!(context = audit_alloc_context(state))) { + audit_log_lost("out of memory in audit_alloc"); + return -ENOMEM; + } + + /* Preserve login uid */ + context->loginuid = -1; + if (tsk->audit_context) + context->loginuid = tsk->audit_context->loginuid; + + tsk->audit_context = context; + set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + return 0; +} + +static inline void audit_free_context(struct audit_context *context) +{ + struct audit_context *previous; + int count = 0; + + do { + previous = context->previous; + if (previous || (count && count < 10)) { + ++count; + printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" + " freeing multiple contexts (%d)\n", + context->serial, context->major, + context->name_count, count); + } + audit_free_names(context); + kfree(context); + context = previous; + } while (context); + if (count >= 10) + printk(KERN_ERR "audit: freed %d contexts\n", count); +} + +static void audit_log_exit(struct audit_context *context) +{ + int i; + struct audit_buffer *ab; + + ab = audit_log_start(context); + if (!ab) + return; /* audit_panic has been called */ + audit_log_format(ab, "syscall=%d", context->major); + if (context->personality != PER_LINUX) + audit_log_format(ab, " per=%lx", context->personality); + if (context->return_valid) + audit_log_format(ab, " exit=%u", context->return_code); + audit_log_format(ab, + " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" + " pid=%d loginuid=%d uid=%d gid=%d" + " euid=%d suid=%d fsuid=%d" + " egid=%d sgid=%d fsgid=%d", + context->argv[0], + context->argv[1], + context->argv[2], + context->argv[3], + context->name_count, + context->pid, + context->loginuid, + context->uid, + context->gid, + context->euid, context->suid, context->fsuid, + context->egid, context->sgid, context->fsgid); + audit_log_end(ab); + for (i = 0; i < context->name_count; i++) { + ab = audit_log_start(context); + if (!ab) + continue; /* audit_panic has been called */ + audit_log_format(ab, "item=%d", i); + if (context->names[i].name) + audit_log_format(ab, " name=%s", + context->names[i].name); + if (context->names[i].ino != (unsigned long)-1) + audit_log_format(ab, " inode=%lu", + context->names[i].ino); + /* FIXME: should use format_dev_t, but ab structure is + * opaque. */ + if (context->names[i].rdev != -1) + audit_log_format(ab, " dev=%02x:%02x", + MAJOR(context->names[i].rdev), + MINOR(context->names[i].rdev)); + audit_log_end(ab); + } +} + +/* Free a per-task audit context. Called from copy_process and + * __put_task_struct. */ +void audit_free(struct task_struct *tsk) +{ + struct audit_context *context; + + task_lock(tsk); + context = audit_get_context(tsk, 0, 0); + task_unlock(tsk); + + if (likely(!context)) + return; + + /* Check for system calls that do not go through the exit + * function (e.g., exit_group), then free context block. */ + if (context->in_syscall && context->auditable) + audit_log_exit(context); + + audit_free_context(context); +} + +/* Compute a serial number for the audit record. Audit records are + * written to user-space as soon as they are generated, so a complete + * audit record may be written in several pieces. The timestamp of the + * record and this serial number are used by the user-space daemon to + * determine which pieces belong to the same audit record. The + * (timestamp,serial) tuple is unique for each syscall and is live from + * syscall entry to syscall exit. 
+ * + * Atomic values are only guaranteed to be 24-bit, so we count down. + * + * NOTE: Another possibility is to store the formatted records off the + * audit context (for those records that have a context), and emit them + * all at syscall exit. However, this could delay the reporting of + * significant errors until syscall exit (or never, if the system + * halts). */ +static inline unsigned int audit_serial(void) +{ + static atomic_t serial = ATOMIC_INIT(0xffffff); + unsigned int a, b; + + do { + a = atomic_read(&serial); + if (atomic_dec_and_test(&serial)) + atomic_set(&serial, 0xffffff); + b = atomic_read(&serial); + } while (b != a - 1); + + return 0xffffff - b; +} + +/* Fill in audit context at syscall entry. This only happens if the + * audit context was created when the task was created and the state or + * filters demand the audit context be built. If the state from the + * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, + * then the record will be written at syscall exit time (otherwise, it + * will only be written if another part of the kernel requests that it + * be written). */ +void audit_syscall_entry(struct task_struct *tsk, int major, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4) +{ + struct audit_context *context = tsk->audit_context; + enum audit_state state; + + BUG_ON(!context); + + /* This happens only on certain architectures that make system + * calls in kernel_thread via the entry.S interface, instead of + * with direct calls. (If you are porting to a new + * architecture, hitting this condition can indicate that you + * got the _exit/_leave calls backward in entry.S.) + * + * i386 no + * x86_64 no + * ppc64 yes (see arch/ppc64/kernel/misc.S) + * + * This also happens with vm86 emulation in a non-nested manner + * (entries without exits), so this case must be caught. + */ + if (context->in_syscall) { + struct audit_context *newctx; + +#if defined(__NR_vm86) && defined(__NR_vm86old) + /* vm86 mode should only be entered once */ + if (major == __NR_vm86 || major == __NR_vm86old) + return; +#endif +#if AUDIT_DEBUG + printk(KERN_ERR + "audit(:%d) pid=%d in syscall=%d;" + " entering syscall=%d\n", + context->serial, tsk->pid, context->major, major); +#endif + newctx = audit_alloc_context(context->state); + if (newctx) { + newctx->previous = context; + context = newctx; + tsk->audit_context = newctx; + } else { + /* If we can't alloc a new context, the best we + * can do is to leak memory (any pending putname + * will be lost). The only other alternative is + * to abandon auditing. */ + audit_zero_context(context, context->state); + } + } + BUG_ON(context->in_syscall || context->name_count); + + if (!audit_enabled) + return; + + context->major = major; + context->argv[0] = a1; + context->argv[1] = a2; + context->argv[2] = a3; + context->argv[3] = a4; + + state = context->state; + if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) + state = audit_filter_syscall(tsk, context, &audit_entlist); + if (likely(state == AUDIT_DISABLED)) + return; + + context->serial = audit_serial(); + context->ctime = CURRENT_TIME; + context->in_syscall = 1; + context->auditable = !!(state == AUDIT_RECORD_CONTEXT); +} + +/* Tear down after system call. If the audit context has been marked as + * auditable (either because of the AUDIT_RECORD_CONTEXT state from + * filtering, or because some other part of the kernel write an audit + * message), then write out the syscall information. 
In call cases, + * free the names stored from getname(). */ +void audit_syscall_exit(struct task_struct *tsk, int return_code) +{ + struct audit_context *context; + + get_task_struct(tsk); + task_lock(tsk); + context = audit_get_context(tsk, 1, return_code); + task_unlock(tsk); + + /* Not having a context here is ok, since the parent may have + * called __put_task_struct. */ + if (likely(!context)) + return; + + if (context->in_syscall && context->auditable) + audit_log_exit(context); + + context->in_syscall = 0; + context->auditable = 0; + if (context->previous) { + struct audit_context *new_context = context->previous; + context->previous = NULL; + audit_free_context(context); + tsk->audit_context = new_context; + } else { + audit_free_names(context); + audit_zero_context(context, context->state); + tsk->audit_context = context; + } + put_task_struct(tsk); +} + +/* Add a name to the list. Called from fs/namei.c:getname(). */ +void audit_getname(const char *name) +{ + struct audit_context *context = current->audit_context; + + BUG_ON(!context); + if (!context->in_syscall) { +#if AUDIT_DEBUG == 2 + printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", + __FILE__, __LINE__, context->serial, name); + dump_stack(); +#endif + return; + } + BUG_ON(context->name_count >= AUDIT_NAMES); + context->names[context->name_count].name = name; + context->names[context->name_count].ino = (unsigned long)-1; + context->names[context->name_count].rdev = -1; + ++context->name_count; +} + +/* Intercept a putname request. Called from + * include/linux/fs.h:putname(). If we have stored the name from + * getname in the audit context, then we delay the putname until syscall + * exit. */ +void audit_putname(const char *name) +{ + struct audit_context *context = current->audit_context; + + BUG_ON(!context); + if (!context->in_syscall) { +#if AUDIT_DEBUG == 2 + printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", + __FILE__, __LINE__, context->serial, name); + if (context->name_count) { + int i; + for (i = 0; i < context->name_count; i++) + printk(KERN_ERR "name[%d] = %p = %s\n", i, + context->names[i].name, + context->names[i].name); + } +#endif + __putname(name); + } +#if AUDIT_DEBUG + else { + ++context->put_count; + if (context->put_count > context->name_count) { + printk(KERN_ERR "%s:%d(:%d): major=%d" + " in_syscall=%d putname(%p) name_count=%d" + " put_count=%d\n", + __FILE__, __LINE__, + context->serial, context->major, + context->in_syscall, name, context->name_count, + context->put_count); + dump_stack(); + } + } +#endif +} + +/* Store the inode and device from a lookup. Called from + * fs/namei.c:path_lookup(). */ +void audit_inode(const char *name, unsigned long ino, dev_t rdev) +{ + int idx; + struct audit_context *context = current->audit_context; + + if (!context->in_syscall) + return; + if (context->name_count + && context->names[context->name_count-1].name + && context->names[context->name_count-1].name == name) + idx = context->name_count - 1; + else if (context->name_count > 1 + && context->names[context->name_count-2].name + && context->names[context->name_count-2].name == name) + idx = context->name_count - 2; + else { + /* FIXME: how much do we care about inodes that have no + * associated name? 
*/ + if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED) + return; + idx = context->name_count++; + context->names[idx].name = NULL; +#if AUDIT_DEBUG + ++context->ino_count; +#endif + } + context->names[idx].ino = ino; + context->names[idx].rdev = rdev; +} + +void audit_get_stamp(struct audit_context *ctx, + struct timespec *t, int *serial) +{ + if (ctx) { + t->tv_sec = ctx->ctime.tv_sec; + t->tv_nsec = ctx->ctime.tv_nsec; + *serial = ctx->serial; + ctx->auditable = 1; + } else { + *t = CURRENT_TIME; + *serial = 0; + } +} + +int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid) +{ + if (ctx) { + if (loginuid < 0) + return -EINVAL; + ctx->loginuid = loginuid; + } + return 0; +} + +EXPORT_SYMBOL_GPL(audit_alloc); +EXPORT_SYMBOL_GPL(audit_free); +EXPORT_SYMBOL_GPL(audit_syscall_entry); +EXPORT_SYMBOL_GPL(audit_syscall_exit); +EXPORT_SYMBOL_GPL(audit_getname); +EXPORT_SYMBOL_GPL(audit_putname); +EXPORT_SYMBOL_GPL(audit_inode); diff --git a/kernel/exit.c b/kernel/exit.c index 308f6959add6..0ec66729ead8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -92,7 +92,6 @@ repeat: p->parent->cstime += p->stime + p->cstime; p->parent->cmin_flt += p->min_flt + p->cmin_flt; p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt; - p->parent->cnswap += p->nswap + p->cnswap; p->parent->cnvcsw += p->nvcsw + p->cnvcsw; p->parent->cnivcsw += p->nivcsw + p->cnivcsw; sched_exit(p); @@ -136,13 +135,13 @@ int session_of_pgrp(int pgrp) read_lock(&tasklist_lock); for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) - if (p->session > 0) { - sid = p->session; + if (p->signal->session > 0) { + sid = p->signal->session; goto out; } p = find_task_by_pid(pgrp); if (p) - sid = p->session; + sid = p->signal->session; out: read_unlock(&tasklist_lock); @@ -170,7 +169,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) || p->real_parent->pid == 1) continue; if (process_group(p->real_parent) != pgrp - && p->real_parent->session == p->session) { + && p->real_parent->signal->session == p->signal->session) { ret = 0; break; } @@ -259,14 +258,14 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current; - if (curr->session != session) { + if (curr->signal->session != session) { detach_pid(curr, PIDTYPE_SID); - curr->session = session; + curr->signal->session = session; attach_pid(curr, PIDTYPE_SID, session); } if (process_group(curr) != pgrp) { detach_pid(curr, PIDTYPE_PGID); - curr->group_leader->__pgrp = pgrp; + curr->signal->pgrp = pgrp; attach_pid(curr, PIDTYPE_PGID, pgrp); } } @@ -341,7 +340,7 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - current->tty = NULL; + current->signal->tty = NULL; /* Block and flush all signals */ sigfillset(&blocked); @@ -564,7 +563,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) * outside, so the child pgrp is now orphaned. 
*/ if ((process_group(p) != process_group(father)) && - (p->session == father->session)) { + (p->signal->session == father->signal->session)) { int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { @@ -675,7 +674,7 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; if ((process_group(t) != process_group(tsk)) && - (t->session == tsk->session) && + (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); @@ -777,10 +776,9 @@ asmlinkage NORET_TYPE void do_exit(long code) __exit_files(tsk); __exit_fs(tsk); exit_namespace(tsk); - exit_itimers(tsk); exit_thread(); - if (tsk->leader) + if (tsk->signal->leader) disassociate_ctty(1); module_put(tsk->thread_info->exec_domain->module); diff --git a/kernel/fork.c b/kernel/fork.c index 3b17a249c50d..4f5b018777d8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -21,6 +21,7 @@ #include <linux/completion.h> #include <linux/namespace.h> #include <linux/personality.h> +#include <linux/sem.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/mman.h> @@ -31,6 +32,7 @@ #include <linux/futex.h> #include <linux/ptrace.h> #include <linux/mount.h> +#include <linux/audit.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -39,9 +41,6 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); -extern void exit_sem(struct task_struct *tsk); - /* The idle threads do not count.. * Protected by write_lock_irq(&tasklist_lock) */ @@ -85,6 +84,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + if (unlikely(tsk->audit_context)) + audit_free(tsk); security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); @@ -209,11 +210,14 @@ EXPORT_SYMBOL(autoremove_wake_function); void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef ARCH_MIN_TASKALIGN +#define ARCH_MIN_TASKALIGN 0 +#endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - sizeof(struct task_struct),0, - SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); + sizeof(struct task_struct),ARCH_MIN_TASKALIGN, + 0, NULL, NULL); if (!task_struct_cachep) panic("fork_init(): cannot create task_struct SLAB cache"); #endif @@ -322,7 +326,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) /* insert tmp into the share list, just after mpnt */ down(&file->f_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); + list_add(&tmp->shared, &mpnt->shared); up(&file->f_mapping->i_shared_sem); } @@ -512,7 +516,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->cmin_flt = tsk->cmaj_flt = 0; - tsk->nswap = tsk->cnswap = 0; tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0; tsk->mm = NULL; @@ -812,6 +815,13 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->group_stop_count = 0; sig->curr_target = NULL; init_sigpending(&sig->shared_pending); + INIT_LIST_HEAD(&sig->posix_timers); + + sig->tty = current->signal->tty; + sig->pgrp = process_group(current); + sig->session = current->signal->session; + sig->leader = 0; /* session leadership doesn't inherit */ + sig->tty_old_pgrp = 0; return 0; } @@ -923,7 +933,6 @@ struct 
@@ -923,7 +933,6 @@ struct task_struct *copy_process(unsigned long clone_flags,

 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-	INIT_LIST_HEAD(&p->posix_timers);
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
@@ -937,21 +946,22 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	init_timer(&p->real_timer);
 	p->real_timer.data = (unsigned long) p;

-	p->leader = 0;		/* session leadership doesn't inherit */
-	p->tty_old_pgrp = 0;
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = get_jiffies_64();
 	p->security = NULL;
 	p->io_context = NULL;
+	p->audit_context = NULL;

 	retval = -ENOMEM;
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup;
+	if ((retval = audit_alloc(p)))
+		goto bad_fork_cleanup_security;
 	/* copy all the process information */
 	if ((retval = copy_semundo(clone_flags, p)))
-		goto bad_fork_cleanup_security;
+		goto bad_fork_cleanup_audit;
 	if ((retval = copy_files(clone_flags, p)))
 		goto bad_fork_cleanup_semundo;
 	if ((retval = copy_fs(clone_flags, p)))
@@ -1057,7 +1067,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_TGID, p->tgid);
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->session);
+		attach_pid(p, PIDTYPE_SID, p->signal->session);
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	} else
@@ -1076,6 +1086,8 @@ bad_fork_cleanup_namespace:
 	exit_namespace(p);
 bad_fork_cleanup_mm:
 	exit_mm(p);
+	if (p->active_mm)
+		mmdrop(p->active_mm);
 bad_fork_cleanup_signal:
 	exit_signal(p);
 bad_fork_cleanup_sighand:
@@ -1086,6 +1098,8 @@ bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup_semundo:
 	exit_sem(p);
+bad_fork_cleanup_audit:
+	audit_free(p);
 bad_fork_cleanup_security:
 	security_task_free(p);
 bad_fork_cleanup:
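audit_alloc() slots into copy_process()'s goto-based unwind ladder: resources are acquired in a fixed order, each gets a cleanup label, and a failure at any step jumps to the label that releases everything acquired so far, in reverse. The shape of that idiom in miniature (hypothetical resources; standalone C):

#include <stdlib.h>

struct ctx { void *security, *audit, *files; };

/* Acquire in order; on failure, fall through the labels so each one
 * undoes exactly the steps that succeeded before it. */
static int setup_ctx(struct ctx *c)
{
	if (!(c->security = malloc(16)))
		goto bad;
	if (!(c->audit = malloc(16)))
		goto bad_security;	/* audit failed: undo security */
	if (!(c->files = malloc(16)))
		goto bad_audit;		/* files failed: undo audit, then security */
	return 0;

bad_audit:
	free(c->audit);
bad_security:
	free(c->security);
bad:
	return -1;
}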
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5261de82029b..0002fcd4c554 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 	};
 	DECLARE_WORK(work, __call_usermodehelper, &sub_info);

-	if (!system_running)
+	if (system_state != SYSTEM_RUNNING)
 		return -EBUSY;

 	if (path[0] == '\0')
diff --git a/kernel/module.c b/kernel/module.c
index 16587e133b1b..a472deef9bdf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -493,7 +493,6 @@ static inline int __try_stop_module(void *_sref)
 	}

 	/* Mark it as dying. */
-	sref->mod->waiter = current;
 	sref->mod->state = MODULE_STATE_GOING;
 	return 0;
 }
@@ -588,6 +587,9 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		}
 	}

+	/* Set this up before setting mod->state */
+	mod->waiter = current;
+
 	/* Stop the machine so refcounts can't move and disable module. */
 	ret = try_stop_module(mod, flags, &forced);
diff --git a/kernel/params.c b/kernel/params.c
index 4d9a71b743c5..59667bce9ce0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -96,6 +96,13 @@ static char *next_arg(char *args, char **param, char **val)
 		else {
 			args[equals] = '\0';
 			*val = args + equals + 1;
+
+			/* Don't include quotes in value. */
+			if (**val == '"') {
+				(*val)++;
+				if (args[i-1] == '"')
+					args[i-1] = '\0';
+			}
 		}
 	}

 	if (args[i]) {
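The quote handling above lets a boot parameter such as param="two words" arrive in the module code without its surrounding quotes. The same stripping logic in isolation (standalone C, not the kernel's next_arg() itself):

#include <stdio.h>
#include <string.h>

/* Strip one pair of surrounding double quotes from a value, in place.
 * Mirrors the intent of the hunk above; simplified illustration. */
static char *strip_quotes(char *val)
{
	size_t len = strlen(val);

	if (len >= 2 && val[0] == '"' && val[len - 1] == '"') {
		val[len - 1] = '\0';	/* drop closing quote */
		val++;			/* skip opening quote */
	}
	return val;
}

int main(void)
{
	char arg[] = "\"two words\"";
	printf("%s\n", strip_quotes(arg));	/* prints: two words */
	return 0;
}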
diff --git a/kernel/pid.c b/kernel/pid.c
index 4c85144759c5..6ed44f56ca45 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -253,14 +253,14 @@ void switch_exec_pids(task_t *leader, task_t *thread)

 	attach_pid(thread, PIDTYPE_PID, thread->pid);
 	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-	attach_pid(thread, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(thread, PIDTYPE_SID, thread->session);
+	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
 	list_add_tail(&thread->tasks, &init_task.tasks);

 	attach_pid(leader, PIDTYPE_PID, leader->pid);
 	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-	attach_pid(leader, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(leader, PIDTYPE_SID, leader->session);
+	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
+	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
 }

 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 082693e383cf..3de4d0ae9d26 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -317,12 +317,21 @@ static void timer_notify_task(struct k_itimer *timr)
 	if (timr->it_incr)
 		timr->sigq->info.si_sys_private = ++timr->it_requeue_pending;

-	if (timr->it_sigev_notify & SIGEV_THREAD_ID )
+	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+		if (unlikely(timr->it_process->flags & PF_EXITING)) {
+			timr->it_sigev_notify = SIGEV_SIGNAL;
+			put_task_struct(timr->it_process);
+			timr->it_process = timr->it_process->group_leader;
+			goto group;
+		}
 		ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
-	else
+	}
+	else {
+	group:
 		ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
+	}
 	if (ret) {
 		/*
 		 * signal was not sent because of sig_ignor
@@ -352,7 +361,7 @@ static void posix_timer_fn(unsigned long __data)

 static inline struct task_struct * good_sigevent(sigevent_t * event)
 {
-	struct task_struct *rtn = current;
+	struct task_struct *rtn = current->group_leader;

 	if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
 		(!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
@@ -395,11 +404,15 @@ static struct k_itimer * alloc_posix_timer(void)
 static void release_posix_timer(struct k_itimer *tmr)
 {
 	if (tmr->it_id != -1) {
-		spin_lock_irq(&idr_lock);
+		unsigned long flags;
+		spin_lock_irqsave(&idr_lock, flags);
 		idr_remove(&posix_timers_id, tmr->it_id);
-		spin_unlock_irq(&idr_lock);
+		spin_unlock_irqrestore(&idr_lock, flags);
 	}
 	sigqueue_free(tmr->sigq);
+	if (unlikely(tmr->it_process) &&
+	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+		put_task_struct(tmr->it_process);
 	kmem_cache_free(posix_timers_cache, tmr);
 }

@@ -414,6 +427,7 @@ sys_timer_create(clockid_t which_clock,
 	struct k_itimer *new_timer = NULL;
 	timer_t new_timer_id;
 	struct task_struct *process = 0;
+	unsigned long flags;
 	sigevent_t event;

 	if ((unsigned) which_clock >= MAX_CLOCKS ||
@@ -458,7 +472,7 @@ sys_timer_create(clockid_t which_clock,
 			 * We may be setting up this process for another
 			 * thread.  It may be exiting.  To catch this
 			 * case we check the PF_EXITING flag.  If
-			 * the flag is not set, the task_lock will catch
+			 * the flag is not set, the siglock will catch
 			 * him before it is too late (in exit_itimers).
 			 *
 			 * The exec case is a bit more involved but easy
@@ -469,13 +483,14 @@ sys_timer_create(clockid_t which_clock,
 			 * for us to die which means we can finish this
 			 * linkage with our last gasp. I.e. no code :)
 			 */
-			task_lock(process);
+			spin_lock_irqsave(&process->sighand->siglock, flags);
 			if (!(process->flags & PF_EXITING)) {
 				list_add(&new_timer->list,
-					 &process->posix_timers);
-				task_unlock(process);
+					 &process->signal->posix_timers);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
+				get_task_struct(process);
 			} else {
-				task_unlock(process);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
 				process = 0;
 			}
 		}
@@ -491,10 +506,10 @@ sys_timer_create(clockid_t which_clock,
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;
 		new_timer->it_sigev_signo = SIGALRM;
 		new_timer->it_sigev_value.sival_int = new_timer->it_id;
-		process = current;
-		task_lock(process);
-		list_add(&new_timer->list, &process->posix_timers);
-		task_unlock(process);
+		process = current->group_leader;
+		spin_lock_irqsave(&process->sighand->siglock, flags);
+		list_add(&new_timer->list, &process->signal->posix_timers);
+		spin_unlock_irqrestore(&process->sighand->siglock, flags);
 	}

 	new_timer->it_clock = which_clock;
@@ -925,14 +940,18 @@ retry_delete:
 #else
 	p_timer_del(&posix_clocks[timer->it_clock], timer);
 #endif
-	task_lock(timer->it_process);
+	spin_lock(&current->sighand->siglock);
 	list_del(&timer->list);
-	task_unlock(timer->it_process);
+	spin_unlock(&current->sighand->siglock);
 	/*
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
 		timer->it_process = NULL;
+	}
 	unlock_timer(timer, flags);
 	release_posix_timer(timer);
 	return 0;
@@ -942,24 +961,50 @@ retry_delete:
  */
 static inline void itimer_delete(struct k_itimer *timer)
 {
-	if (sys_timer_delete(timer->it_id))
-		BUG();
+	unsigned long flags;
+
+#ifdef CONFIG_SMP
+	int error;
+retry_delete:
+#endif
+	spin_lock_irqsave(&timer->it_lock, flags);
+
+#ifdef CONFIG_SMP
+	error = p_timer_del(&posix_clocks[timer->it_clock], timer);
+
+	if (error == TIMER_RETRY) {
+		unlock_timer(timer, flags);
+		goto retry_delete;
+	}
+#else
+	p_timer_del(&posix_clocks[timer->it_clock], timer);
+#endif
+	list_del(&timer->list);
+	/*
+	 * This keeps any tasks waiting on the spin lock from thinking
+	 * they got something (see the lock code above).
+	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
+		timer->it_process = NULL;
+	}
+	unlock_timer(timer, flags);
+	release_posix_timer(timer);
 }
+
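The common thread in these posix-timers hunks: whenever a k_itimer caches a task pointer for SIGEV_THREAD_ID delivery, it now takes a reference (get_task_struct() under the siglock) and every teardown path drops it with put_task_struct() exactly once before clearing the pointer. The pairing, modelled in portable C11; the types are stand-ins, not the kernel API:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct task {
	atomic_int usage;	/* stands in for the task usage counter, starts at 1 */
	pthread_mutex_t siglock;
};

struct ktimer {
	struct task *target;	/* non-NULL only while we hold a reference */
};

static void timer_bind(struct ktimer *t, struct task *p)
{
	pthread_mutex_lock(&p->siglock);
	atomic_fetch_add(&p->usage, 1);	/* get_task_struct() analogue */
	t->target = p;
	pthread_mutex_unlock(&p->siglock);
}

static void timer_release(struct ktimer *t)
{
	if (t->target) {
		/* put_task_struct() analogue: drop exactly once per bind */
		if (atomic_fetch_sub(&t->target->usage, 1) == 1)
			free(t->target);
		t->target = NULL;
	}
}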
 /*
- * This is exported to exit and exec
+ * This is called by __exit_signal, only when there are no more
+ * references to the shared signal_struct.
  */
-void exit_itimers(struct task_struct *tsk)
+void exit_itimers(struct signal_struct *sig)
 {
 	struct k_itimer *tmr;

-	task_lock(tsk);
-	while (!list_empty(&tsk->posix_timers)) {
-		tmr = list_entry(tsk->posix_timers.next, struct k_itimer, list);
-		task_unlock(tsk);
+	while (!list_empty(&sig->posix_timers)) {
+		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
 		itimer_delete(tmr);
-		task_lock(tsk);
 	}
-	task_unlock(tsk);
 }

 /*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 033eea403f26..6bb62269f3eb 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -9,9 +9,9 @@ config PM

 	  Power Management is most important for battery powered laptop
 	  computers; if you have a laptop, check out the Linux Laptop home
-	  page on the WWW at
-	  <http://www.cs.utexas.edu/users/kharker/linux-laptop/> and the
-	  Battery Powered Linux mini-HOWTO, available from
+	  page on the WWW at <http://www.linux-on-laptops.com/> or
+	  Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
+	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.

 	  Note that, even if you say N here, Linux on the x86 architecture
@@ -44,7 +44,7 @@ config SOFTWARE_SUSPEND

 config PM_DISK
 	bool "Suspend-to-Disk Support"
-	depends on PM && SWAP
+	depends on PM && SWAP && X86 && !X86_64
 	---help---
 	  Suspend-to-disk is a power management state in which the contents
 	  of memory are stored on disk and the entire system is shut down or
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 2d4cf319b8e1..6abcf99b7ada 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -84,7 +84,6 @@ static void free_some_memory(void)
 	while (shrink_all_memory(10000))
 		printk(".");
 	printk("|\n");
-	blk_run_queues();
 }

@@ -285,11 +284,16 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 {
 	int error = 0;
 	int i;
+	int len;
+	char *p;
 	u32 mode = 0;

+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
 	down(&pm_sem);
 	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
-		if (!strcmp(buf,pm_disk_modes[i])) {
+		if (!strncmp(buf, pm_disk_modes[i], len)) {
 			mode = i;
 			break;
 		}
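disk_store() above (and state_store() below) stop comparing the raw sysfs buffer with strcmp(), because a command like `echo disk > /sys/power/state` delivers a trailing newline that made every comparison fail. The memchr()/strncmp() idiom in isolation; the mode table is hypothetical, and the extra strlen() check (an addition of this sketch, not of the patch) also rejects bare prefixes:

#include <stdio.h>
#include <string.h>

static const char *modes[] = { "firmware", "platform", "shutdown", NULL };

/* Match user input against a mode name, ignoring a trailing newline. */
static int parse_mode(const char *buf, size_t n)
{
	const char *p = memchr(buf, '\n', n);
	size_t len = p ? (size_t)(p - buf) : n;
	int i;

	for (i = 0; modes[i]; i++)
		if (strlen(modes[i]) == len && !strncmp(buf, modes[i], len))
			return i;
	return -1;
}

int main(void)
{
	printf("%d\n", parse_mode("platform\n", 9));	/* prints: 1 */
	return 0;
}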
diff --git a/kernel/power/main.c b/kernel/power/main.c
index fd212e7ecd9f..d582906fecc6 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -218,10 +218,15 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
 {
 	u32 state = PM_SUSPEND_STANDBY;
 	char ** s;
+	char *p;
 	int error;
+	int len;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;

 	for (s = &pm_states[state]; *s; s++, state++) {
-		if (!strcmp(buf,*s))
+		if (!strncmp(buf, *s, len))
 			break;
 	}
 	if (*s)
diff --git a/kernel/power/pmdisk.c b/kernel/power/pmdisk.c
index 1dc29b53a25e..22855abbdd6e 100644
--- a/kernel/power/pmdisk.c
+++ b/kernel/power/pmdisk.c
@@ -35,7 +35,7 @@

 #include "power.h"

-extern int pmdisk_arch_suspend(int resume);
+extern asmlinkage int pmdisk_arch_suspend(int resume);

 #define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
 #define ADDRESS(x)  __ADDRESS((x) << PAGE_SHIFT)
@@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsigned int num, int err)

 static void wait_io(void)
 {
-	blk_run_queues();
 	while(atomic_read(&io_done))
 		io_schedule();
 }
@@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_off, void * page)
 	if (rw == WRITE)
 		bio_set_pages_dirty(bio);
 	start_io();
-	submit_bio(rw,bio);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 	wait_io();
 Done:
 	bio_put(bio);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 15c1b340c2ed..8225457183ed 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,7 +30,8 @@ static inline int freezeable(struct task_struct * p)
 	if ((p == current) ||
 	    (p->flags & PF_IOTHREAD) ||
 	    (p->state == TASK_ZOMBIE) ||
-	    (p->state == TASK_DEAD))
+	    (p->state == TASK_DEAD) ||
+	    (p->state == TASK_STOPPED))
 		return 0;
 	return 1;
 }
@@ -38,21 +39,19 @@ static inline int freezeable(struct task_struct * p)
 /* Refrigerator is place where frozen processes are stored :-). */
 void refrigerator(unsigned long flag)
 {
-	/* You need correct to work with real-time processes.
-	   OTOH, this way one process may see (via /proc/) some other
-	   process in stopped state (and thereby discovered we were
-	   suspended. We probably do not care.
-	 */
+	/* Hmm, should we be allowed to suspend when there are realtime
+	   processes around? */
 	long save;
 	save = current->state;
-	current->state = TASK_STOPPED;
+	current->state = TASK_UNINTERRUPTIBLE;
 	pr_debug("%s entered refrigerator\n", current->comm);
 	printk("=");
 	current->flags &= ~PF_FREEZE;
-	if (flag)
-		flush_signals(current); /* We have signaled a kernel thread, which isn't normal behaviour
-					   and that may lead to 100%CPU sucking because those threads
-					   just don't manage signals. */
+
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* We sent fake signal, clean it up */
+	spin_unlock_irq(&current->sighand->siglock);
+
 	current->flags |= PF_FROZEN;
 	while (current->flags & PF_FROZEN)
 		schedule();
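The refrigerator is a two-flag handshake: the suspend path sets PF_FREEZE on each task and pokes it with a fake signal; the task clears PF_FREEZE, sets PF_FROZEN, and sleeps (now uninterruptibly) until resume clears PF_FROZEN. The same handshake modelled with a mutex and condition variable standing in for the scheduler; this is an analogy, not kernel code:

#include <pthread.h>

enum { FLAG_FREEZE = 1, FLAG_FROZEN = 2 };

static int flags;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* Called by the task being frozen (the freezer sets FLAG_FREEZE first). */
void refrigerator(void)
{
	pthread_mutex_lock(&lock);
	flags &= ~FLAG_FREEZE;		/* acknowledge the request... */
	flags |= FLAG_FROZEN;		/* ...and report ourselves frozen */
	pthread_cond_broadcast(&cond);
	while (flags & FLAG_FROZEN)	/* sleep until thawed */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

/* Called by the resume path to release every frozen task. */
void thaw(void)
{
	pthread_mutex_lock(&lock);
	flags &= ~FLAG_FROZEN;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}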
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 20134ab8e0b2..8f78d6807576 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,11 @@
 /*
- * linux/kernel/suspend.c
+ * linux/kernel/power/swsusp.c
  *
  * This file is to realize architecture-independent
  * machine suspend feature using pretty near only high-level routines
  *
  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
  *
  * This file is released under the GPLv2.
  *
@@ -61,6 +61,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/console.h>
+#include <linux/highmem.h>

 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -74,11 +75,6 @@ unsigned char software_suspend_enabled = 0;
 #define NORESUME		1
 #define RESUME_SPECIFIED	2

-
-#define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
-#define ADDRESS(x)  __ADDRESS((x) << PAGE_SHIFT)
-#define ADDRESS2(x) __ADDRESS(__pa(x))	/* Needed for x86-64 where some pages are in memory twice */
-
 /* References to section boundaries */
 extern char __nosave_begin, __nosave_end;
@@ -105,6 +101,10 @@
    time of suspend, that must be freed. Second is "pagedir_nosave",
    allocated at time of resume, that travels through memory not to
    collide with anything.
+
+   Warning: this is even more evil than it seems. Pagedirs this file
+   talks about are completely different from page directories used by
+   MMU hardware.
  */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
 static suspend_pagedir_t *pagedir_save;
@@ -139,15 +139,15 @@ static const char name_resume[] = "Resume Machine: ";
 #define TEST_SWSUSP 0		/* Set to 1 to reboot instead of halt machine after suspension */

 #ifdef DEBUG_DEFAULT
-# define PRINTK(f, a...)	printk(f, ## a)
+# define PRINTK(f, a...)	printk(f, ## a)
 #else
-# define PRINTK(f, a...)
+# define PRINTK(f, a...)	do { } while(0)
 #endif

 #ifdef DEBUG_SLOW
 #define MDELAY(a) mdelay(a)
 #else
-#define MDELAY(a)
+#define MDELAY(a)	do { } while(0)
 #endif

@@ -225,6 +225,7 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
 static void read_swapfiles(void) /* This is called before saving image */
 {
 	int i, len;
+	static char buff[sizeof(resume_file)], *sname;

 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
@@ -243,8 +244,11 @@ static void read_swapfiles(void) /* This is called before saving image */
 				swapfile_used[i] = SWAPFILE_IGNORED;
 			} else {
 				/* we ignore all swap devices that are not the resume_file */
-				if (1) {
-// FIXME	if(resume_device == swap_info[i].swap_device) {
+				sname = d_path(swap_info[i].swap_file->f_dentry,
+					       swap_info[i].swap_file->f_vfsmnt,
+					       buff,
+					       sizeof(buff));
+				if (!strcmp(sname, resume_file)) {
 					swapfile_used[i] = SWAPFILE_SUSPEND;
 					root_swap = i;
 				} else {
@@ -346,7 +350,7 @@ static int write_suspend_image(void)
 		cur = (void *) buffer;

 		if (fill_suspend_header(&cur->sh))
-			panic("\nOut of memory while writing header");
+			BUG(); /* Not a BUG_ON(): we want fill_suspend_header to be called, always */

 		cur->link.next = prev;
@@ -362,73 +366,174 @@ static int write_suspend_image(void)
 	return 0;
 }

-/* if pagedir_p != NULL it also copies the counted pages */
-static int count_and_copy_data_pages(struct pbe *pagedir_p)
-{
-	int chunk_size;
-	int nr_copy_pages = 0;
-	int pfn;
+#ifdef CONFIG_HIGHMEM
+struct highmem_page {
+	char *data;
 	struct page *page;
-
-#ifdef CONFIG_DISCONTIGMEM
-	panic("Discontingmem not supported");
-#else
-	BUG_ON (max_pfn != num_physpages);
-#endif
-	for (pfn = 0; pfn < max_pfn; pfn++) {
+	struct highmem_page *next;
+};
+
+struct highmem_page *highmem_copy = NULL;
+
+static int save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		int chunk_size;
+
+		if (!(pfn%1000))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
-		if (PageHighMem(page))
-			panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			continue;
+		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
+		}
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			return -ENOMEM;
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data) {
+			kfree(save);
+			return -ENOMEM;
+		}
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+	return 0;
+}

-		if (!PageReserved(page)) {
-			if (PageNosave(page))
-				continue;
+static int save_highmem(void)
+{
+	struct zone *zone;
+	int res = 0;
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			res = save_highmem_zone(zone);
+		if (res)
+			return res;
+	}
+	return 0;
+}

-			if ((chunk_size=is_head_of_free_region(page))!=0) {
-				pfn += chunk_size - 1;
-				continue;
-			}
-		} else if (PageReserved(page)) {
-			BUG_ON (PageNosave(page));
+static int restore_highmem(void)
+{
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+	return 0;
+}
+#endif

-			/*
-			 * Just copy whole code segment. Hopefully it is not that big.
-			 */
-			if ((ADDRESS(pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) &&
-			    (ADDRESS(pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
-				PRINTK("[nosave %lx]", ADDRESS(pfn));
-				continue;
-			}
-			/* Hmm, perhaps copying all reserved pages is not too healthy as they may contain
-			   critical bios data? */
-		} else	BUG();
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}

-		nr_copy_pages++;
-		if (pagedir_p) {
-			pagedir_p->orig_address = ADDRESS(pfn);
-			copy_page((void *) pagedir_p->address, (void *) pagedir_p->orig_address);
-			pagedir_p++;
+/* if *pagedir_p != NULL it also copies the counted pages */
+static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
+{
+	unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
+	struct pbe *pbe = *pagedir_p;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%1000))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		BUG_ON(PageReserved(page) && PageNosave(page));
+		if (PageNosave(page))
+			continue;
+		if (PageReserved(page) && pfn_is_nosave(pfn)) {
+			PRINTK("[nosave pfn 0x%lx]", pfn);
+			continue;
+		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
 		}
+		nr_copy_pages++;
+		if (!pbe)
+			continue;
+		pbe->orig_address = (long) page_address(page);
+		copy_page((void *)pbe->address, (void *)pbe->orig_address);
+		pbe++;
 	}
+	*pagedir_p = pbe;
 	return nr_copy_pages;
 }

-static void free_suspend_pagedir(unsigned long this_pagedir)
+static int count_and_copy_data_pages(struct pbe *pagedir_p)
 {
-	struct page *page;
-	int pfn;
-	unsigned long this_pagedir_end = this_pagedir +
-		(PAGE_SIZE << pagedir_order);
+	int nr_copy_pages = 0;
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
+	}
+	return nr_copy_pages;
+}

-	for(pfn = 0; pfn < num_physpages; pfn++) {
+static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
+{
+	unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
+	pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
+	pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
+	pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
 		if (!TestClearPageNosave(page))
 			continue;
+		else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
+			continue;
+		__free_page(page);
+	}
+}

-		if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
-			continue;	/* old pagedir gets freed in one */
-
-		free_page(ADDRESS(pfn));
+static void free_suspend_pagedir(unsigned long this_pagedir)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			free_suspend_pagedir_zone(zone, this_pagedir);
 	}
 	free_pages(this_pagedir, pagedir_order);
 }
@@ -443,7 +548,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
 	pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));

 	p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
-	if(!pagedir)
+	if (!pagedir)
 		return NULL;

 	page = virt_to_page(pagedir);
@@ -452,7 +557,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)

 	while(nr_copy_pages--) {
 		p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
-		if(!p->address) {
+		if (!p->address) {
 			free_suspend_pagedir((unsigned long) pagedir);
 			return NULL;
 		}
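Every helper rewritten above walks memory the same way: for each zone, iterate zone-relative page frame numbers, skip holes with pfn_valid(), and leap over whole free regions at once instead of testing their pages one by one. That traversal skeleton, with the kernel predicates stubbed out so it compiles standalone:

/* Skeleton of the per-zone pfn walk used by count_and_copy_zone() and
 * free_suspend_pagedir_zone(). Types and predicates are simplified
 * stand-ins, not the kernel definitions. */
struct zone {
	unsigned long zone_start_pfn;
	unsigned long spanned_pages;
};

static int pfn_valid(unsigned long pfn) { (void)pfn; return 1; }	/* stub */
static unsigned long is_head_of_free_region(unsigned long pfn) { (void)pfn; return 0; }	/* stub */

static unsigned long walk_zone(const struct zone *zone)
{
	unsigned long zone_pfn, visited = 0;

	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
		unsigned long chunk;

		if (!pfn_valid(pfn))	/* hole in the memory map */
			continue;
		chunk = is_head_of_free_region(pfn);
		if (chunk) {		/* skip a whole free block at once */
			zone_pfn += chunk - 1;
			continue;
		}
		visited++;		/* a page worth saving */
	}
	return visited;
}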
@@ -492,10 +597,19 @@ static int suspend_prepare_image(void)
 	struct sysinfo i;
 	unsigned int nr_needed_pages = 0;

-	drain_local_pages();
-
 	pagedir_nosave = NULL;
-	printk( "/critical section: Counting pages to copy" );
+	printk( "/critical section: ");
+#ifdef CONFIG_HIGHMEM
+	printk( "handling highmem" );
+	if (save_highmem()) {
+		printk(KERN_CRIT "%sNot enough free pages for highmem\n", name_suspend);
+		return -ENOMEM;
+	}
+	printk(", ");
+#endif
+
+	printk("counting pages to copy" );
+	drain_local_pages();
 	nr_copy_pages = count_and_copy_data_pages(NULL);
 	nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;

@@ -504,23 +618,22 @@ static int suspend_prepare_image(void)
 		printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
 		       name_suspend, nr_needed_pages-nr_free_pages());
 		root_swap = 0xFFFF;
-		return 1;
+		return -ENOMEM;
 	}
 	si_swapinfo(&i);	/* FIXME: si_swapinfo(&i) returns all swap devices information.
 				   We should only consider resume_device. */
 	if (i.freeswap < nr_needed_pages)  {
 		printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
 		       name_suspend, nr_needed_pages-i.freeswap);
-		return 1;
+		return -ENOSPC;
 	}

 	PRINTK( "Alloc pagedir\n" );
 	pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
-	if(!pagedir_nosave) {
-		/* Shouldn't happen */
-		printk(KERN_CRIT "%sCouldn't allocate enough pages\n",name_suspend);
-		panic("Really should not happen");
-		return 1;
+	if (!pagedir_nosave) {
+		/* Pagedir is big, one-chunk allocation. It is easily possible for this allocation to fail */
+		printk(KERN_CRIT "%sCouldn't allocate continuous pagedir\n", name_suspend);
+		return -ENOMEM;
 	}
 	nr_copy_pages_check = nr_copy_pages;
 	pagedir_order_check = pagedir_order;
@@ -603,21 +716,25 @@ asmlinkage void do_magic_resume_2(void)

 	PRINTK( "Freeing prev allocated pagedir\n" );
 	free_suspend_pagedir((unsigned long) pagedir_save);
+
+#ifdef CONFIG_HIGHMEM
+	printk( "Restoring highmem\n" );
+	restore_highmem();
+#endif
+	printk("done, devices\n");
+
 	device_power_up();
 	spin_unlock_irq(&suspend_pagedir_lock);
 	device_resume();

-	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
-	release_console_sem();
-
+	/* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
 	PRINTK( "Fixing swap signatures... " );
 	mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
 	PRINTK( "ok\n" );

 #ifdef SUSPEND_CONSOLE
 	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
+	update_screen(fg_console);
 	release_console_sem();
 #endif
 }
@@ -707,11 +824,6 @@ int software_suspend(void)

 	free_some_memory();

-	/* No need to invalidate any vfsmnt list --
-	 * they will be valid after resume, anyway.
-	 */
-	blk_run_queues();
-
 	/* Save state of all device drivers, and stop them. */
 	if ((res = device_suspend(4))==0)
 		/* If stopping device drivers worked, we proceed basically into
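pfn_is_nosave() (introduced a few hunks up) replaces the old ADDRESS()/ADDRESS2() virtual-address comparisons with frame-number arithmetic: convert the __nosave section bounds to pfns once, round the end up to a full page, then test candidates against the half-open range. The same computation standalone; the section addresses and PAGE_SHIFT are assumed example values:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long nosave_begin_phys = 0x00200000;	/* assumed */
static unsigned long nosave_end_phys   = 0x00203400;	/* assumed */

static int pfn_is_nosave(unsigned long pfn)
{
	unsigned long begin_pfn = nosave_begin_phys >> PAGE_SHIFT;
	unsigned long end_pfn = PAGE_ALIGN(nosave_end_phys) >> PAGE_SHIFT;

	/* half-open range: the end bound is rounded up to a page boundary */
	return pfn >= begin_pfn && pfn < end_pfn;
}

int main(void)
{
	printf("%d %d\n", pfn_is_nosave(0x200), pfn_is_nosave(0x204));	/* 1 0 */
	return 0;
}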
diff --git a/kernel/printk.c b/kernel/printk.c
index a7be1f922f34..5f2b3c9bbd6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...)
 		log_level_unknown = 1;
 	}

-	if (!cpu_online(smp_processor_id()) && !system_running) {
+	if (!cpu_online(smp_processor_id()) &&
+	    system_state != SYSTEM_RUNNING) {
 		/*
 		 * Some console drivers may assume that per-cpu resources have
 		 * been allocated. So don't allow them to be called by this
diff --git a/kernel/sched.c b/kernel/sched.c
index d5f21712ffbb..c2d1c0317130 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)

+extern unsigned long __scheduling_functions_start_here;
+extern unsigned long __scheduling_functions_end_here;
+const unsigned long scheduling_functions_start_here =
+			(unsigned long)&__scheduling_functions_start_here;
+const unsigned long scheduling_functions_end_here =
+			(unsigned long)&__scheduling_functions_end_here;
+
 /*
  * Default context-switch locking:
  */
@@ -1587,12 +1594,10 @@ out:
 	rebalance_tick(rq, 0);
 }

-void scheduling_functions_start_here(void) { }
-
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	long *switch_count;
 	task_t *prev, *next;
@@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule);
 * off of preempt_enable.  Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
-asmlinkage void preempt_schedule(void)
+asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();

@@ -1842,7 +1847,6 @@ void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exc
 	__wake_up_common(q, mode, nr_exclusive, 0);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
-
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */

 void fastcall complete(struct completion *x)
@@ -1855,7 +1859,6 @@ void fastcall complete(struct completion *x)
 			 1, 0);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
-
 EXPORT_SYMBOL(complete);

 void fastcall complete_all(struct completion *x)
@@ -1868,8 +1871,9 @@ void fastcall complete_all(struct completion *x)
 			 0, 0);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
+EXPORT_SYMBOL(complete_all);

-void fastcall wait_for_completion(struct completion *x)
+void fastcall __sched wait_for_completion(struct completion *x)
 {
 	might_sleep();
 	spin_lock_irq(&x->wait.lock);
@@ -1889,7 +1893,6 @@ void fastcall wait_for_completion(struct completion *x)
 	x->done--;
 	spin_unlock_irq(&x->wait.lock);
 }
-
 EXPORT_SYMBOL(wait_for_completion);

 #define	SLEEP_ON_VAR				\
@@ -1907,7 +1910,7 @@ EXPORT_SYMBOL(wait_for_completion);
 	__remove_wait_queue(q, &wait);		\
 	spin_unlock_irqrestore(&q->lock, flags);

-void fastcall interruptible_sleep_on(wait_queue_head_t *q)
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR

@@ -1920,7 +1923,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q)

 EXPORT_SYMBOL(interruptible_sleep_on);

-long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR

@@ -1935,7 +1938,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)

 EXPORT_SYMBOL(interruptible_sleep_on_timeout);

-void fastcall sleep_on(wait_queue_head_t *q)
+void fastcall __sched sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR

@@ -1948,7 +1951,7 @@ void fastcall sleep_on(wait_queue_head_t *q)

 EXPORT_SYMBOL(sleep_on);

-long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR

@@ -1963,8 +1966,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)

 EXPORT_SYMBOL(sleep_on_timeout);

-void scheduling_functions_end_here(void) { }
-
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
@@ -2424,7 +2425,7 @@ asmlinkage long sys_sched_yield(void)
 	return 0;
 }

-void __cond_resched(void)
+void __sched __cond_resched(void)
 {
 	set_current_state(TASK_RUNNING);
 	schedule();
@@ -2438,7 +2439,7 @@ EXPORT_SYMBOL(__cond_resched);
 * this is a shortcut for kernel-space yielding - it marks the
 * thread runnable and calls sys_sched_yield().
 */
-void yield(void)
+void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
@@ -2453,7 +2454,7 @@ EXPORT_SYMBOL(yield);
 * But don't do that if it is a deliberate, throttling IO wait (this task
 * has set its backing_dev_info: the queue against which it should throttle)
 */
-void io_schedule(void)
+void __sched io_schedule(void)
 {
 	struct runqueue *rq = this_rq();

@@ -2464,7 +2465,7 @@ void io_schedule(void)

 EXPORT_SYMBOL(io_schedule);

-long io_schedule_timeout(long timeout)
+long __sched io_schedule_timeout(long timeout)
 {
 	struct runqueue *rq = this_rq();
 	long ret;
@@ -2982,7 +2983,8 @@ void __might_sleep(char *file, int line)
 #if defined(in_atomic)
 	static unsigned long prev_jiffy;	/* ratelimiting */

-	if ((in_atomic() || irqs_disabled()) && system_running) {
+	if ((in_atomic() || irqs_disabled()) &&
+	    system_state == SYSTEM_RUNNING) {
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
@@ -3009,7 +3011,7 @@ EXPORT_SYMBOL(__might_sleep);
 *
 * Called inside preempt_disable().
 */
-void __preempt_spin_lock(spinlock_t *lock)
+void __sched __preempt_spin_lock(spinlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_spin_lock(lock);
@@ -3025,7 +3027,7 @@ void __preempt_spin_lock(spinlock_t *lock)

 EXPORT_SYMBOL(__preempt_spin_lock);

-void __preempt_write_lock(rwlock_t *lock)
+void __sched __preempt_write_lock(rwlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_write_lock(lock);
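The __sched annotation retires the scheduling_functions_start_here()/end_here() marker functions: rather than relying on link order, every function a task can sleep in is collected into one text section, and the exported section bounds let the wchan code recognise scheduler frames. A sketch of how such an attribute is defined and consumed; the section name follows the hunks above, while the surrounding details are simplified assumptions:

/* Place sleeping functions into their own text section so that
 * get_wchan() can identify scheduler frames by address range.
 * Simplified sketch: in the real kernel the attribute lives in a
 * header and the bounds come from the arch linker script. */
#define __sched		__attribute__((__section__(".sched.text")))

extern unsigned long __scheduling_functions_start_here;
extern unsigned long __scheduling_functions_end_here;

static int in_sched_functions(unsigned long addr)
{
	return addr >= (unsigned long)&__scheduling_functions_start_here &&
	       addr < (unsigned long)&__scheduling_functions_end_here;
}

/* Any function that can block simply gets the tag: */
void __sched my_wait_for_event(void);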
diff --git a/kernel/signal.c b/kernel/signal.c
index 32992a71683b..c69671600bef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -352,6 +352,7 @@ void __exit_signal(struct task_struct *tsk)
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		exit_itimers(sig);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 		kmem_cache_free(signal_cachep, sig);
@@ -588,7 +589,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	error = -EPERM;
 	if ((!info || ((unsigned long)info != 1 &&
 			(unsigned long)info != 2 && SI_FROMUSER(info)))
-	    && ((sig != SIGCONT) || (current->session != t->session))
+	    && ((sig != SIGCONT) ||
+		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
 	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
 	    && !capable(CAP_KILL))
@@ -1103,7 +1105,7 @@ kill_sl_info(int sig, struct siginfo *info, pid_t sid)
 	retval = -ESRCH;
 	read_lock(&tasklist_lock);
 	for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) {
-		if (!p->leader)
+		if (!p->signal->leader)
 			continue;
 		err = group_send_sig_info(sig, info, p);
 		if (retval)
@@ -2047,6 +2049,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
 		err |= __put_user(from->si_stime, &to->si_stime);
 		break;
 	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
 		err |= __put_user(from->si_pid, &to->si_pid);
 		err |= __put_user(from->si_uid, &to->si_uid);
 		err |= __put_user(from->si_int, &to->si_int);
@@ -2553,4 +2556,3 @@ void __init signals_init(void)
 	if (!sigqueue_cachep)
 		panic("signals_init(): cannot create sigqueue SLAB cache");
 }
-
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 81c79736ff9e..58c915c202ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>

+#include <asm/irq.h>
 /*
    - No shared variables, all the data are CPU local.
    - If a softirq needs serialization, let it serialize itself
@@ -69,53 +70,66 @@ static inline void wakeup_softirqd(void)
  */
 #define MAX_SOFTIRQ_RESTART 10

-asmlinkage void do_softirq(void)
+asmlinkage void __do_softirq(void)
 {
-	int max_restart = MAX_SOFTIRQ_RESTART;
+	struct softirq_action *h;
 	__u32 pending;
-	unsigned long flags;
+	int max_restart = MAX_SOFTIRQ_RESTART;

-	if (in_interrupt())
-		return;
+	pending = local_softirq_pending();

-	local_irq_save(flags);
+	local_bh_disable();
+restart:
+	/* Reset the pending bitmask before enabling irqs */
+	local_softirq_pending() = 0;
+
+	local_irq_enable();
+
+	h = softirq_vec;
+
+	do {
+		if (pending & 1)
+			h->action(h);
+		h++;
+		pending >>= 1;
+	} while (pending);
+
+	local_irq_disable();

 	pending = local_softirq_pending();
+	if (pending && --max_restart)
+		goto restart;

-	if (pending) {
-		struct softirq_action *h;
+	if (pending)
+		wakeup_softirqd();

-		local_bh_disable();
-restart:
-		/* Reset the pending bitmask before enabling irqs */
-		local_softirq_pending() = 0;
+	__local_bh_enable();
+}

-		local_irq_enable();
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;

-		h = softirq_vec;
+	if (in_interrupt())
+		return;

-		do {
-			if (pending & 1)
-				h->action(h);
-			h++;
-			pending >>= 1;
-		} while (pending);
+	local_irq_save(flags);

-		local_irq_disable();
+	pending = local_softirq_pending();

-		pending = local_softirq_pending();
-		if (pending && --max_restart)
-			goto restart;
-		if (pending)
-			wakeup_softirqd();
-		__local_bh_enable();
-	}
+	if (pending)
+		__do_softirq();

 	local_irq_restore(flags);
 }

 EXPORT_SYMBOL(do_softirq);

+#endif
+
 void local_bh_enable(void)
 {
 	__local_bh_enable();
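Splitting __do_softirq() out of do_softirq() lets architectures that define __ARCH_HAS_DO_SOFTIRQ run the core loop on a separate irq stack while keeping the restart logic in one place. That core loop, reduced to standalone control flow; the handler table and pending word are stand-ins for the per-cpu kernel state:

#include <stdint.h>

#define MAX_SOFTIRQ_RESTART 10
#define NR_SOFTIRQS 32

typedef void (*softirq_action_fn)(void);

static softirq_action_fn softirq_vec[NR_SOFTIRQS];
static volatile uint32_t pending_mask;	/* stands in for local_softirq_pending() */

static void run_softirqs(void)
{
	int max_restart = MAX_SOFTIRQ_RESTART;
	uint32_t pending = pending_mask;

restart:
	pending_mask = 0;	/* reset before handlers may raise new work */

	for (int i = 0; pending; i++, pending >>= 1)
		if ((pending & 1) && softirq_vec[i])
			softirq_vec[i]();

	pending = pending_mask;
	if (pending && --max_restart)
		goto restart;
	/* in the kernel, leftover work is handed to ksoftirqd here */
}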
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a2cae39d322c..9610403ce2cf 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -149,10 +149,10 @@ static int do_stop(void *_smdata)
 	complete(&smdata->done);

 	/* Wait for kthread_stop */
-	__set_current_state(TASK_INTERRUPTIBLE);
+	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
 		schedule();
-		__set_current_state(TASK_INTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
 	return ret;
diff --git a/kernel/sys.c b/kernel/sys.c
index 33a14e13079e..4d414d925889 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -260,6 +260,17 @@ cond_syscall(sys_msgctl)
 cond_syscall(sys_shmget)
 cond_syscall(sys_shmdt)
 cond_syscall(sys_shmctl)
+cond_syscall(sys_mq_open)
+cond_syscall(sys_mq_unlink)
+cond_syscall(sys_mq_timedsend)
+cond_syscall(sys_mq_timedreceive)
+cond_syscall(sys_mq_notify)
+cond_syscall(sys_mq_getsetattr)
+cond_syscall(compat_sys_mq_open)
+cond_syscall(compat_sys_mq_timedsend)
+cond_syscall(compat_sys_mq_timedreceive)
+cond_syscall(compat_sys_mq_notify)
+cond_syscall(compat_sys_mq_getsetattr)

 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read)
@@ -436,7 +447,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system.\n");
 		machine_restart(NULL);
@@ -452,7 +463,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user

 	case LINUX_REBOOT_CMD_HALT:
 		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "System halted.\n");
 		machine_halt();
@@ -462,7 +473,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user

 	case LINUX_REBOOT_CMD_POWER_OFF:
 		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Power down.\n");
 		machine_power_off();
@@ -478,7 +489,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		buffer[sizeof(buffer) - 1] = '\0';

 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
 		machine_restart(buffer);
@@ -979,7 +990,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)

 	if (p->parent == current || p->real_parent == current) {
 		err = -EPERM;
-		if (p->session != current->session)
+		if (p->signal->session != current->signal->session)
 			goto out;
 		err = -EACCES;
 		if (p->did_exec)
@@ -991,7 +1002,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 	}

 	err = -EPERM;
-	if (p->leader)
+	if (p->signal->leader)
 		goto out;

 	if (pgid != pid) {
@@ -1000,7 +1011,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 		struct list_head *l;

 		for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid)
-			if (p->session == current->session)
+			if (p->signal->session == current->signal->session)
 				goto ok_pgid;
 		goto out;
 	}
@@ -1012,7 +1023,7 @@ ok_pgid:

 	if (process_group(p) != pgid) {
 		detach_pid(p, PIDTYPE_PGID);
-		p->group_leader->__pgrp = pgid;
+		p->signal->pgrp = pgid;
 		attach_pid(p, PIDTYPE_PGID, pgid);
 	}

@@ -1054,7 +1065,7 @@ asmlinkage long sys_getpgrp(void)
 asmlinkage long sys_getsid(pid_t pid)
 {
 	if (!pid) {
-		return current->session;
+		return current->signal->session;
 	} else {
 		int retval;
 		struct task_struct *p;
@@ -1066,7 +1077,7 @@ asmlinkage long sys_getsid(pid_t pid)
 		if(p) {
 			retval = security_task_getsid(p);
 			if (!retval)
-				retval = p->session;
+				retval = p->signal->session;
 		}
 		read_unlock(&tasklist_lock);
 		return retval;
@@ -1087,10 +1098,10 @@ asmlinkage long sys_setsid(void)
 	if (pid)
 		goto out;

-	current->leader = 1;
+	current->signal->leader = 1;
 	__set_special_pids(current->pid, current->pid);
-	current->tty = NULL;
-	current->tty_old_pgrp = 0;
+	current->signal->tty = NULL;
+	current->signal->tty_old_pgrp = 0;
 	err = process_group(current);
 out:
 	write_unlock_irq(&tasklist_lock);
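sys_setsid() now records session leadership and drops the controlling tty in the shared signal_struct, but the userspace contract is untouched: the classic daemonize sequence behaves exactly as before. For reference (standard POSIX calls only):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

/* Classic daemon setup: the child becomes a session leader with no
 * controlling tty, exactly the state sys_setsid() records above in
 * signal->leader and signal->tty. */
int main(void)
{
	pid_t pid = fork();

	if (pid < 0)
		return 1;
	if (pid > 0)
		exit(0);		/* parent exits */

	if (setsid() < 0) {		/* child: new session, no tty */
		perror("setsid");
		return 1;
	}
	printf("new session id: %d\n", (int)getsid(0));
	return 0;
}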
@@ -1521,7 +1532,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw;
 			r.ru_minflt = p->min_flt;
 			r.ru_majflt = p->maj_flt;
-			r.ru_nswap = p->nswap;
 			break;
 		case RUSAGE_CHILDREN:
 			jiffies_to_timeval(p->cutime, &r.ru_utime);
@@ -1530,7 +1540,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->cnivcsw;
 			r.ru_minflt = p->cmin_flt;
 			r.ru_majflt = p->cmaj_flt;
-			r.ru_nswap = p->cnswap;
 			break;
 		default:
 			jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime);
@@ -1539,7 +1548,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw + p->cnivcsw;
 			r.ru_minflt = p->min_flt + p->cmin_flt;
 			r.ru_majflt = p->maj_flt + p->cmaj_flt;
-			r.ru_nswap = p->nswap + p->cnswap;
 			break;
 	}
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5f3123b0522..69e9123cdd0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -710,10 +710,12 @@ static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_HUGETLB_PAGES,
 		.procname	= "nr_hugepages",
-		.data		= &htlbpage_max,
-		.maxlen		= sizeof(int),
+		.data		= &max_huge_pages,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &hugetlb_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
 	},
 #endif
 	{
@@ -722,7 +724,7 @@ static ctl_table vm_table[] = {
 		.data		= &sysctl_lower_zone_protection,
 		.maxlen		= sizeof(sysctl_lower_zone_protection),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &lower_zone_protection_sysctl_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
@@ -744,6 +746,26 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= VM_LAPTOP_MODE,
+		.procname	= "laptop_mode",
+		.data		= &laptop_mode,
+		.maxlen		= sizeof(laptop_mode),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= VM_BLOCK_DUMP,
+		.procname	= "block_dump",
+		.data		= &block_dump,
+		.maxlen		= sizeof(block_dump),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };

diff --git a/kernel/time.c b/kernel/time.c
index 33a6fe086304..142a4bd5771e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -51,10 +51,11 @@ EXPORT_SYMBOL(sys_tz);
 asmlinkage long sys_time(int * tloc)
 {
 	int i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;

-	/* SMP: This is fairly trivial. We grab CURRENT_TIME and
-	   stuff it to user space. No side effects */
-	i = get_seconds();
 	if (tloc) {
 		if (put_user(i,tloc))
 			i = -EFAULT;
diff --git a/kernel/timer.c b/kernel/timer.c
index f53e0749b0d2..cbcb5522866d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data)
 *
 * In all cases the return value is guaranteed to be non-negative.
 */
-fastcall signed long schedule_timeout(signed long timeout)
+fastcall signed long __sched schedule_timeout(signed long timeout)
 {
 	struct timer_list timer;
 	unsigned long expire;
@@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void)
 	return current->pid;
 }

-static long nanosleep_restart(struct restart_block *restart)
+static long __sched nanosleep_restart(struct restart_block *restart)
 {
 	unsigned long expire = restart->arg0, now = jiffies;
 	struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
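schedule_timeout() and nanosleep_restart() joining the __sched section close the loop on the wchan change in kernel/sched.c above. As a reminder of the call contract: the task state must be set before calling, and the return value is the number of jiffies left unslept. A sketch of the long-standing in-kernel idiom, not a hunk from this patch:

/* Sketch: sleep for about two seconds inside the kernel.
 * schedule_timeout() expects the task state to be set first and
 * returns the remaining jiffies (0 after a full, uninterrupted sleep). */
#include <linux/sched.h>
#include <linux/jiffies.h>

static void wait_two_seconds(void)
{
	signed long remaining = 2 * HZ;

	set_current_state(TASK_INTERRUPTIBLE);
	while (remaining > 0) {
		remaining = schedule_timeout(remaining);
		if (signal_pending(current))
			break;			/* woken early by a signal */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
}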
