author     Linus Torvalds <torvalds@athlon.transmeta.com>    2002-02-04 17:40:40 -0800
committer  Linus Torvalds <torvalds@athlon.transmeta.com>    2002-02-04 17:40:40 -0800
commit     7a2deb32924142696b8174cdf9b38cd72a11fc96 (patch)
tree       8ecc18f81fdb849254f39dc2e9fd77253319e1ec /kernel
Import changeset
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile         32
-rw-r--r--  kernel/acct.c          373
-rw-r--r--  kernel/capability.c    216
-rw-r--r--  kernel/context.c       157
-rw-r--r--  kernel/dma.c           129
-rw-r--r--  kernel/exec_domain.c   163
-rw-r--r--  kernel/exit.c          596
-rw-r--r--  kernel/fork.c          771
-rw-r--r--  kernel/info.c           74
-rw-r--r--  kernel/itimer.c        170
-rw-r--r--  kernel/kmod.c          373
-rw-r--r--  kernel/ksyms.c         538
-rw-r--r--  kernel/module.c       1235
-rw-r--r--  kernel/panic.c         103
-rw-r--r--  kernel/pm.c            245
-rw-r--r--  kernel/printk.c        497
-rw-r--r--  kernel/ptrace.c        193
-rw-r--r--  kernel/resource.c      322
-rw-r--r--  kernel/sched.c        1269
-rw-r--r--  kernel/signal.c       1260
-rw-r--r--  kernel/softirq.c       317
-rw-r--r--  kernel/sys.c          1219
-rw-r--r--  kernel/sysctl.c       1309
-rw-r--r--  kernel/time.c          420
-rw-r--r--  kernel/timer.c         837
-rw-r--r--  kernel/uid16.c         163
-rw-r--r--  kernel/user.c          137
27 files changed, 13118 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 000000000000..9adeb6b2c392
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,32 @@
+#
+# Makefile for the linux kernel.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+# Note 2! The CFLAGS definitions are now in the main makefile...
+
+O_TARGET := kernel.o
+
+export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o
+
+obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
+ module.o exit.o itimer.o info.o time.o softirq.o resource.o \
+ sysctl.o acct.o capability.o ptrace.o timer.o user.o \
+ signal.o sys.o kmod.o context.o
+
+obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_MODULES) += ksyms.o
+obj-$(CONFIG_PM) += pm.o
+
+ifneq ($(CONFIG_IA64),y)
+# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+# needed for x86 only. Why this used to be enabled for all architectures is beyond
+# me. I suspect most platforms don't need this, but until we know that for sure
+# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
+# to get a correct value for the wait-channel (WCHAN in ps). --davidm
+CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
+endif
+
+include $(TOPDIR)/Rules.make
diff --git a/kernel/acct.c b/kernel/acct.c
new file mode 100644
index 000000000000..e2e8826fa6fd
--- /dev/null
+++ b/kernel/acct.c
@@ -0,0 +1,373 @@
+/*
+ * linux/kernel/acct.c
+ *
+ * BSD Process Accounting for Linux
+ *
+ * Author: Marco van Wieringen <mvw@planets.elm.net>
+ *
+ * Some code based on ideas and code from:
+ * Thomas K. Dyas <tdyas@eden.rutgers.edu>
+ *
+ * This file implements BSD-style process accounting. Whenever any
+ * process exits, an accounting record of type "struct acct" is
+ * written to the file specified with the acct() system call. It is
+ * up to user-level programs to do useful things with the accounting
+ * log. The kernel just provides the raw accounting information.
+ *
+ * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
+ *
+ * Plugged two leaks. 1) It didn't return acct_file into the free_filps if
+ * the file happened to be read-only. 2) If the accounting was suspended
+ * due to the lack of space it happily allowed to reopen it and completely
+ * lost the old acct_file. 3/10/98, Al Viro.
+ *
+ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
+ * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
+ *
+ * Fixed a nasty interaction with sys_umount(). If the accounting
+ * was suspended we failed to stop it on umount(). Messy.
+ * Another one: remount to readonly didn't stop accounting.
+ * Question: what should we do if we have CAP_SYS_ADMIN but not
+ * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
+ * unless we are messing with the root. In that case we are getting a
+ * real mess with do_remount_sb(). 9/11/98, AV.
+ *
+ * Fixed a bunch of races (and pair of leaks). Probably not the best way,
+ * but this one obviously doesn't introduce deadlocks. Later. BTW, found
+ * one race (and leak) in BSD implementation.
+ * OK, that's better. ANOTHER race and leak in BSD variant. There always
+ * is one more bug... 10/11/98, AV.
+ *
+ * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
+ * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * a struct file opened for write. Fixed. 2/6/2000, AV.
+ */
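The header above describes the interface from the user side: pass acct() a filename to start logging, pass NULL to stop. A minimal userspace sketch of that flow follows; it is illustrative only, the path is hypothetical, and the file must already exist because sys_acct() opens it O_WRONLY|O_APPEND without O_CREAT.

/*
 * Minimal userspace sketch (illustrative, not part of this patch):
 * enable and later disable BSD process accounting via acct(2).
 * Requires CAP_SYS_PACCT; the path is hypothetical and the file
 * must already exist, since sys_acct() does not pass O_CREAT.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	if (acct("/var/log/pacct") != 0) {	/* start logging */
		perror("acct");
		return 1;
	}
	/* ... run some work; every exit appends one struct acct ... */
	if (acct(NULL) != 0)			/* stop logging */
		perror("acct");
	return 0;
}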
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+
+#ifdef CONFIG_BSD_PROCESS_ACCT
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/acct.h>
+#include <linux/smp_lock.h>
+#include <linux/file.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * These constants control the amount of free space that suspends and
+ * resumes the process accounting system, and the time delay between
+ * each check.
+ * Turned into sysctl-controllable parameters. AV, 12/11/98
+ */
+
+int acct_parm[3] = {4, 2, 30};
+#define RESUME (acct_parm[0]) /* >foo% free space - resume */
+#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
+#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
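The three acct_parm values above are exported through sysctl as kernel.acct; a hedged sketch of reading them from /proc/sys/kernel/acct follows, assuming procfs and sysctl support are configured in.

/*
 * Hedged userspace sketch (not part of this patch): read the RESUME,
 * SUSPEND and ACCT_TIMEOUT values exposed as /proc/sys/kernel/acct,
 * assuming procfs and CONFIG_SYSCTL are available.
 */
#include <stdio.h>

int main(void)
{
	int resume, suspend, freq;
	FILE *f = fopen("/proc/sys/kernel/acct", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d %d %d", &resume, &suspend, &freq) == 3)
		printf("resume at %d%% free, suspend at %d%%, check every %d s\n",
		       resume, suspend, freq);
	fclose(f);
	return 0;
}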
+
+/*
+ * External references and all of the globals.
+ */
+
+static volatile int acct_active;
+static volatile int acct_needcheck;
+static struct file *acct_file;
+static struct timer_list acct_timer;
+static void do_acct_process(long, struct file *);
+
+/*
+ * Called whenever the timer says to check the free space.
+ */
+static void acct_timeout(unsigned long unused)
+{
+ acct_needcheck = 1;
+}
+
+/*
+ * Check the amount of free space and suspend/resume accordingly.
+ */
+static int check_free_space(struct file *file)
+{
+ struct statfs sbuf;
+ int res;
+ int act;
+
+ lock_kernel();
+ res = acct_active;
+ if (!file || !acct_needcheck)
+ goto out;
+ unlock_kernel();
+
+ /* May block */
+ if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+ return res;
+
+ if (sbuf.f_bavail <= SUSPEND * sbuf.f_blocks / 100)
+ act = -1;
+ else if (sbuf.f_bavail >= RESUME * sbuf.f_blocks / 100)
+ act = 1;
+ else
+ act = 0;
+
+ /*
+ * If some joker switched acct_file under us we'd better be
+ * silent and _not_ touch anything.
+ */
+ lock_kernel();
+ if (file != acct_file) {
+ if (act)
+ res = act>0;
+ goto out;
+ }
+
+ if (acct_active) {
+ if (act < 0) {
+ acct_active = 0;
+ printk(KERN_INFO "Process accounting paused\n");
+ }
+ } else {
+ if (act > 0) {
+ acct_active = 1;
+ printk(KERN_INFO "Process accounting resumed\n");
+ }
+ }
+
+ del_timer(&acct_timer);
+ acct_needcheck = 0;
+ acct_timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+ add_timer(&acct_timer);
+ res = acct_active;
+out:
+ unlock_kernel();
+ return res;
+}
+
+/*
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
+ */
+asmlinkage long sys_acct(const char *name)
+{
+ struct file *file = NULL, *old_acct = NULL;
+ char *tmp;
+ int error;
+
+ if (!capable(CAP_SYS_PACCT))
+ return -EPERM;
+
+ if (name) {
+ tmp = getname(name);
+ error = PTR_ERR(tmp);
+ if (IS_ERR(tmp))
+ goto out;
+ /* Difference from BSD - they don't do O_APPEND */
+ file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+ putname(tmp);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto out;
+ }
+ error = -EACCES;
+ if (!S_ISREG(file->f_dentry->d_inode->i_mode))
+ goto out_err;
+
+ error = -EIO;
+ if (!file->f_op->write)
+ goto out_err;
+ }
+
+ error = 0;
+ lock_kernel();
+ if (acct_file) {
+ old_acct = acct_file;
+ del_timer(&acct_timer);
+ acct_active = 0;
+ acct_needcheck = 0;
+ acct_file = NULL;
+ }
+ if (name) {
+ acct_file = file;
+ acct_needcheck = 0;
+ acct_active = 1;
+ /* It's been deleted if it was used before so this is safe */
+ init_timer(&acct_timer);
+ acct_timer.function = acct_timeout;
+ acct_timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+ add_timer(&acct_timer);
+ }
+ unlock_kernel();
+ if (old_acct) {
+ do_acct_process(0,old_acct);
+ filp_close(old_acct, NULL);
+ }
+out:
+ return error;
+out_err:
+ filp_close(file, NULL);
+ goto out;
+}
+
+void acct_auto_close(kdev_t dev)
+{
+ lock_kernel();
+ if (acct_file && acct_file->f_dentry->d_inode->i_dev == dev)
+ sys_acct(NULL);
+ unlock_kernel();
+}
+
+/*
+ * encode an unsigned long into a comp_t
+ *
+ * This routine has been adapted from the encode_comp_t() function in
+ * the kern_acct.c file of the FreeBSD operating system. The encoding
+ * is a 13-bit fraction with a 3-bit (base 8) exponent.
+ */
+
+#define MANTSIZE 13 /* 13 bit mantissa. */
+#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
+
+static comp_t encode_comp_t(unsigned long value)
+{
+ int exp, rnd;
+
+ exp = rnd = 0;
+ while (value > MAXFRACT) {
+ rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
+ value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ exp++;
+ }
+
+ /*
+ * If we need to round up, do it (and handle overflow correctly).
+ */
+ if (rnd && (++value > MAXFRACT)) {
+ value >>= EXPSIZE;
+ exp++;
+ }
+
+ /*
+ * Clean it up and polish it off.
+ */
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += value; /* and add on the mantissa. */
+ return exp;
+}
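Decoding a comp_t is the mirror image of encode_comp_t(): take the low 13 bits as the mantissa and shift it left three bits per unit of exponent. A sketch of that inverse follows (not part of this patch), reusing the MANTSIZE/EXPSIZE/MAXFRACT definitions above, with one worked value.

/*
 * Illustrative inverse of encode_comp_t() (not part of this patch):
 * unpack a comp_t into an approximate unsigned long using the same
 * 13-bit mantissa / 3-bit base-8 exponent layout defined above.
 */
static unsigned long decode_comp_t(comp_t c)
{
	unsigned long value = c & MAXFRACT;		  /* low 13 bits */
	int exp = (c >> MANTSIZE) & ((1 << EXPSIZE) - 1); /* top 3 bits */

	while (exp-- > 0)
		value <<= EXPSIZE;			  /* multiply by 8 */
	return value;
}

/*
 * Worked example: encode_comp_t(100000) gives exponent 2, mantissa 1563
 * (0x461b); decode_comp_t(0x461b) returns 1563 << 6 == 100032, i.e. the
 * original value to within the 13-bit mantissa precision.
 */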
+
+/*
+ * Write an accounting entry for an exiting process
+ *
+ * The acct_process() call is the workhorse of the process
+ * accounting system. The struct acct is built here and then written
+ * into the accounting file. This function should only be called from
+ * do_exit().
+ */
+
+/*
+ * do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(long exitcode, struct file *file)
+{
+ struct acct ac;
+ mm_segment_t fs;
+ unsigned long vsize;
+
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system.
+ */
+ if (!check_free_space(file))
+ return;
+
+ /*
+ * Fill the accounting struct with the needed info as recorded
+ * by the different kernel functions.
+ */
+ memset((caddr_t)&ac, 0, sizeof(struct acct));
+
+ strncpy(ac.ac_comm, current->comm, ACCT_COMM);
+ ac.ac_comm[ACCT_COMM - 1] = '\0';
+
+ ac.ac_btime = CT_TO_SECS(current->start_time) + (xtime.tv_sec - (jiffies / HZ));
+ ac.ac_etime = encode_comp_t(jiffies - current->start_time);
+ ac.ac_utime = encode_comp_t(current->times.tms_utime);
+ ac.ac_stime = encode_comp_t(current->times.tms_stime);
+ ac.ac_uid = current->uid;
+ ac.ac_gid = current->gid;
+ ac.ac_tty = (current->tty) ? kdev_t_to_nr(current->tty->device) : 0;
+
+ ac.ac_flag = 0;
+ if (current->flags & PF_FORKNOEXEC)
+ ac.ac_flag |= AFORK;
+ if (current->flags & PF_SUPERPRIV)
+ ac.ac_flag |= ASU;
+ if (current->flags & PF_DUMPCORE)
+ ac.ac_flag |= ACORE;
+ if (current->flags & PF_SIGNALED)
+ ac.ac_flag |= AXSIG;
+
+ vsize = 0;
+ if (current->mm) {
+ struct vm_area_struct *vma;
+ down(&current->mm->mmap_sem);
+ vma = current->mm->mmap;
+ while (vma) {
+ vsize += vma->vm_end - vma->vm_start;
+ vma = vma->vm_next;
+ }
+ up(&current->mm->mmap_sem);
+ }
+ vsize = vsize / 1024;
+ ac.ac_mem = encode_comp_t(vsize);
+ ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
+ ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
+ ac.ac_minflt = encode_comp_t(current->min_flt);
+ ac.ac_majflt = encode_comp_t(current->maj_flt);
+ ac.ac_swaps = encode_comp_t(current->nswap);
+ ac.ac_exitcode = exitcode;
+
+ /*
+ * Kernel segment override to datasegment and write it
+ * to the accounting file.
+ */
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ file->f_op->write(file, (char *)&ac,
+ sizeof(struct acct), &file->f_pos);
+ set_fs(fs);
+}
+
+/*
+ * acct_process - now just a wrapper around do_acct_process
+ */
+int acct_process(long exitcode)
+{
+ struct file *file = NULL;
+ lock_kernel();
+ if (acct_file) {
+ file = acct_file;
+ get_file(file);
+ unlock_kernel();
+ do_acct_process(exitcode, acct_file);
+ fput(file);
+ } else
+ unlock_kernel();
+ return 0;
+}
+
+#else
+/*
+ * Dummy system call when BSD process accounting is not configured
+ * into the kernel.
+ */
+
+asmlinkage long sys_acct(const char * filename)
+{
+ return -ENOSYS;
+}
+#endif
diff --git a/kernel/capability.c b/kernel/capability.c
new file mode 100644
index 000000000000..7aaf1a423011
--- /dev/null
+++ b/kernel/capability.c
@@ -0,0 +1,216 @@
+/*
+ * linux/kernel/capability.c
+ *
+ * Copyright (C) 1997 Andrew Main <zefram@fysh.org>
+ * Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com>
+ */
+
+#include <linux/mm.h>
+#include <asm/uaccess.h>
+
+kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
+
+/* Note: never hold tasklist_lock while spinning for this one */
+spinlock_t task_capability_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * For sys_capget() and sys_capset(), any of the three
+ * capability set pointers may be NULL -- indicating that that set is
+ * uninteresting and/or not to be changed.
+ */
+
+asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
+{
+ int error, pid;
+ __u32 version;
+ struct task_struct *target;
+ struct __user_cap_data_struct data;
+
+ if (get_user(version, &header->version))
+ return -EFAULT;
+
+ error = -EINVAL;
+ if (version != _LINUX_CAPABILITY_VERSION) {
+ version = _LINUX_CAPABILITY_VERSION;
+ if (put_user(version, &header->version))
+ error = -EFAULT;
+ return error;
+ }
+
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ error = 0;
+
+ spin_lock(&task_capability_lock);
+
+ if (pid && pid != current->pid) {
+ read_lock(&tasklist_lock);
+ target = find_task_by_pid(pid); /* identify target of query */
+ if (!target)
+ error = -ESRCH;
+ } else {
+ target = current;
+ }
+
+ if (!error) {
+ data.permitted = cap_t(target->cap_permitted);
+ data.inheritable = cap_t(target->cap_inheritable);
+ data.effective = cap_t(target->cap_effective);
+ }
+
+ if (target != current)
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_capability_lock);
+
+ if (!error) {
+ if (copy_to_user(dataptr, &data, sizeof data))
+ return -EFAULT;
+ }
+
+ return error;
+}
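From user space the same query goes through the raw capget syscall; a hedged sketch follows (not part of this patch), with struct layout and version constant taken from the 2.4-era <linux/capability.h>. A pid of 0 in the header means the calling task, matching the branch above.

/*
 * Hedged userspace sketch (not part of this patch): query the calling
 * task's capability sets with the raw capget syscall.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/capability.h>

int main(void)
{
	struct __user_cap_header_struct hdr;
	struct __user_cap_data_struct data;

	hdr.version = _LINUX_CAPABILITY_VERSION;
	hdr.pid = 0;				/* 0 means "the calling task" */

	if (syscall(SYS_capget, &hdr, &data) != 0) {
		perror("capget");
		return 1;
	}
	printf("effective=%#x permitted=%#x inheritable=%#x\n",
	       data.effective, data.permitted, data.inheritable);
	return 0;
}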
+
+/* set capabilities for all processes in a given process group */
+
+static void cap_set_pg(int pgrp,
+ kernel_cap_t *effective,
+ kernel_cap_t *inheritable,
+ kernel_cap_t *permitted)
+{
+ struct task_struct *target;
+
+ /* FIXME: do we need to have a write lock here..? */
+ read_lock(&tasklist_lock);
+ for_each_task(target) {
+ if (target->pgrp != pgrp)
+ continue;
+ target->cap_effective = *effective;
+ target->cap_inheritable = *inheritable;
+ target->cap_permitted = *permitted;
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/* set capabilities for all processes other than 1 and self */
+
+static void cap_set_all(kernel_cap_t *effective,
+ kernel_cap_t *inheritable,
+ kernel_cap_t *permitted)
+{
+ struct task_struct *target;
+
+ /* FIXME: do we need to have a write lock here..? */
+ read_lock(&tasklist_lock);
+ /* ALL means everyone other than self or 'init' */
+ for_each_task(target) {
+ if (target == current || target->pid == 1)
+ continue;
+ target->cap_effective = *effective;
+ target->cap_inheritable = *inheritable;
+ target->cap_permitted = *permitted;
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * The restrictions on setting capabilities are specified as:
+ *
+ * [pid is for the 'target' task. 'current' is the calling task.]
+ *
+ * I: any raised capabilities must be a subset of the (old target) Inheritable
+ *    set combined with the (old current) Permitted set
+ * P: any raised capabilities must be a subset of the (old target) Permitted
+ *    set combined with the (old current) Permitted set
+ * E: must be set to a subset of the (new target) Permitted set
+ */
+
+asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
+{
+ kernel_cap_t inheritable, permitted, effective;
+ __u32 version;
+ struct task_struct *target;
+ int error, pid;
+
+ if (get_user(version, &header->version))
+ return -EFAULT;
+
+ if (version != _LINUX_CAPABILITY_VERSION) {
+ version = _LINUX_CAPABILITY_VERSION;
+ if (put_user(version, &header->version))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+ if (pid && !capable(CAP_SETPCAP))
+ return -EPERM;
+
+ if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
+ copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
+ copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
+ return -EFAULT;
+
+ error = -EPERM;
+ spin_lock(&task_capability_lock);
+
+ if (pid > 0 && pid != current->pid) {
+ read_lock(&tasklist_lock);
+ target = find_task_by_pid(pid); /* identify target of query */
+ if (!target) {
+ error = -ESRCH;
+ goto out;
+ }
+ } else {
+ target = current;
+ }
+
+
+ /* verify restrictions on target's new Inheritable set */
+ if (!cap_issubset(inheritable,
+ cap_combine(target->cap_inheritable,
+ current->cap_permitted))) {
+ goto out;
+ }
+
+ /* verify restrictions on target's new Permitted set */
+ if (!cap_issubset(permitted,
+ cap_combine(target->cap_permitted,
+ current->cap_permitted))) {
+ goto out;
+ }
+
+ /* verify the _new_Effective_ is a subset of the _new_Permitted_ */
+ if (!cap_issubset(effective, permitted)) {
+ goto out;
+ }
+
+ /* having verified that the proposed changes are legal,
+ we now put them into effect. */
+ error = 0;
+
+ if (pid < 0) {
+ if (pid == -1) /* all procs other than current and init */
+ cap_set_all(&effective, &inheritable, &permitted);
+
+ else /* all procs in process group */
+ cap_set_pg(-pid, &effective, &inheritable, &permitted);
+ goto spin_out;
+ } else {
+ /* FIXME: do we need to have a write lock here..? */
+ target->cap_effective = effective;
+ target->cap_inheritable = inheritable;
+ target->cap_permitted = permitted;
+ }
+
+out:
+ if (target != current) {
+ read_unlock(&tasklist_lock);
+ }
+spin_out:
+ spin_unlock(&task_capability_lock);
+ return error;
+}
diff --git a/kernel/context.c b/kernel/context.c
new file mode 100644
index 000000000000..864a70131c88
--- /dev/null
+++ b/kernel/context.c
@@ -0,0 +1,157 @@
+/*
+ * linux/kernel/context.c
+ *
+ * Mechanism for running arbitrary tasks in process context
+ *
+ * dwmw2@redhat.com: Genesis
+ *
+ * andrewm@uow.edu.au: 2.4.0-test12
+ * - Child reaping
+ * - Support for tasks which re-add themselves
+ * - flush_scheduled_tasks.
+ */
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/unistd.h>
+#include <linux/signal.h>
+
+static DECLARE_TASK_QUEUE(tq_context);
+static DECLARE_WAIT_QUEUE_HEAD(context_task_wq);
+static DECLARE_WAIT_QUEUE_HEAD(context_task_done);
+static int keventd_running;
+static struct task_struct *keventd_task;
+
+static int need_keventd(const char *who)
+{
+ if (keventd_running == 0)
+ printk(KERN_ERR "%s(): keventd has not started\n", who);
+ return keventd_running;
+}
+
+int current_is_keventd(void)
+{
+ int ret = 0;
+ if (need_keventd(__FUNCTION__))
+ ret = (current == keventd_task);
+ return ret;
+}
+
+/**
+ * schedule_task - schedule a function for subsequent execution in process context.
+ * @task: pointer to a &tq_struct which defines the function to be scheduled.
+ *
+ * May be called from interrupt context. The scheduled function is run at some
+ * time in the near future by the keventd kernel thread. If it can sleep, it
+ * should be designed to do so for the minimum possible time, as it will be
+ * stalling all other scheduled tasks.
+ *
+ * schedule_task() returns non-zero if the task was successfully scheduled.
+ * If @task is already residing on a task queue then schedule_task() fails
+ * to schedule your task and returns zero.
+ */
+int schedule_task(struct tq_struct *task)
+{
+ int ret;
+ need_keventd(__FUNCTION__);
+ ret = queue_task(task, &tq_context);
+ wake_up(&context_task_wq);
+ return ret;
+}
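A hedged sketch of the calling side described in the kernel-doc above: a 2.4-era driver hands work from its interrupt handler to keventd through a tq_struct. The my_dev, my_deferred_work and my_irq_handler names are hypothetical.

/*
 * Hedged sketch (not part of this patch): deferring work from interrupt
 * context to keventd with schedule_task(). All names here are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/tqueue.h>

static struct my_dev {
	struct tq_struct tq;
	int pending_events;
} my_dev;

static void my_deferred_work(void *data)
{
	struct my_dev *dev = data;

	/* runs in keventd's process context, so it may sleep briefly */
	printk(KERN_DEBUG "handled %d events\n", dev->pending_events);
}

static void my_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
{
	struct my_dev *dev = dev_id;

	dev->pending_events++;
	dev->tq.routine = my_deferred_work;
	dev->tq.data = dev;
	schedule_task(&dev->tq);	/* returns 0 if already queued */
}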
+
+static int context_thread(void *dummy)
+{
+ struct task_struct *curtask = current;
+ DECLARE_WAITQUEUE(wait, curtask);
+ struct k_sigaction sa;
+
+ daemonize();
+ strcpy(curtask->comm, "keventd");
+ keventd_running = 1;
+ keventd_task = curtask;
+
+ spin_lock_irq(&curtask->sigmask_lock);
+ siginitsetinv(&curtask->blocked, sigmask(SIGCHLD));
+ recalc_sigpending(curtask);
+ spin_unlock_irq(&curtask->sigmask_lock);
+
+ /* Install a handler so SIGCLD is delivered */
+ sa.sa.sa_handler = SIG_IGN;
+ sa.sa.sa_flags = 0;
+ siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
+ do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+
+ /*
+ * If one of the functions on a task queue re-adds itself
+ * to the task queue we call schedule() in state TASK_RUNNING
+ */
+ for (;;) {
+ set_task_state(curtask, TASK_INTERRUPTIBLE);
+ add_wait_queue(&context_task_wq, &wait);
+ if (TQ_ACTIVE(tq_context))
+ set_task_state(curtask, TASK_RUNNING);
+ schedule();
+ remove_wait_queue(&context_task_wq, &wait);
+ run_task_queue(&tq_context);
+ wake_up(&context_task_done);
+ if (signal_pending(curtask)) {
+ while (waitpid(-1, (unsigned int *)0, __WALL|WNOHANG) > 0)
+ ;
+ flush_signals(curtask);
+ recalc_sigpending(curtask);
+ }
+ }
+}
+
+/**
+ * flush_scheduled_tasks - ensure that any scheduled tasks have run to completion.
+ *
+ * Forces execution of the schedule_task() queue and blocks until its completion.
+ *
+ * If a kernel subsystem uses schedule_task() and wishes to flush any pending
+ * tasks, it should use this function. This is typically used in driver shutdown
+ * handlers.
+ *
+ * The caller should hold no spinlocks and should hold no semaphores which could
+ * cause the scheduled tasks to block.
+ */
+static struct tq_struct dummy_task;
+
+void flush_scheduled_tasks(void)
+{
+ int count;
+ DECLARE_WAITQUEUE(wait, current);
+
+ /*
+ * Do it twice. It's possible, albeit highly unlikely, that
+ * the caller queued a task immediately before calling us,
+ * and that the eventd thread was already past the run_task_queue()
+ * but not yet into wake_up(), so it woke us up before completing
+ * the caller's queued task or our new dummy task.
+ */
+ add_wait_queue(&context_task_done, &wait);
+ for (count = 0; count < 2; count++) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /* Queue a dummy task to make sure we get kicked */
+ schedule_task(&dummy_task);
+
+ /* Wait for it to complete */
+ schedule();
+ }
+ remove_wait_queue(&context_task_done, &wait);
+}
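The shutdown ordering the kernel-doc above prescribes, as a hedged sketch (not part of this patch); example_buf and example_cleanup are hypothetical names standing in for state that previously queued deferred work dereferences.

/*
 * Hedged sketch (not part of this patch): a driver shutdown path built
 * around flush_scheduled_tasks().
 */
#include <linux/slab.h>
#include <linux/tqueue.h>

static char *example_buf;

static void example_cleanup(void)
{
	/* 1. stop queueing new work for keventd (disable IRQs, etc.) */

	/* 2. drain anything already queued; hold no spinlocks or
	 *    semaphores that the deferred work itself might need */
	flush_scheduled_tasks();

	/* 3. only now free state the deferred work dereferences */
	kfree(example_buf);
	example_buf = NULL;
}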
+
+int start_context_thread(void)
+{
+ kernel_thread(context_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ return 0;
+}
+
+EXPORT_SYMBOL(schedule_task);
+EXPORT_SYMBOL(flush_scheduled_tasks);
+
diff --git a/kernel/dma.c b/kernel/dma.c
new file mode 100644
index 000000000000..3ee09759fda1
--- /dev/null
+++ b/kernel/dma.c
@@ -0,0 +1,129 @@
+/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $
+ * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
+ *
+ * Written by Hennus Bergman, 1992.
+ *
+ * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
+ * In the previous version the reported device could end up being wrong,
+ * if a device requested a DMA channel that was already in use.
+ * [It also happened to remove the sizeof(char *) == sizeof(int)
+ * assumption introduced because of those /proc/dma patches. -- Hennus]
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <asm/dma.h>
+#include <asm/system.h>
+
+
+
+/* A note on resource allocation:
+ *
+ * All drivers needing DMA channels should allocate and release them
+ * through the public routines `request_dma()' and `free_dma()'.
+ *
+ * In order to avoid problems, all processes should allocate resources in
+ * the same sequence and release them in the reverse order.
+ *
+ * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
+ * When releasing them, first release the DMA, then release the IRQ.
+ * If you don't, you may cause allocation requests to fail unnecessarily.
+ * This doesn't really matter now, but it will once we get real semaphores
+ * in the kernel.
+ */
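A hedged sketch of the ordering the note above asks for, in a 2.4-style ISA driver; MY_IRQ, MY_DMA, my_interrupt, my_probe and my_remove are all hypothetical.

/*
 * Hedged sketch (not part of this patch): allocate the IRQ before the
 * DMA channel and release them in reverse order, as recommended above.
 */
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <asm/dma.h>

#define MY_IRQ	5
#define MY_DMA	1

static void my_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/* acknowledge the device here */
}

static int my_probe(void)
{
	int err;

	err = request_irq(MY_IRQ, my_interrupt, 0, "mydev", NULL);
	if (err)
		return err;			/* IRQ first ... */

	err = request_dma(MY_DMA, "mydev");	/* ... then the DMA channel */
	if (err) {
		free_irq(MY_IRQ, NULL);
		return err;
	}
	return 0;
}

static void my_remove(void)
{
	free_dma(MY_DMA);			/* DMA first on the way out */
	free_irq(MY_IRQ, NULL);
}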
+
+
+spinlock_t dma_spin_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * If our port doesn't define this it has no PC like DMA
+ */
+
+#ifdef MAX_DMA_CHANNELS
+
+
+/* Channel n is busy iff dma_chan_busy[n].lock != 0.
+ * DMA0 used to be reserved for DRAM refresh, but apparently not any more...
+ * DMA4 is reserved for cascading.
+ */
+
+struct dma_chan {
+ int lock;
+ const char *device_id;
+};
+
+static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
+ { 0, 0 },
+ { 0, 0 },
+ { 0, 0 },
+ { 0, 0 },
+ { 1, "cascade" },
+ { 0, 0 },
+ { 0, 0 },
+ { 0, 0 }
+};
+
+int get_dma_list(char *buf)
+{
+ int i, len = 0;
+
+ for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
+ if (dma_chan_busy[i].lock) {
+ len += sprintf(buf+len, "%2d: %s\n",
+ i,
+ dma_chan_busy[i].device_id);
+ }
+ }
+ return len;
+} /* get_dma_list */
+
+
+int request_dma(unsigned int dmanr, const char * device_id)
+{
+ if (dmanr >= MAX_DMA_CHANNELS)
+ return -EINVAL;
+
+ if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
+ return -EBUSY;
+
+ dma_chan_busy[dmanr].device_id = device_id;
+
+ /* old flag was 0, now contains 1 to indicate busy */
+ return 0;
+} /* request_dma */
+
+
+void free_dma(unsigned int dmanr)
+{
+ if (dmanr >= MAX_DMA_CHANNELS) {
+ printk("Trying to free DMA%d\n", dmanr);
+ return;
+ }
+
+ if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
+ printk("Trying to free free DMA%d\n", dmanr);
+ return;
+ }
+
+} /* free_dma */
+
+#else
+
+int request_dma(unsigned int dmanr, const char *device_id)
+{
+ return -EINVAL;
+}
+
+int free_dma(unsigned int dmanr)
+{
+ return -EINVAL;
+}
+
+int get_dma_list(char *buf)
+{
+ strcpy(buf, "No DMA\n");
+ return 7;
+}
+#endif
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
new file mode 100644
index 000000000000..1daf64cc19b6
--- /dev/null
+++ b/kernel/exec_domain.c
@@ -0,0 +1,163 @@
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+
+static asmlinkage void no_lcall7(int segment, struct pt_regs * regs);
+
+
+static unsigned long ident_map[32] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31
+};
+
+struct exec_domain default_exec_domain = {
+ "Linux", /* name */
+ no_lcall7, /* lcall7 causes a seg fault. */
+ 0, 0xff, /* All personalities. */
+ ident_map, /* Identity map signals. */
+ ident_map, /* - both ways. */
+ NULL, /* No usage counter. */
+ NULL /* Nothing after this in the list. */
+};
+
+static struct exec_domain *exec_domains = &default_exec_domain;
+static rwlock_t exec_domains_lock = RW_LOCK_UNLOCKED;
+
+static asmlinkage void no_lcall7(int segment, struct pt_regs * regs)
+{
+ /*
+ * This may have been a statically linked SVr4 binary, so we would have the
+ * personality set incorrectly. Check to see whether SVr4 is available,
+ * and use it, otherwise give the user a SEGV.
+ */
+ set_personality(PER_SVR4);
+
+ if (current->exec_domain && current->exec_domain->handler
+ && current->exec_domain->handler != no_lcall7) {
+ current->exec_domain->handler(segment, regs);
+ return;
+ }
+
+ send_sig(SIGSEGV, current, 1);
+}
+
+static struct exec_domain *lookup_exec_domain(unsigned long personality)
+{
+ unsigned long pers = personality & PER_MASK;
+ struct exec_domain *it;
+
+ read_lock(&exec_domains_lock);
+ for (it=exec_domains; it; it=it->next)
+ if (pers >= it->pers_low && pers <= it->pers_high) {
+ if (!try_inc_mod_count(it->module))
+ continue;
+ read_unlock(&exec_domains_lock);
+ return it;
+ }
+ read_unlock(&exec_domains_lock);
+
+ /* Should never get this far. */
+ printk(KERN_ERR "No execution domain for personality 0x%02lx\n", pers);
+ return NULL;
+}
+
+int register_exec_domain(struct exec_domain *it)
+{
+ struct exec_domain *tmp;
+
+ if (!it)
+ return -EINVAL;
+ if (it->next)
+ return -EBUSY;
+ write_lock(&exec_domains_lock);
+ for (tmp=exec_domains; tmp; tmp=tmp->next)
+ if (tmp == it) {
+ write_unlock(&exec_domains_lock);
+ return -EBUSY;
+ }
+ it->next = exec_domains;
+ exec_domains = it;
+ write_unlock(&exec_domains_lock);
+ return 0;
+}
+
+int unregister_exec_domain(struct exec_domain *it)
+{
+ struct exec_domain ** tmp;
+
+ tmp = &exec_domains;
+ write_lock(&exec_domains_lock);
+ while (*tmp) {
+ if (it == *tmp) {
+ *tmp = it->next;
+ it->next = NULL;
+ write_unlock(&exec_domains_lock);
+ return 0;
+ }
+ tmp = &(*tmp)->next;
+ }
+ write_unlock(&exec_domains_lock);
+ return -EINVAL;
+}
+
+void __set_personality(unsigned long personality)
+{
+ struct exec_domain *it, *prev;
+
+ it = lookup_exec_domain(personality);
+ if (it == current->exec_domain) {
+ current->personality = personality;
+ return;
+ }
+ if (!it)
+ return;
+ if (atomic_read(&current->fs->count) != 1) {
+ struct fs_struct *new = copy_fs_struct(current->fs);
+ struct fs_struct *old;
+ if (!new) {
+ put_exec_domain(it);
+ return;
+ }
+ task_lock(current);
+ old = current->fs;
+ current->fs = new;
+ task_unlock(current);
+ put_fs_struct(old);
+ }
+ /*
+ * At that point we are guaranteed to be the sole owner of
+ * current->fs.
+ */
+ current->personality = personality;
+ prev = current->exec_domain;
+ current->exec_domain = it;
+ set_fs_altroot();
+ put_exec_domain(prev);
+}
+
+asmlinkage long sys_personality(unsigned long personality)
+{
+ int ret = current->personality;
+ if (personality != 0xffffffff) {
+ set_personality(personality);
+ if (current->personality != personality)
+ ret = -EINVAL;
+ }
+ return ret;
+}
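A hedged userspace sketch (not part of this patch) of the syscall above: query and change the execution domain through the personality(2) wrapper, where an argument of 0xffffffff means "query only".

/*
 * Hedged userspace sketch (not part of this patch): mirror of
 * sys_personality() above, via the glibc personality(2) wrapper.
 */
#include <stdio.h>
#include <sys/personality.h>

int main(void)
{
	int old = personality(0xffffffff);	/* query, change nothing */

	printf("current personality: %#x\n", (unsigned int)old);

	if (personality(PER_LINUX) < 0)		/* select the plain Linux domain */
		perror("personality");
	return 0;
}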
+
+int get_exec_domain_list(char * page)
+{
+ int len = 0;
+ struct exec_domain * e;
+
+ read_lock(&exec_domains_lock);
+ for (e=exec_domains; e && len < PAGE_SIZE - 80; e=e->next)
+ len += sprintf(page+len, "%d-%d\t%-16s\t[%s]\n",
+ e->pers_low, e->pers_high, e->name,
+ e->module ? e->module->name : "kernel");
+ read_unlock(&exec_domains_lock);
+ return len;
+}
diff --git a/kernel/exit.c b/kernel/exit.c
new file mode 100644
index 000000000000..c747f547b182
--- /dev/null
+++ b/kernel/exit.c
@@ -0,0 +1,596 @@
+/*
+ * linux/kernel/exit.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/malloc.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#ifdef CONFIG_BSD_PROCESS_ACCT
+#include <linux/acct.h>
+#endif
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+
+extern void sem_exit (void);
+extern struct task_struct *child_reaper;
+
+int getrusage(struct task_struct *, int, struct rusage *);
+
+static void release_task(struct task_struct * p)
+{
+ if (p != current) {
+#ifdef CONFIG_SMP
+ /*
+ * Wait to make sure the process isn't on the
+ * runqueue (active on some other CPU still)
+ */
+ for (;;) {
+ task_lock(p);
+ if (!p->has_cpu)
+ break;
+ task_unlock(p);
+ do {
+ barrier();
+ } while (p->has_cpu);
+ }
+ task_unlock(p);
+#endif
+ atomic_dec(&p->user->processes);
+ free_uid(p->user);
+ unhash_process(p);
+
+ release_thread(p);
+ current->cmin_flt += p->min_flt + p->cmin_flt;
+ current->cmaj_flt += p->maj_flt + p->cmaj_flt;
+ current->cnswap += p->nswap + p->cnswap;
+ /*
+ * Potentially available timeslices are retrieved
+ * here - this way the parent does not get penalized
+ * for creating too many processes.
+ *
+ * (this cannot be used to artificially 'generate'
+ * timeslices, because any timeslice recovered here
+ * was given away by the parent in the first place.)
+ */
+ current->counter += p->counter;
+ if (current->counter >= MAX_COUNTER)
+ current->counter = MAX_COUNTER;
+ free_task_struct(p);
+ } else {
+ printk("task releasing itself\n");
+ }
+}
+
+/*
+ * This checks not only the pgrp, but falls back on the pid if no
+ * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
+ * without this...
+ */
+int session_of_pgrp(int pgrp)
+{
+ struct task_struct *p;
+ int fallback;
+
+ fallback = -1;
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if (p->session <= 0)
+ continue;
+ if (p->pgrp == pgrp) {
+ fallback = p->session;
+ break;
+ }
+ if (p->pid == pgrp)
+ fallback = p->session;
+ }
+ read_unlock(&tasklist_lock);
+ return fallback;
+}
+
+/*
+ * Determine if a process group is "orphaned", according to the POSIX
+ * definition in 2.2.2.52. Orphaned process groups are not to be affected
+ * by terminal-generated stop signals. Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(int pgrp, struct task_struct * ignored_task)
+{
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if ((p == ignored_task) || (p->pgrp != pgrp) ||
+ (p->state == TASK_ZOMBIE) ||
+ (p->p_pptr->pid == 1))
+ continue;
+ if ((p->p_pptr->pgrp != pgrp) &&
+ (p->p_pptr->session == p->session)) {
+ read_unlock(&tasklist_lock);
+ return 0;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return 1; /* (sighing) "Often!" */
+}
+
+int is_orphaned_pgrp(int pgrp)
+{
+ return will_become_orphaned_pgrp(pgrp, 0);
+}
+
+static inline int has_stopped_jobs(int pgrp)
+{
+ int retval = 0;
+ struct task_struct * p;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if (p->pgrp != pgrp)
+ continue;
+ if (p->state != TASK_STOPPED)
+ continue;
+ retval = 1;
+ break;
+ }
+ read_unlock(&tasklist_lock);
+ return retval;
+}
+
+/*
+ * When we die, we re-parent all our children.
+ * Try to give them to another thread in our process
+ * group, and if no such member exists, give it to
+ * the global child reaper process (ie "init")
+ */
+static inline void forget_original_parent(struct task_struct * father)
+{
+ struct task_struct * p, *reaper;
+
+ read_lock(&tasklist_lock);
+
+ /* Next in our thread group */
+ reaper = next_thread(father);
+ if (reaper == father)
+ reaper = child_reaper;
+
+ for_each_task(p) {
+ if (p->p_opptr == father) {
+ /* We don't want people slaying init */
+ p->exit_signal = SIGCHLD;
+ p->self_exec_id++;
+ p->p_opptr = reaper;
+ if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0);
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static inline void close_files(struct files_struct * files)
+{
+ int i, j;
+
+ j = 0;
+ for (;;) {
+ unsigned long set;
+ i = j * __NFDBITS;
+ if (i >= files->max_fdset || i >= files->max_fds)
+ break;
+ set = files->open_fds->fds_bits[j++];
+ while (set) {
+ if (set & 1) {
+ struct file * file = xchg(&files->fd[i], NULL);
+ if (file)
+ filp_close(file, files);
+ }
+ i++;
+ set >>= 1;
+ }
+ }
+}
+
+void put_files_struct(struct files_struct *files)
+{
+ if (atomic_dec_and_test(&files->count)) {
+ close_files(files);
+ /*
+ * Free the fd and fdset arrays if we expanded them.
+ */
+ if (files->fd != &files->fd_array[0])
+ free_fd_array(files->fd, files->max_fds);
+ if (files->max_fdset > __FD_SETSIZE) {
+ free_fdset(files->open_fds, files->max_fdset);
+ free_fdset(files->close_on_exec, files->max_fdset);
+ }
+ kmem_cache_free(files_cachep, files);
+ }
+}
+
+static inline void __exit_files(struct task_struct *tsk)
+{
+ struct files_struct * files = tsk->files;
+
+ if (files) {
+ task_lock(tsk);
+ tsk->files = NULL;
+ task_unlock(tsk);
+ put_files_struct(files);
+ }
+}
+
+void exit_files(struct task_struct *tsk)
+{
+ __exit_files(tsk);
+}
+
+static inline void __put_fs_struct(struct fs_struct *fs)
+{
+ /* No need to hold fs->lock if we are killing it */
+ if (atomic_dec_and_test(&fs->count)) {
+ dput(fs->root);
+ mntput(fs->rootmnt);
+ dput(fs->pwd);
+ mntput(fs->pwdmnt);
+ if (fs->altroot) {
+ dput(fs->altroot);
+ mntput(fs->altrootmnt);
+ }
+ kmem_cache_free(fs_cachep, fs);
+ }
+}
+
+void put_fs_struct(struct fs_struct *fs)
+{
+ __put_fs_struct(fs);
+}
+
+static inline void __exit_fs(struct task_struct *tsk)
+{
+ struct fs_struct * fs = tsk->fs;
+
+ if (fs) {
+ task_lock(tsk);
+ tsk->fs = NULL;
+ task_unlock(tsk);
+ __put_fs_struct(fs);
+ }
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+ __exit_fs(tsk);
+}
+
+/*
+ * We can use these to temporarily drop into
+ * "lazy TLB" mode and back.
+ */
+struct mm_struct * start_lazy_tlb(void)
+{
+ struct mm_struct *mm = current->mm;
+ current->mm = NULL;
+ /* active_mm is still 'mm' */
+ atomic_inc(&mm->mm_count);
+ enter_lazy_tlb(mm, current, smp_processor_id());
+ return mm;
+}
+
+void end_lazy_tlb(struct mm_struct *mm)
+{
+ struct mm_struct *active_mm = current->active_mm;
+
+ current->mm = mm;
+ if (mm != active_mm) {
+ current->active_mm = mm;
+ activate_mm(active_mm, mm);
+ }
+ mmdrop(active_mm);
+}
+
+/*
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+static inline void __exit_mm(struct task_struct * tsk)
+{
+ struct mm_struct * mm = tsk->mm;
+
+ mm_release();
+ if (mm) {
+ atomic_inc(&mm->mm_count);
+ if (mm != tsk->active_mm) BUG();
+ /* more a memory barrier than a real lock */
+ task_lock(tsk);
+ tsk->mm = NULL;
+ task_unlock(tsk);
+ enter_lazy_tlb(mm, current, smp_processor_id());
+ mmput(mm);
+ }
+}
+
+void exit_mm(struct task_struct *tsk)
+{
+ __exit_mm(tsk);
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us..
+ */
+static void exit_notify(void)
+{
+ struct task_struct * p, *t;
+
+ forget_original_parent(current);
+ /*
+ * Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ *
+ * Case i: Our father is in a different pgrp than we are
+ * and we were the only connection outside, so our pgrp
+ * is about to become orphaned.
+ */
+
+ t = current->p_pptr;
+
+ if ((t->pgrp != current->pgrp) &&
+ (t->session == current->session) &&
+ will_become_orphaned_pgrp(current->pgrp, current) &&
+ has_stopped_jobs(current->pgrp)) {
+ kill_pg(current->pgrp,SIGHUP,1);
+ kill_pg(current->pgrp,SIGCONT,1);
+ }
+
+ /* Let father know we died
+ *
+ * Thread signals are configurable, but you aren't going to use
+ * that to send signals to arbitrary processes.
+ * That stops right now.
+ *
+ * If the parent exec id doesn't match the exec id we saved
+ * when we started then we know the parent has changed security
+ * domain.
+ *
+ * If our self_exec id doesn't match our parent_exec_id then
+ * we have changed execution domain as these two values started
+ * the same after a fork.
+ *
+ */
+
+ if(current->exit_signal != SIGCHLD &&
+ ( current->parent_exec_id != t->self_exec_id ||
+ current->self_exec_id != current->parent_exec_id)
+ && !capable(CAP_KILL))
+ current->exit_signal = SIGCHLD;
+
+
+ /*
+ * This loop does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+
+ write_lock_irq(&tasklist_lock);
+ current->state = TASK_ZOMBIE;
+ do_notify_parent(current, current->exit_signal);
+ while (current->p_cptr != NULL) {
+ p = current->p_cptr;
+ current->p_cptr = p->p_osptr;
+ p->p_ysptr = NULL;
+ p->ptrace = 0;
+
+ p->p_pptr = p->p_opptr;
+ p->p_osptr = p->p_pptr->p_cptr;
+ if (p->p_osptr)
+ p->p_osptr->p_ysptr = p;
+ p->p_pptr->p_cptr = p;
+ if (p->state == TASK_ZOMBIE)
+ do_notify_parent(p, p->exit_signal);
+ /*
+ * process group orphan check
+ * Case ii: Our child is in a different pgrp
+ * than we are, and it was the only connection
+ * outside, so the child pgrp is now orphaned.
+ */
+ if ((p->pgrp != current->pgrp) &&
+ (p->session == current->session)) {
+ int pgrp = p->pgrp;
+
+ write_unlock_irq(&tasklist_lock);
+ if (is_orphaned_pgrp(pgrp) && has_stopped_jobs(pgrp)) {
+ kill_pg(pgrp,SIGHUP,1);
+ kill_pg(pgrp,SIGCONT,1);
+ }
+ write_lock_irq(&tasklist_lock);
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+}
+
+NORET_TYPE void do_exit(long code)
+{
+ struct task_struct *tsk = current;
+
+ if (in_interrupt())
+ panic("Aiee, killing interrupt handler!");
+ if (!tsk->pid)
+ panic("Attempted to kill the idle task!");
+ if (tsk->pid == 1)
+ panic("Attempted to kill init!");
+ tsk->flags |= PF_EXITING;
+ del_timer_sync(&tsk->real_timer);
+
+fake_volatile:
+#ifdef CONFIG_BSD_PROCESS_ACCT
+ acct_process(code);
+#endif
+ __exit_mm(tsk);
+
+ lock_kernel();
+ sem_exit();
+ __exit_files(tsk);
+ __exit_fs(tsk);
+ exit_sighand(tsk);
+ exit_thread();
+
+ if (current->leader)
+ disassociate_ctty(1);
+
+ put_exec_domain(tsk->exec_domain);
+ if (tsk->binfmt && tsk->binfmt->module)
+ __MOD_DEC_USE_COUNT(tsk->binfmt->module);
+
+ tsk->exit_code = code;
+ exit_notify();
+ schedule();
+ BUG();
+/*
+ * In order to get rid of the "volatile function does return" message
+ * I did this little loop that confuses gcc to think do_exit really
+ * is volatile. In fact it's schedule() that is volatile in some
+ * circumstances: when current->state = ZOMBIE, schedule() never
+ * returns.
+ *
+ * In fact the natural way to do all this is to have the label and the
+ * goto right after each other, but I put the fake_volatile label at
+ * the start of the function just in case something /really/ bad
+ * happens, and the schedule returns. This way we can try again. I'm
+ * not paranoid: it's just that everybody is out to get me.
+ */
+ goto fake_volatile;
+}
+
+NORET_TYPE void up_and_exit(struct semaphore *sem, long code)
+{
+ if (sem)
+ up(sem);
+
+ do_exit(code);
+}
+
+asmlinkage long sys_exit(int error_code)
+{
+ do_exit((error_code&0xff)<<8);
+}
+
+asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru)
+{
+ int flag, retval;
+ DECLARE_WAITQUEUE(wait, current);
+ struct task_struct *tsk;
+
+ if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
+ return -EINVAL;
+
+ add_wait_queue(&current->wait_chldexit,&wait);
+repeat:
+ flag = 0;
+ current->state = TASK_INTERRUPTIBLE;
+ read_lock(&tasklist_lock);
+ tsk = current;
+ do {
+ struct task_struct *p;
+ for (p = tsk->p_cptr ; p ; p = p->p_osptr) {
+ if (pid>0) {
+ if (p->pid != pid)
+ continue;
+ } else if (!pid) {
+ if (p->pgrp != current->pgrp)
+ continue;
+ } else if (pid != -1) {
+ if (p->pgrp != -pid)
+ continue;
+ }
+ /* Wait for all children (clone and not) if __WALL is set;
+ * otherwise, wait for clone children *only* if __WCLONE is
+ * set; otherwise, wait for non-clone children *only*. (Note:
+ * A "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD.) */
+ if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+ && !(options & __WALL))
+ continue;
+ flag = 1;
+ switch (p->state) {
+ case TASK_STOPPED:
+ if (!p->exit_code)
+ continue;
+ if (!(options & WUNTRACED) && !(p->ptrace & PT_PTRACED))
+ continue;
+ read_unlock(&tasklist_lock);
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ if (!retval && stat_addr)
+ retval = put_user((p->exit_code << 8) | 0x7f, stat_addr);
+ if (!retval) {
+ p->exit_code = 0;
+ retval = p->pid;
+ }
+ goto end_wait4;
+ case TASK_ZOMBIE:
+ current->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime;
+ current->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime;
+ read_unlock(&tasklist_lock);
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ if (!retval && stat_addr)
+ retval = put_user(p->exit_code, stat_addr);
+ if (retval)
+ goto end_wait4;
+ retval = p->pid;
+ if (p->p_opptr != p->p_pptr) {
+ write_lock_irq(&tasklist_lock);
+ REMOVE_LINKS(p);
+ p->p_pptr = p->p_opptr;
+ SET_LINKS(p);
+ do_notify_parent(p, SIGCHLD);
+ write_unlock_irq(&tasklist_lock);
+ } else
+ release_task(p);
+ goto end_wait4;
+ default:
+ continue;
+ }
+ }
+ if (options & __WNOTHREAD)
+ break;
+ tsk = next_thread(tsk);
+ } while (tsk != current);
+ read_unlock(&tasklist_lock);
+ if (flag) {
+ retval = 0;
+ if (options & WNOHANG)
+ goto end_wait4;
+ retval = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto end_wait4;
+ schedule();
+ goto repeat;
+ }
+ retval = -ECHILD;
+end_wait4:
+ current->state = TASK_RUNNING;
+ remove_wait_queue(&current->wait_chldexit,&wait);
+ return retval;
+}
+
+#if !defined(__alpha__) && !defined(__ia64__)
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+asmlinkage long sys_waitpid(pid_t pid,unsigned int * stat_addr, int options)
+{
+ return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif
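The comment above says waitpid() belongs in libc as a thin layer over wait4(); a one-function userspace sketch of that mapping follows (hedged; my_waitpid is a hypothetical name so it does not shadow libc's own symbol).

/*
 * Hedged userspace sketch (not part of this patch): waitpid() expressed
 * in terms of wait4(), as the comment above suggests libc should do.
 */
#include <sys/resource.h>
#include <sys/wait.h>

pid_t my_waitpid(pid_t pid, int *status, int options)
{
	return wait4(pid, status, options, (struct rusage *)0);
}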
diff --git a/kernel/fork.c b/kernel/fork.c
new file mode 100644
index 000000000000..99c1f2317992
--- /dev/null
+++ b/kernel/fork.c
@@ -0,0 +1,771 @@
+/*
+ * linux/kernel/fork.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * 'fork.c' contains the help-routines for the 'fork' system call
+ * (see also entry.S and others).
+ * Fork is rather simple, once you get the hang of it, but the memory
+ * management can be a bitch. See 'mm/memory.c': 'copy_page_tables()'
+ */
+
+#include <linux/config.h>
+#include <linux/malloc.h>
+#include <linux/init.h>
+#include <linux/unistd.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+
+/* The idle threads do not count.. */
+int nr_threads;
+int nr_running;
+
+int max_threads;
+unsigned long total_forks; /* Handle normal Linux uptimes. */
+int last_pid;
+
+struct task_struct *pidhash[PIDHASH_SZ];
+
+void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
+{
+ unsigned long flags;
+
+ wq_write_lock_irqsave(&q->lock, flags);
+ wait->flags = 0;
+ __add_wait_queue(q, wait);
+ wq_write_unlock_irqrestore(&q->lock, flags);
+}
+
+void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
+{
+ unsigned long flags;
+
+ wq_write_lock_irqsave(&q->lock, flags);
+ wait->flags = WQ_FLAG_EXCLUSIVE;
+ __add_wait_queue_tail(q, wait);
+ wq_write_unlock_irqrestore(&q->lock, flags);
+}
+
+void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
+{
+ unsigned long flags;
+
+ wq_write_lock_irqsave(&q->lock, flags);
+ __remove_wait_queue(q, wait);
+ wq_write_unlock_irqrestore(&q->lock, flags);
+}
+
+void __init fork_init(unsigned long mempages)
+{
+ /*
+ * The default maximum number of threads is set to a safe
+ * value: the thread structures can take up at most half
+ * of memory.
+ */
+ max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 2;
+
+ init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+ init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+}
+
+/* Protects next_safe and last_pid. */
+spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
+
+static int get_pid(unsigned long flags)
+{
+ static int next_safe = PID_MAX;
+ struct task_struct *p;
+
+ if (flags & CLONE_PID)
+ return current->pid;
+
+ spin_lock(&lastpid_lock);
+ if((++last_pid) & 0xffff8000) {
+ last_pid = 300; /* Skip daemons etc. */
+ goto inside;
+ }
+ if(last_pid >= next_safe) {
+inside:
+ next_safe = PID_MAX;
+ read_lock(&tasklist_lock);
+ repeat:
+ for_each_task(p) {
+ if(p->pid == last_pid ||
+ p->pgrp == last_pid ||
+ p->session == last_pid) {
+ if(++last_pid >= next_safe) {
+ if(last_pid & 0xffff8000)
+ last_pid = 300;
+ next_safe = PID_MAX;
+ }
+ goto repeat;
+ }
+ if(p->pid > last_pid && next_safe > p->pid)
+ next_safe = p->pid;
+ if(p->pgrp > last_pid && next_safe > p->pgrp)
+ next_safe = p->pgrp;
+ if(p->session > last_pid && next_safe > p->session)
+ next_safe = p->session;
+ }
+ read_unlock(&tasklist_lock);
+ }
+ spin_unlock(&lastpid_lock);
+
+ return last_pid;
+}
+
+static inline int dup_mmap(struct mm_struct * mm)
+{
+ struct vm_area_struct * mpnt, *tmp, **pprev;
+ int retval;
+
+ flush_cache_mm(current->mm);
+ mm->locked_vm = 0;
+ mm->mmap = NULL;
+ mm->mmap_avl = NULL;
+ mm->mmap_cache = NULL;
+ mm->map_count = 0;
+ mm->cpu_vm_mask = 0;
+ mm->swap_cnt = 0;
+ mm->swap_address = 0;
+ pprev = &mm->mmap;
+ for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+ struct file *file;
+
+ retval = -ENOMEM;
+ if(mpnt->vm_flags & VM_DONTCOPY)
+ continue;
+ tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!tmp)
+ goto fail_nomem;
+ *tmp = *mpnt;
+ tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_mm = mm;
+ mm->map_count++;
+ tmp->vm_next = NULL;
+ file = tmp->vm_file;
+ if (file) {
+ struct inode *inode = file->f_dentry->d_inode;
+ get_file(file);
+ if (tmp->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+
+ /* insert tmp into the share list, just after mpnt */
+ spin_lock(&inode->i_mapping->i_shared_lock);
+ if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
+ mpnt->vm_next_share->vm_pprev_share =
+ &tmp->vm_next_share;
+ mpnt->vm_next_share = tmp;
+ tmp->vm_pprev_share = &mpnt->vm_next_share;
+ spin_unlock(&inode->i_mapping->i_shared_lock);
+ }
+
+ /* Copy the pages, but defer checking for errors */
+ retval = copy_page_range(mm, current->mm, tmp);
+ if (!retval && tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ /*
+ * Link in the new vma even if an error occurred,
+ * so that exit_mmap() can clean up the mess.
+ */
+ *pprev = tmp;
+ pprev = &tmp->vm_next;
+
+ if (retval)
+ goto fail_nomem;
+ }
+ retval = 0;
+ if (mm->map_count >= AVL_MIN_MAP_COUNT)
+ build_mmap_avl(mm);
+
+fail_nomem:
+ flush_tlb_mm(current->mm);
+ return retval;
+}
+
+spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;
+
+#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+
+static struct mm_struct * mm_init(struct mm_struct * mm)
+{
+ atomic_set(&mm->mm_users, 1);
+ atomic_set(&mm->mm_count, 1);
+ init_MUTEX(&mm->mmap_sem);
+ mm->page_table_lock = SPIN_LOCK_UNLOCKED;
+ mm->pgd = pgd_alloc();
+ if (mm->pgd)
+ return mm;
+ free_mm(mm);
+ return NULL;
+}
+
+
+/*
+ * Allocate and initialize an mm_struct.
+ */
+struct mm_struct * mm_alloc(void)
+{
+ struct mm_struct * mm;
+
+ mm = allocate_mm();
+ if (mm) {
+ memset(mm, 0, sizeof(*mm));
+ return mm_init(mm);
+ }
+ return NULL;
+}
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+inline void __mmdrop(struct mm_struct *mm)
+{
+ if (mm == &init_mm) BUG();
+ pgd_free(mm->pgd);
+ destroy_context(mm);
+ free_mm(mm);
+}
+
+/*
+ * Decrement the use count and release all resources for an mm.
+ */
+void mmput(struct mm_struct *mm)
+{
+ if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ exit_mmap(mm);
+ mmdrop(mm);
+ }
+}
+
+/* Please note the differences between mmput and mm_release.
+ * mmput is called whenever we stop holding onto a mm_struct,
+ * error, success, whatever.
+ *
+ * mm_release is called after a mm_struct has been removed
+ * from the current process.
+ *
+ * This difference is important for error handling, when we
+ * only half set up a mm_struct for a new process and need to restore
+ * the old one. Because we mmput the new mm_struct before
+ * restoring the old one. . .
+ * Eric Biederman 10 January 1998
+ */
+void mm_release(void)
+{
+ struct task_struct *tsk = current;
+
+ /* notify parent sleeping on vfork() */
+ if (tsk->flags & PF_VFORK) {
+ tsk->flags &= ~PF_VFORK;
+ up(tsk->p_opptr->vfork_sem);
+ }
+}
+
+static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct mm_struct * mm, *oldmm;
+ int retval;
+
+ tsk->min_flt = tsk->maj_flt = 0;
+ tsk->cmin_flt = tsk->cmaj_flt = 0;
+ tsk->nswap = tsk->cnswap = 0;
+
+ tsk->mm = NULL;
+ tsk->active_mm = NULL;
+
+ /*
+ * Are we cloning a kernel thread?
+ *
+ * We need to steal an active VM for that..
+ */
+ oldmm = current->mm;
+ if (!oldmm)
+ return 0;
+
+ if (clone_flags & CLONE_VM) {
+ atomic_inc(&oldmm->mm_users);
+ mm = oldmm;
+ goto good_mm;
+ }
+
+ retval = -ENOMEM;
+ mm = allocate_mm();
+ if (!mm)
+ goto fail_nomem;
+
+ /* Copy the current MM stuff.. */
+ memcpy(mm, oldmm, sizeof(*mm));
+ if (!mm_init(mm))
+ goto fail_nomem;
+
+ down(&oldmm->mmap_sem);
+ retval = dup_mmap(mm);
+ up(&oldmm->mmap_sem);
+
+ /*
+ * Add it to the mmlist after the parent.
+ *
+ * Doing it this way means that we can order
+ * the list, and fork() won't mess up the
+ * ordering significantly.
+ */
+ spin_lock(&mmlist_lock);
+ list_add(&mm->mmlist, &oldmm->mmlist);
+ spin_unlock(&mmlist_lock);
+
+ if (retval)
+ goto free_pt;
+
+ /*
+ * child gets a private LDT (if there was an LDT in the parent)
+ */
+ copy_segments(tsk, mm);
+
+ if (init_new_context(tsk,mm))
+ goto free_pt;
+
+good_mm:
+ tsk->mm = mm;
+ tsk->active_mm = mm;
+ return 0;
+
+free_pt:
+ mmput(mm);
+fail_nomem:
+ return retval;
+}
+
+static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
+{
+ struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+ /* We don't need to lock fs - think why ;-) */
+ if (fs) {
+ atomic_set(&fs->count, 1);
+ fs->lock = RW_LOCK_UNLOCKED;
+ fs->umask = old->umask;
+ read_lock(&old->lock);
+ fs->rootmnt = mntget(old->rootmnt);
+ fs->root = dget(old->root);
+ fs->pwdmnt = mntget(old->pwdmnt);
+ fs->pwd = dget(old->pwd);
+ if (old->altroot) {
+ fs->altrootmnt = mntget(old->altrootmnt);
+ fs->altroot = dget(old->altroot);
+ } else {
+ fs->altrootmnt = NULL;
+ fs->altroot = NULL;
+ }
+ read_unlock(&old->lock);
+ }
+ return fs;
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+ return __copy_fs_struct(old);
+}
+
+static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
+{
+ if (clone_flags & CLONE_FS) {
+ atomic_inc(&current->fs->count);
+ return 0;
+ }
+ tsk->fs = __copy_fs_struct(current->fs);
+ if (!tsk->fs)
+ return -1;
+ return 0;
+}
+
+static int count_open_files(struct files_struct *files, int size)
+{
+ int i;
+
+ /* Find the last open fd */
+ for (i = size/(8*sizeof(long)); i > 0; ) {
+ if (files->open_fds->fds_bits[--i])
+ break;
+ }
+ i = (i+1) * 8 * sizeof(long);
+ return i;
+}
+
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct files_struct *oldf, *newf;
+ struct file **old_fds, **new_fds;
+ int open_files, nfds, size, i, error = 0;
+
+ /*
+ * A background process may not have any files ...
+ */
+ oldf = current->files;
+ if (!oldf)
+ goto out;
+
+ if (clone_flags & CLONE_FILES) {
+ atomic_inc(&oldf->count);
+ goto out;
+ }
+
+ tsk->files = NULL;
+ error = -ENOMEM;
+ newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+ if (!newf)
+ goto out;
+
+ atomic_set(&newf->count, 1);
+
+ newf->file_lock = RW_LOCK_UNLOCKED;
+ newf->next_fd = 0;
+ newf->max_fds = NR_OPEN_DEFAULT;
+ newf->max_fdset = __FD_SETSIZE;
+ newf->close_on_exec = &newf->close_on_exec_init;
+ newf->open_fds = &newf->open_fds_init;
+ newf->fd = &newf->fd_array[0];
+
+ /* We don't yet have the oldf readlock, but even if the old
+ fdset gets grown now, we'll only copy up to "size" fds */
+ size = oldf->max_fdset;
+ if (size > __FD_SETSIZE) {
+ newf->max_fdset = 0;
+ write_lock(&newf->file_lock);
+ error = expand_fdset(newf, size);
+ write_unlock(&newf->file_lock);
+ if (error)
+ goto out_release;
+ }
+ read_lock(&oldf->file_lock);
+
+ open_files = count_open_files(oldf, size);
+
+ /*
+ * Check whether we need to allocate a larger fd array.
+ * Note: we're not a clone task, so the open count won't
+ * change.
+ */
+ nfds = NR_OPEN_DEFAULT;
+ if (open_files > nfds) {
+ read_unlock(&oldf->file_lock);
+ newf->max_fds = 0;
+ write_lock(&newf->file_lock);
+ error = expand_fd_array(newf, open_files);
+ write_unlock(&newf->file_lock);
+ if (error)
+ goto out_release;
+ nfds = newf->max_fds;
+ read_lock(&oldf->file_lock);
+ }
+
+ old_fds = oldf->fd;
+ new_fds = newf->fd;
+
+ memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
+ memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
+
+ for (i = open_files; i != 0; i--) {
+ struct file *f = *old_fds++;
+ if (f)
+ get_file(f);
+ *new_fds++ = f;
+ }
+ read_unlock(&oldf->file_lock);
+
+ /* compute the remainder to be cleared */
+ size = (newf->max_fds - open_files) * sizeof(struct file *);
+
+ /* This is long word aligned thus could use an optimized version */
+ memset(new_fds, 0, size);
+
+ if (newf->max_fdset > open_files) {
+ int left = (newf->max_fdset-open_files)/8;
+ int start = open_files / (8 * sizeof(unsigned long));
+
+ memset(&newf->open_fds->fds_bits[start], 0, left);
+ memset(&newf->close_on_exec->fds_bits[start], 0, left);
+ }
+
+ tsk->files = newf;
+ error = 0;
+out:
+ return error;
+
+out_release:
+ free_fdset (newf->close_on_exec, newf->max_fdset);
+ free_fdset (newf->open_fds, newf->max_fdset);
+ kmem_cache_free(files_cachep, newf);
+ goto out;
+}
+
+static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
+{
+ struct signal_struct *sig;
+
+ if (clone_flags & CLONE_SIGHAND) {
+ atomic_inc(&current->sig->count);
+ return 0;
+ }
+ sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
+ tsk->sig = sig;
+ if (!sig)
+ return -1;
+ spin_lock_init(&sig->siglock);
+ atomic_set(&sig->count, 1);
+ memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
+ return 0;
+}
+
+static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
+{
+ unsigned long new_flags = p->flags;
+
+ new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
+ new_flags |= PF_FORKNOEXEC;
+ if (!(clone_flags & CLONE_PTRACE))
+ p->ptrace = 0;
+ if (clone_flags & CLONE_VFORK)
+ new_flags |= PF_VFORK;
+ p->flags = new_flags;
+}
+
+/*
+ * Ok, this is the main fork-routine. It copies the system process
+ * information (task[nr]) and sets up the necessary registers. It also
+ * copies the data segment in its entirety. The "stack_start" and
+ * "stack_top" arguments are simply passed along to the platform
+ * specific copy_thread() routine. Most platforms ignore stack_top.
+ * For an example that uses stack_top, see
+ * arch/ia64/kernel/process.c.
+ */
+int do_fork(unsigned long clone_flags, unsigned long stack_start,
+ struct pt_regs *regs, unsigned long stack_size)
+{
+ int retval = -ENOMEM;
+ struct task_struct *p;
+ DECLARE_MUTEX_LOCKED(sem);
+
+ if (clone_flags & CLONE_PID) {
+ /* This is only allowed from the boot up thread */
+ if (current->pid)
+ return -EPERM;
+ }
+
+ current->vfork_sem = &sem;
+
+ p = alloc_task_struct();
+ if (!p)
+ goto fork_out;
+
+ *p = *current;
+
+ retval = -EAGAIN;
+ if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
+ goto bad_fork_free;
+ atomic_inc(&p->user->__count);
+ atomic_inc(&p->user->processes);
+
+ /*
+ * Counter increases are protected by
+ * the kernel lock so nr_threads can't
+ * increase under us (but it may decrease).
+ */
+ if (nr_threads >= max_threads)
+ goto bad_fork_cleanup_count;
+
+ get_exec_domain(p->exec_domain);
+
+ if (p->binfmt && p->binfmt->module)
+ __MOD_INC_USE_COUNT(p->binfmt->module);
+
+ p->did_exec = 0;
+ p->swappable = 0;
+ p->state = TASK_UNINTERRUPTIBLE;
+
+ copy_flags(clone_flags, p);
+ p->pid = get_pid(clone_flags);
+
+ p->run_list.next = NULL;
+ p->run_list.prev = NULL;
+
+ if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
+ p->p_opptr = current;
+ if (!(p->ptrace & PT_PTRACED))
+ p->p_pptr = current;
+ }
+ p->p_cptr = NULL;
+ init_waitqueue_head(&p->wait_chldexit);
+ p->vfork_sem = NULL;
+ spin_lock_init(&p->alloc_lock);
+
+ p->sigpending = 0;
+ init_sigpending(&p->pending);
+
+ p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
+ p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
+ init_timer(&p->real_timer);
+ p->real_timer.data = (unsigned long) p;
+
+ p->leader = 0; /* session leadership doesn't inherit */
+ p->tty_old_pgrp = 0;
+ p->times.tms_utime = p->times.tms_stime = 0;
+ p->times.tms_cutime = p->times.tms_cstime = 0;
+#ifdef CONFIG_SMP
+ {
+ int i;
+ p->has_cpu = 0;
+ p->processor = current->processor;
+ /* ?? should we just memset this ?? */
+ for(i = 0; i < smp_num_cpus; i++)
+ p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
+ spin_lock_init(&p->sigmask_lock);
+ }
+#endif
+ p->lock_depth = -1; /* -1 = no lock */
+ p->start_time = jiffies;
+
+ retval = -ENOMEM;
+ /* copy all the process information */
+ if (copy_files(clone_flags, p))
+ goto bad_fork_cleanup;
+ if (copy_fs(clone_flags, p))
+ goto bad_fork_cleanup_files;
+ if (copy_sighand(clone_flags, p))
+ goto bad_fork_cleanup_fs;
+ if (copy_mm(clone_flags, p))
+ goto bad_fork_cleanup_sighand;
+ retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+ if (retval)
+ goto bad_fork_cleanup_sighand;
+ p->semundo = NULL;
+
+ /* Our parent execution domain becomes current domain.
+    These must match for thread signalling to apply. */
+
+ p->parent_exec_id = p->self_exec_id;
+
+ /* ok, now we should be set up.. */
+ p->swappable = 1;
+ p->exit_signal = clone_flags & CSIGNAL;
+ p->pdeath_signal = 0;
+
+ /*
+ * "share" dynamic priority between parent and child, thus the
+ * total amount of dynamic priorities in the system doesn't change;
+ * this gives more scheduling fairness. It is only important in the first
+ * timeslice; in the long run the scheduling behaviour is unchanged.
+ */
+ p->counter = (current->counter + 1) >> 1;
+ current->counter >>= 1;
+ if (!current->counter)
+ current->need_resched = 1;
+
+ /*
+ * Ok, add it to the run-queues and make it
+ * visible to the rest of the system.
+ *
+ * Let it rip!
+ */
+ retval = p->pid;
+ p->tgid = retval;
+ INIT_LIST_HEAD(&p->thread_group);
+ write_lock_irq(&tasklist_lock);
+ if (clone_flags & CLONE_THREAD) {
+ p->tgid = current->tgid;
+ list_add(&p->thread_group, &current->thread_group);
+ }
+ SET_LINKS(p);
+ hash_pid(p);
+ nr_threads++;
+ write_unlock_irq(&tasklist_lock);
+
+ if (p->ptrace & PT_PTRACED)
+ send_sig(SIGSTOP, p, 1);
+
+ wake_up_process(p); /* do this last */
+ ++total_forks;
+
+fork_out:
+ if ((clone_flags & CLONE_VFORK) && (retval > 0))
+ down(&sem);
+ return retval;
+
+bad_fork_cleanup_sighand:
+ exit_sighand(p);
+bad_fork_cleanup_fs:
+ exit_fs(p); /* blocking */
+bad_fork_cleanup_files:
+ exit_files(p); /* blocking */
+bad_fork_cleanup:
+ put_exec_domain(p->exec_domain);
+ if (p->binfmt && p->binfmt->module)
+ __MOD_DEC_USE_COUNT(p->binfmt->module);
+bad_fork_cleanup_count:
+ atomic_dec(&p->user->processes);
+ free_uid(p->user);
+bad_fork_free:
+ free_task_struct(p);
+ goto fork_out;
+}
+
+/* SLAB cache for signal_struct structures (tsk->sig) */
+kmem_cache_t *sigact_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+kmem_cache_t *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+kmem_cache_t *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+kmem_cache_t *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+kmem_cache_t *mm_cachep;
+
+void __init proc_caches_init(void)
+{
+ sigact_cachep = kmem_cache_create("signal_act",
+ sizeof(struct signal_struct), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!sigact_cachep)
+ panic("Cannot create signal action SLAB cache");
+
+ files_cachep = kmem_cache_create("files_cache",
+ sizeof(struct files_struct), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!files_cachep)
+ panic("Cannot create files SLAB cache");
+
+ fs_cachep = kmem_cache_create("fs_cache",
+ sizeof(struct fs_struct), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!fs_cachep)
+ panic("Cannot create fs_struct SLAB cache");
+
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if(!vm_area_cachep)
+ panic("vma_init: Cannot alloc vm_area_struct SLAB cache");
+
+ mm_cachep = kmem_cache_create("mm_struct",
+ sizeof(struct mm_struct), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if(!mm_cachep)
+ panic("vma_init: Cannot alloc mm_struct SLAB cache");
+}
diff --git a/kernel/info.c b/kernel/info.c
new file mode 100644
index 000000000000..d7abf6713384
--- /dev/null
+++ b/kernel/info.c
@@ -0,0 +1,74 @@
+/*
+ * linux/kernel/info.c
+ *
+ * Copyright (C) 1992 Darren Senn
+ */
+
+/* This implements the sysinfo() system call */
+
+#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/swap.h>
+#include <linux/smp_lock.h>
+
+#include <asm/uaccess.h>
+
+asmlinkage long sys_sysinfo(struct sysinfo *info)
+{
+ struct sysinfo val;
+
+ memset((char *)&val, 0, sizeof(struct sysinfo));
+
+ cli();
+ val.uptime = jiffies / HZ;
+
+ val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+ val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+ val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+
+ val.procs = nr_threads-1;
+ sti();
+
+ si_meminfo(&val);
+ si_swapinfo(&val);
+
+ {
+ /* If the sum of all the available memory (i.e. ram + swap +
+ * highmem) is less than can be stored in a 32 bit unsigned long
+ * then we can be binary compatible with 2.2.x kernels. If not,
+ * well, who cares since in that case 2.2.x was broken anyways...
+ *
+ * -Erik Andersen <andersee@debian.org> */
+
+ unsigned long mem_total = val.totalram + val.totalswap;
+ if ( !(mem_total < val.totalram || mem_total < val.totalswap)) {
+ unsigned long mem_total2 = mem_total + val.totalhigh;
+ if (!(mem_total2 < mem_total || mem_total2 < val.totalhigh))
+ {
+ /* If mem_total did not overflow, divide all memory values by
+ * mem_unit and set mem_unit=1. This leaves things compatible with
+ * 2.2.x, and also retains compatibility with earlier 2.4.x
+ * kernels... */
+
+ int bitcount = 0;
+ while (val.mem_unit > 1)
+ {
+ bitcount++;
+ val.mem_unit >>= 1;
+ }
+ val.totalram <<= bitcount;
+ val.freeram <<= bitcount;
+ val.sharedram <<= bitcount;
+ val.bufferram <<= bitcount;
+ val.totalswap <<= bitcount;
+ val.freeswap <<= bitcount;
+ val.totalhigh <<= bitcount;
+ val.freehigh <<= bitcount;
+ }
+ }
+ }
+
+ if (copy_to_user(info, &val, sizeof(struct sysinfo)))
+ return -EFAULT;
+ return 0;
+}
diff --git a/kernel/itimer.c b/kernel/itimer.c
new file mode 100644
index 000000000000..79d58220c590
--- /dev/null
+++ b/kernel/itimer.c
@@ -0,0 +1,170 @@
+/*
+ * linux/kernel/itimer.c
+ *
+ * Copyright (C) 1992 Darren Senn
+ */
+
+/* These are all the functions necessary to implement itimers */
+
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * change timeval to jiffies, trying to avoid the
+ * most obvious overflows..
+ *
+ * The tv_*sec values are signed, but nothing seems to
+ * indicate whether we really should use them as signed values
+ * when doing itimers. POSIX doesn't mention this (but if
+ * alarm() uses itimers without checking, we have to use unsigned
+ * arithmetic).
+ */
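+/*
+ * Worked example (illustrative; assumes HZ = 100): a timeval of
+ * { tv_sec = 1, tv_usec = 5000 } becomes 100 + 1 = 101 jiffies,
+ * the usec part being rounded up to a whole tick.
+ */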
+static unsigned long tvtojiffies(struct timeval *value)
+{
+ unsigned long sec = (unsigned) value->tv_sec;
+ unsigned long usec = (unsigned) value->tv_usec;
+
+ if (sec > (ULONG_MAX / HZ))
+ return ULONG_MAX;
+ usec += 1000000 / HZ - 1;
+ usec /= 1000000 / HZ;
+ return HZ*sec+usec;
+}
+
+static void jiffiestotv(unsigned long jiffies, struct timeval *value)
+{
+ value->tv_usec = (jiffies % HZ) * (1000000 / HZ);
+ value->tv_sec = jiffies / HZ;
+}
+
+int do_getitimer(int which, struct itimerval *value)
+{
+ register unsigned long val, interval;
+
+ switch (which) {
+ case ITIMER_REAL:
+ interval = current->it_real_incr;
+ val = 0;
+ /*
+ * FIXME! This needs to be atomic, in case the kernel timer happens!
+ */
+ if (timer_pending(&current->real_timer)) {
+ val = current->real_timer.expires - jiffies;
+
+ /* look out for negative/zero itimer.. */
+ if ((long) val <= 0)
+ val = 1;
+ }
+ break;
+ case ITIMER_VIRTUAL:
+ val = current->it_virt_value;
+ interval = current->it_virt_incr;
+ break;
+ case ITIMER_PROF:
+ val = current->it_prof_value;
+ interval = current->it_prof_incr;
+ break;
+ default:
+ return(-EINVAL);
+ }
+ jiffiestotv(val, &value->it_value);
+ jiffiestotv(interval, &value->it_interval);
+ return 0;
+}
+
+/* SMP: Only we modify our itimer values. */
+asmlinkage long sys_getitimer(int which, struct itimerval *value)
+{
+ int error = -EFAULT;
+ struct itimerval get_buffer;
+
+ if (value) {
+ error = do_getitimer(which, &get_buffer);
+ if (!error &&
+ copy_to_user(value, &get_buffer, sizeof(get_buffer)))
+ error = -EFAULT;
+ }
+ return error;
+}
+
+void it_real_fn(unsigned long __data)
+{
+ struct task_struct * p = (struct task_struct *) __data;
+ unsigned long interval;
+
+ send_sig(SIGALRM, p, 1);
+ interval = p->it_real_incr;
+ if (interval) {
+ if (interval > (unsigned long) LONG_MAX)
+ interval = LONG_MAX;
+ p->real_timer.expires = jiffies + interval;
+ add_timer(&p->real_timer);
+ }
+}
+
+int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
+{
+ register unsigned long i, j;
+ int k;
+
+ i = tvtojiffies(&value->it_interval);
+ j = tvtojiffies(&value->it_value);
+ if (ovalue && (k = do_getitimer(which, ovalue)) < 0)
+ return k;
+ switch (which) {
+ case ITIMER_REAL:
+ del_timer_sync(&current->real_timer);
+ current->it_real_value = j;
+ current->it_real_incr = i;
+ if (!j)
+ break;
+ if (j > (unsigned long) LONG_MAX)
+ j = LONG_MAX;
+ i = j + jiffies;
+ current->real_timer.expires = i;
+ add_timer(&current->real_timer);
+ break;
+ case ITIMER_VIRTUAL:
+ if (j)
+ j++;
+ current->it_virt_value = j;
+ current->it_virt_incr = i;
+ break;
+ case ITIMER_PROF:
+ if (j)
+ j++;
+ current->it_prof_value = j;
+ current->it_prof_incr = i;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* SMP: Again, only we play with our itimers, and signals are SMP safe
+ * now so that is not an issue at all anymore.
+ */
+asmlinkage long sys_setitimer(int which, struct itimerval *value,
+ struct itimerval *ovalue)
+{
+ struct itimerval set_buffer, get_buffer;
+ int error;
+
+ if (value) {
+ if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
+ return -EFAULT;
+ } else
+ memset((char *) &set_buffer, 0, sizeof(set_buffer));
+
+ error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : 0);
+ if (error || !ovalue)
+ return error;
+
+ if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
+ return -EFAULT;
+ return 0;
+}
diff --git a/kernel/kmod.c b/kernel/kmod.c
new file mode 100644
index 000000000000..ac840a901f1a
--- /dev/null
+++ b/kernel/kmod.c
@@ -0,0 +1,373 @@
+/*
+ kmod, the new module loader (replaces kerneld)
+ Kirk Petersen
+
+ Reorganized not to be a daemon by Adam Richter, with guidance
+ from Greg Zornetzer.
+
+ Modified to avoid chroot and file sharing problems.
+ Mikael Pettersson
+
+ Limit the concurrent number of kmod modprobes to catch loops from
+ "modprobe needs a service that is in a module".
+ Keith Owens <kaos@ocs.com.au> December 1999
+
+ Unblock all signals when we exec a usermode process.
+ Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
+*/
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/smp_lock.h>
+
+#include <asm/uaccess.h>
+
+extern int max_threads;
+
+static inline void
+use_init_fs_context(void)
+{
+ struct fs_struct *our_fs, *init_fs;
+ struct dentry *root, *pwd;
+ struct vfsmount *rootmnt, *pwdmnt;
+
+ /*
+ * Make modprobe's fs context be a copy of init's.
+ *
+ * We cannot use the user's fs context, because it
+ * may have a different root than init.
+ * Since init was created with CLONE_FS, we can grab
+ * its fs context from "init_task".
+ *
+ * The fs context has to be a copy. If it is shared
+ * with init, then any chdir() call in modprobe will
+ * also affect init and the other threads sharing
+ * init_task's fs context.
+ *
+ * We created the exec_modprobe thread without CLONE_FS,
+ * so we can update the fields in our fs context freely.
+ */
+
+ init_fs = init_task.fs;
+ read_lock(&init_fs->lock);
+ rootmnt = mntget(init_fs->rootmnt);
+ root = dget(init_fs->root);
+ pwdmnt = mntget(init_fs->pwdmnt);
+ pwd = dget(init_fs->pwd);
+ read_unlock(&init_fs->lock);
+
+ /* FIXME - unsafe ->fs access */
+ our_fs = current->fs;
+ our_fs->umask = init_fs->umask;
+ set_fs_root(our_fs, rootmnt, root);
+ set_fs_pwd(our_fs, pwdmnt, pwd);
+ write_lock(&our_fs->lock);
+ if (our_fs->altroot) {
+ struct vfsmount *mnt = our_fs->altrootmnt;
+ struct dentry *dentry = our_fs->altroot;
+ our_fs->altrootmnt = NULL;
+ our_fs->altroot = NULL;
+ write_unlock(&our_fs->lock);
+ dput(dentry);
+ mntput(mnt);
+ } else
+ write_unlock(&our_fs->lock);
+ dput(root);
+ mntput(rootmnt);
+ dput(pwd);
+ mntput(pwdmnt);
+}
+
+int exec_usermodehelper(char *program_path, char *argv[], char *envp[])
+{
+ int i;
+ struct task_struct *curtask = current;
+
+ curtask->session = 1;
+ curtask->pgrp = 1;
+
+ use_init_fs_context();
+
+ /* Prevent parent user process from sending signals to child.
+ Otherwise, if the modprobe program does not exist, it might
+ be possible to get a user defined signal handler to execute
+ as the super user right after the execve fails if you time
+ the signal just right.
+ */
+ spin_lock_irq(&curtask->sigmask_lock);
+ sigemptyset(&curtask->blocked);
+ flush_signals(curtask);
+ flush_signal_handlers(curtask);
+ recalc_sigpending(curtask);
+ spin_unlock_irq(&curtask->sigmask_lock);
+
+ for (i = 0; i < curtask->files->max_fds; i++ ) {
+ if (curtask->files->fd[i]) close(i);
+ }
+
+ /* Drop the "current user" thing */
+ {
+ struct user_struct *user = curtask->user;
+ curtask->user = INIT_USER;
+ atomic_inc(&INIT_USER->__count);
+ atomic_inc(&INIT_USER->processes);
+ atomic_dec(&user->processes);
+ free_uid(user);
+ }
+
+ /* Give kmod all effective privileges.. */
+ curtask->euid = curtask->fsuid = 0;
+ curtask->egid = curtask->fsgid = 0;
+ cap_set_full(curtask->cap_effective);
+
+ /* Allow execve args to be in kernel space. */
+ set_fs(KERNEL_DS);
+
+ /* Go, go, go... */
+ if (execve(program_path, argv, envp) < 0)
+ return -errno;
+ return 0;
+}
+
+#ifdef CONFIG_KMOD
+
+/*
+ modprobe_path is set via /proc/sys.
+*/
+char modprobe_path[256] = "/sbin/modprobe";
+
+static int exec_modprobe(void * module_name)
+{
+ static char * envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
+ char *argv[] = { modprobe_path, "-s", "-k", "--", (char*)module_name, NULL };
+ int ret;
+
+ ret = exec_usermodehelper(modprobe_path, argv, envp);
+ if (ret) {
+ printk(KERN_ERR
+ "kmod: failed to exec %s -s -k %s, errno = %d\n",
+ modprobe_path, (char*) module_name, errno);
+ }
+ return ret;
+}
+
+/**
+ * request_module - try to load a kernel module
+ * @module_name: Name of module
+ *
+ * Load a module using the user mode module loader. The function returns
+ * zero on success or a negative errno code on failure. Note that a
+ * successful module load does not mean the module did not then unload
+ * and exit on an error of its own. Callers must check that the service
+ * they requested is now available, not blindly invoke it.
+ *
+ * If module auto-loading support is disabled then this function
+ * becomes a no-operation.
+ */
+
+int request_module(const char * module_name)
+{
+ pid_t pid;
+ int waitpid_result;
+ sigset_t tmpsig;
+ int i;
+ static atomic_t kmod_concurrent = ATOMIC_INIT(0);
+#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
+ static int kmod_loop_msg;
+
+ /* Don't allow request_module() before the root fs is mounted! */
+ if ( ! current->fs->root ) {
+ printk(KERN_ERR "request_module[%s]: Root fs not mounted\n",
+ module_name);
+ return -EPERM;
+ }
+
+ /* If modprobe needs a service that is in a module, we get a recursive
+ * loop. Limit the number of running kmod threads to max_threads/2 or
+ * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
+ * would be to run the parents of this process, counting how many times
+ * kmod was invoked. That would mean accessing the internals of the
+ * process tables to get the command line; proc_pid_cmdline is static
+ * and it is not worth changing the proc code just to handle this case.
+ * KAO.
+ */
+ i = max_threads/2;
+ if (i > MAX_KMOD_CONCURRENT)
+ i = MAX_KMOD_CONCURRENT;
+ atomic_inc(&kmod_concurrent);
+ if (atomic_read(&kmod_concurrent) > i) {
+ if (kmod_loop_msg++ < 5)
+ printk(KERN_ERR
+ "kmod: runaway modprobe loop assumed and stopped\n");
+ atomic_dec(&kmod_concurrent);
+ return -ENOMEM;
+ }
+
+ pid = kernel_thread(exec_modprobe, (void*) module_name, 0);
+ if (pid < 0) {
+ printk(KERN_ERR "request_module[%s]: fork failed, errno %d\n", module_name, -pid);
+ atomic_dec(&kmod_concurrent);
+ return pid;
+ }
+
+ /* Block everything but SIGKILL/SIGSTOP */
+ spin_lock_irq(&current->sigmask_lock);
+ tmpsig = current->blocked;
+ siginitsetinv(&current->blocked, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+
+ waitpid_result = waitpid(pid, NULL, __WCLONE);
+ atomic_dec(&kmod_concurrent);
+
+ /* Allow signals again.. */
+ spin_lock_irq(&current->sigmask_lock);
+ current->blocked = tmpsig;
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+
+ if (waitpid_result != pid) {
+ printk(KERN_ERR "request_module[%s]: waitpid(%d,...) failed, errno %d\n",
+ module_name, pid, -waitpid_result);
+ }
+ return 0;
+}
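+
+/*
+ * Illustrative sketch, not part of this changeset: as the comment above
+ * request_module() says, callers must re-check for the service after the
+ * call returns.  "foo" and the foo_lookup_device() helper are hypothetical.
+ */
+#if 0
+static struct foo_device *foo_get_device(int minor)
+{
+	struct foo_device *dev = foo_lookup_device(minor);
+
+	if (!dev) {
+		request_module("foo");		/* may legitimately fail */
+		dev = foo_lookup_device(minor);	/* re-check, never assume */
+	}
+	return dev;				/* still NULL if no driver appeared */
+}
+#endif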
+#endif /* CONFIG_KMOD */
+
+
+#ifdef CONFIG_HOTPLUG
+/*
+ hotplug path is set via /proc/sys
+ invoked by hotplug-aware bus drivers,
+ with exec_usermodehelper and some thread-spawner
+
+ argv [0] = hotplug_path;
+ argv [1] = "usb", "scsi", "pci", "network", etc;
+ ... plus optional type-specific parameters
+ argv [n] = 0;
+
+ envp [*] = HOME, PATH; optional type-specific parameters
+
+ a hotplug bus should invoke this for device add/remove
+ events. the command is expected to load drivers when
+ necessary, and may perform additional system setup.
+*/
+char hotplug_path[256] = "/sbin/hotplug";
+
+EXPORT_SYMBOL(hotplug_path);
+
+#endif /* CONFIG_HOTPLUG */
+
+struct subprocess_info {
+ struct semaphore *sem;
+ char *path;
+ char **argv;
+ char **envp;
+ pid_t retval;
+};
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int ____call_usermodehelper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ int retval;
+
+ retval = -EPERM;
+ if (current->fs->root)
+ retval = exec_usermodehelper(sub_info->path, sub_info->argv, sub_info->envp);
+
+ /* Exec failed? */
+ sub_info->retval = (pid_t)retval;
+ do_exit(0);
+}
+
+/*
+ * This is run by keventd.
+ */
+static void __call_usermodehelper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ pid_t pid;
+
+ /*
+ * CLONE_VFORK: wait until the usermode helper has execve'd successfully
+ * We need the data structures to stay around until that is done.
+ */
+ pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD);
+ if (pid < 0)
+ sub_info->retval = pid;
+ up(sub_info->sem);
+}
+
+/**
+ * call_usermodehelper - start a usermode application
+ * @path: pathname for the application
+ * @argv: null-terminated argument list
+ * @envp: null-terminated environment list
+ *
+ * Runs a user-space application. The application is started asynchronously;
+ * it runs as a child of keventd with full root capabilities, and keventd
+ * silently reaps the child when it exits.
+ *
+ * Must be called from process context. Returns zero on success, else a negative
+ * error code.
+ */
+int call_usermodehelper(char *path, char **argv, char **envp)
+{
+ DECLARE_MUTEX_LOCKED(sem);
+ struct subprocess_info sub_info = {
+ sem: &sem,
+ path: path,
+ argv: argv,
+ envp: envp,
+ retval: 0,
+ };
+ struct tq_struct tqs = {
+ routine: __call_usermodehelper,
+ data: &sub_info,
+ };
+
+ if (path[0] == '\0')
+ goto out;
+
+ if (current_is_keventd()) {
+ /* We can't wait on keventd! */
+ __call_usermodehelper(&sub_info);
+ } else {
+ schedule_task(&tqs);
+ down(&sem); /* Wait until keventd has started the subprocess */
+ }
+out:
+ return sub_info.retval;
+}
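+
+/*
+ * Illustrative sketch, not part of this changeset: invoking an agent in
+ * the style of the hotplug convention described earlier in this file.
+ * The "block" event name is made up for the example.
+ */
+#if 0
+static int example_run_agent(void)
+{
+	char *argv[3], *envp[3];
+
+	argv[0] = hotplug_path;
+	argv[1] = "block";	/* bus/class name, per the hotplug convention */
+	argv[2] = NULL;
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	/* zero on success, else a negative error code (see kerneldoc above) */
+	return call_usermodehelper(argv[0], argv, envp);
+}
+#endif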
+
+/*
+ * This is for the serialisation of device probe() functions
+ * against device open() functions
+ */
+static DECLARE_MUTEX(dev_probe_sem);
+
+void dev_probe_lock(void)
+{
+ down(&dev_probe_sem);
+}
+
+void dev_probe_unlock(void)
+{
+ up(&dev_probe_sem);
+}
+
+EXPORT_SYMBOL(exec_usermodehelper);
+EXPORT_SYMBOL(call_usermodehelper);
+
+#ifdef CONFIG_KMOD
+EXPORT_SYMBOL(request_module);
+#endif
+
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
new file mode 100644
index 000000000000..8afe07cca5fb
--- /dev/null
+++ b/kernel/ksyms.c
@@ -0,0 +1,538 @@
+/*
+ * Herein lie all the functions/variables that are "exported" for linkage
+ * with dynamically loaded kernel modules.
+ * Jon.
+ *
+ * - Stacked module support and unified symbol table added (June 1994)
+ * - External symbol table support added (December 1994)
+ * - Versions on symbols added (December 1994)
+ * by Bjorn Ekwall <bj0rn@blox.se>
+ */
+
+#include <linux/config.h>
+#include <linux/malloc.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/kernel_stat.h>
+#include <linux/vmalloc.h>
+#include <linux/sys.h>
+#include <linux/utsname.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/serial.h>
+#include <linux/locks.h>
+#include <linux/delay.h>
+#include <linux/minix_fs.h>
+#include <linux/ext2_fs.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/pagemap.h>
+#include <linux/sysctl.h>
+#include <linux/hdreg.h>
+#include <linux/skbuff.h>
+#include <linux/genhd.h>
+#include <linux/blkpg.h>
+#include <linux/swap.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/iobuf.h>
+#include <linux/console.h>
+#include <linux/poll.h>
+#include <linux/mmzone.h>
+#include <linux/mm.h>
+#include <linux/capability.h>
+#include <linux/highuid.h>
+#include <linux/brlock.h>
+#include <linux/fs.h>
+
+#if defined(CONFIG_PROC_FS)
+#include <linux/proc_fs.h>
+#endif
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+extern void set_device_ro(kdev_t dev,int flag);
+
+extern void *sys_call_table;
+
+extern int sys_tz;
+extern int request_dma(unsigned int dmanr, char * deviceID);
+extern void free_dma(unsigned int dmanr);
+extern spinlock_t dma_spin_lock;
+
+#ifdef CONFIG_MODVERSIONS
+const struct module_symbol __export_Using_Versions
+__attribute__((section("__ksymtab"))) = {
+ 1 /* Version version */, "Using_Versions"
+};
+#endif
+
+
+EXPORT_SYMBOL(inter_module_register);
+EXPORT_SYMBOL(inter_module_unregister);
+EXPORT_SYMBOL(inter_module_get);
+EXPORT_SYMBOL(inter_module_get_request);
+EXPORT_SYMBOL(inter_module_put);
+EXPORT_SYMBOL(try_inc_mod_count);
+
+/* process memory management */
+EXPORT_SYMBOL(do_mmap_pgoff);
+EXPORT_SYMBOL(do_munmap);
+EXPORT_SYMBOL(do_brk);
+EXPORT_SYMBOL(exit_mm);
+EXPORT_SYMBOL(exit_files);
+EXPORT_SYMBOL(exit_fs);
+EXPORT_SYMBOL(exit_sighand);
+
+/* internal kernel memory management */
+EXPORT_SYMBOL(__alloc_pages);
+EXPORT_SYMBOL(alloc_pages_node);
+EXPORT_SYMBOL(__get_free_pages);
+EXPORT_SYMBOL(get_zeroed_page);
+EXPORT_SYMBOL(__free_pages);
+EXPORT_SYMBOL(free_pages);
+#ifndef CONFIG_DISCONTIGMEM
+EXPORT_SYMBOL(contig_page_data);
+#endif
+EXPORT_SYMBOL(num_physpages);
+EXPORT_SYMBOL(kmem_find_general_cachep);
+EXPORT_SYMBOL(kmem_cache_create);
+EXPORT_SYMBOL(kmem_cache_destroy);
+EXPORT_SYMBOL(kmem_cache_shrink);
+EXPORT_SYMBOL(kmem_cache_alloc);
+EXPORT_SYMBOL(kmem_cache_free);
+EXPORT_SYMBOL(kmalloc);
+EXPORT_SYMBOL(kfree);
+EXPORT_SYMBOL(vfree);
+EXPORT_SYMBOL(__vmalloc);
+EXPORT_SYMBOL(mem_map);
+EXPORT_SYMBOL(remap_page_range);
+EXPORT_SYMBOL(max_mapnr);
+EXPORT_SYMBOL(high_memory);
+EXPORT_SYMBOL(vmtruncate);
+EXPORT_SYMBOL(find_vma);
+EXPORT_SYMBOL(get_unmapped_area);
+EXPORT_SYMBOL(init_mm);
+EXPORT_SYMBOL(deactivate_page);
+#ifdef CONFIG_HIGHMEM
+EXPORT_SYMBOL(kmap_high);
+EXPORT_SYMBOL(kunmap_high);
+EXPORT_SYMBOL(highmem_start_page);
+#endif
+
+/* filesystem internal functions */
+EXPORT_SYMBOL(def_blk_fops);
+EXPORT_SYMBOL(update_atime);
+EXPORT_SYMBOL(get_fs_type);
+EXPORT_SYMBOL(get_super);
+EXPORT_SYMBOL(get_empty_super);
+EXPORT_SYMBOL(getname);
+EXPORT_SYMBOL(names_cachep);
+EXPORT_SYMBOL(fput);
+EXPORT_SYMBOL(fget);
+EXPORT_SYMBOL(igrab);
+EXPORT_SYMBOL(iunique);
+EXPORT_SYMBOL(iget4);
+EXPORT_SYMBOL(iput);
+EXPORT_SYMBOL(force_delete);
+EXPORT_SYMBOL(follow_up);
+EXPORT_SYMBOL(follow_down);
+EXPORT_SYMBOL(path_init);
+EXPORT_SYMBOL(path_walk);
+EXPORT_SYMBOL(path_release);
+EXPORT_SYMBOL(__user_walk);
+EXPORT_SYMBOL(lookup_one);
+EXPORT_SYMBOL(lookup_hash);
+EXPORT_SYMBOL(sys_close);
+EXPORT_SYMBOL(dcache_lock);
+EXPORT_SYMBOL(d_alloc_root);
+EXPORT_SYMBOL(d_delete);
+EXPORT_SYMBOL(dget_locked);
+EXPORT_SYMBOL(d_validate);
+EXPORT_SYMBOL(d_rehash);
+EXPORT_SYMBOL(d_invalidate); /* Maybe it would be better in dcache.h? */
+EXPORT_SYMBOL(d_move);
+EXPORT_SYMBOL(d_instantiate);
+EXPORT_SYMBOL(d_alloc);
+EXPORT_SYMBOL(d_lookup);
+EXPORT_SYMBOL(__d_path);
+EXPORT_SYMBOL(mark_buffer_dirty);
+EXPORT_SYMBOL(__mark_buffer_dirty);
+EXPORT_SYMBOL(__mark_inode_dirty);
+EXPORT_SYMBOL(get_empty_filp);
+EXPORT_SYMBOL(init_private_file);
+EXPORT_SYMBOL(filp_open);
+EXPORT_SYMBOL(filp_close);
+EXPORT_SYMBOL(put_filp);
+EXPORT_SYMBOL(files_lock);
+EXPORT_SYMBOL(check_disk_change);
+EXPORT_SYMBOL(__invalidate_buffers);
+EXPORT_SYMBOL(invalidate_inodes);
+EXPORT_SYMBOL(invalidate_inode_pages);
+EXPORT_SYMBOL(truncate_inode_pages);
+EXPORT_SYMBOL(fsync_dev);
+EXPORT_SYMBOL(permission);
+EXPORT_SYMBOL(vfs_permission);
+EXPORT_SYMBOL(inode_setattr);
+EXPORT_SYMBOL(inode_change_ok);
+EXPORT_SYMBOL(write_inode_now);
+EXPORT_SYMBOL(notify_change);
+EXPORT_SYMBOL(get_hardblocksize);
+EXPORT_SYMBOL(set_blocksize);
+EXPORT_SYMBOL(getblk);
+EXPORT_SYMBOL(bdget);
+EXPORT_SYMBOL(bdput);
+EXPORT_SYMBOL(bread);
+EXPORT_SYMBOL(__brelse);
+EXPORT_SYMBOL(__bforget);
+EXPORT_SYMBOL(ll_rw_block);
+EXPORT_SYMBOL(submit_bh);
+EXPORT_SYMBOL(__wait_on_buffer);
+EXPORT_SYMBOL(___wait_on_page);
+EXPORT_SYMBOL(block_write_full_page);
+EXPORT_SYMBOL(block_read_full_page);
+EXPORT_SYMBOL(block_prepare_write);
+EXPORT_SYMBOL(block_sync_page);
+EXPORT_SYMBOL(cont_prepare_write);
+EXPORT_SYMBOL(generic_commit_write);
+EXPORT_SYMBOL(block_truncate_page);
+EXPORT_SYMBOL(generic_block_bmap);
+EXPORT_SYMBOL(generic_file_read);
+EXPORT_SYMBOL(do_generic_file_read);
+EXPORT_SYMBOL(generic_file_write);
+EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_ro_fops);
+EXPORT_SYMBOL(generic_buffer_fdatasync);
+EXPORT_SYMBOL(page_hash_bits);
+EXPORT_SYMBOL(page_hash_table);
+EXPORT_SYMBOL(file_lock_list);
+EXPORT_SYMBOL(locks_init_lock);
+EXPORT_SYMBOL(locks_copy_lock);
+EXPORT_SYMBOL(posix_lock_file);
+EXPORT_SYMBOL(posix_test_lock);
+EXPORT_SYMBOL(posix_block_lock);
+EXPORT_SYMBOL(posix_unblock_lock);
+EXPORT_SYMBOL(locks_mandatory_area);
+EXPORT_SYMBOL(dput);
+EXPORT_SYMBOL(have_submounts);
+EXPORT_SYMBOL(d_find_alias);
+EXPORT_SYMBOL(d_prune_aliases);
+EXPORT_SYMBOL(prune_dcache);
+EXPORT_SYMBOL(shrink_dcache_sb);
+EXPORT_SYMBOL(shrink_dcache_parent);
+EXPORT_SYMBOL(find_inode_number);
+EXPORT_SYMBOL(is_subdir);
+EXPORT_SYMBOL(get_unused_fd);
+EXPORT_SYMBOL(vfs_create);
+EXPORT_SYMBOL(vfs_mkdir);
+EXPORT_SYMBOL(vfs_mknod);
+EXPORT_SYMBOL(vfs_symlink);
+EXPORT_SYMBOL(vfs_link);
+EXPORT_SYMBOL(vfs_rmdir);
+EXPORT_SYMBOL(vfs_unlink);
+EXPORT_SYMBOL(vfs_rename);
+EXPORT_SYMBOL(vfs_statfs);
+EXPORT_SYMBOL(generic_read_dir);
+EXPORT_SYMBOL(__pollwait);
+EXPORT_SYMBOL(poll_freewait);
+EXPORT_SYMBOL(ROOT_DEV);
+EXPORT_SYMBOL(__find_lock_page);
+EXPORT_SYMBOL(grab_cache_page);
+EXPORT_SYMBOL(read_cache_page);
+EXPORT_SYMBOL(vfs_readlink);
+EXPORT_SYMBOL(vfs_follow_link);
+EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(page_follow_link);
+EXPORT_SYMBOL(page_symlink_inode_operations);
+EXPORT_SYMBOL(block_symlink);
+EXPORT_SYMBOL(vfs_readdir);
+EXPORT_SYMBOL(__get_lease);
+EXPORT_SYMBOL(lease_get_mtime);
+EXPORT_SYMBOL(lock_may_read);
+EXPORT_SYMBOL(lock_may_write);
+EXPORT_SYMBOL(dcache_readdir);
+
+/* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
+EXPORT_SYMBOL(default_llseek);
+EXPORT_SYMBOL(dentry_open);
+EXPORT_SYMBOL(filemap_nopage);
+EXPORT_SYMBOL(filemap_sync);
+EXPORT_SYMBOL(lock_page);
+
+/* device registration */
+EXPORT_SYMBOL(register_chrdev);
+EXPORT_SYMBOL(unregister_chrdev);
+EXPORT_SYMBOL(register_blkdev);
+EXPORT_SYMBOL(unregister_blkdev);
+EXPORT_SYMBOL(tty_register_driver);
+EXPORT_SYMBOL(tty_unregister_driver);
+EXPORT_SYMBOL(tty_std_termios);
+
+/* block device driver support */
+EXPORT_SYMBOL(block_read);
+EXPORT_SYMBOL(block_write);
+EXPORT_SYMBOL(blksize_size);
+EXPORT_SYMBOL(hardsect_size);
+EXPORT_SYMBOL(blk_size);
+EXPORT_SYMBOL(blk_dev);
+EXPORT_SYMBOL(is_read_only);
+EXPORT_SYMBOL(set_device_ro);
+EXPORT_SYMBOL(bmap);
+EXPORT_SYMBOL(sync_dev);
+EXPORT_SYMBOL(devfs_register_partitions);
+EXPORT_SYMBOL(blkdev_open);
+EXPORT_SYMBOL(blkdev_get);
+EXPORT_SYMBOL(blkdev_put);
+EXPORT_SYMBOL(ioctl_by_bdev);
+EXPORT_SYMBOL(gendisk_head);
+EXPORT_SYMBOL(grok_partitions);
+EXPORT_SYMBOL(register_disk);
+EXPORT_SYMBOL(tq_disk);
+EXPORT_SYMBOL(init_buffer);
+EXPORT_SYMBOL(refile_buffer);
+EXPORT_SYMBOL(max_sectors);
+EXPORT_SYMBOL(max_readahead);
+EXPORT_SYMBOL(file_moveto);
+
+/* tty routines */
+EXPORT_SYMBOL(tty_hangup);
+EXPORT_SYMBOL(tty_wait_until_sent);
+EXPORT_SYMBOL(tty_check_change);
+EXPORT_SYMBOL(tty_hung_up_p);
+EXPORT_SYMBOL(tty_flip_buffer_push);
+EXPORT_SYMBOL(tty_get_baud_rate);
+EXPORT_SYMBOL(do_SAK);
+EXPORT_SYMBOL(console_print);
+EXPORT_SYMBOL(console_loglevel);
+
+/* filesystem registration */
+EXPORT_SYMBOL(register_filesystem);
+EXPORT_SYMBOL(unregister_filesystem);
+EXPORT_SYMBOL(kern_mount);
+EXPORT_SYMBOL(kern_umount);
+EXPORT_SYMBOL(may_umount);
+
+/* executable format registration */
+EXPORT_SYMBOL(register_binfmt);
+EXPORT_SYMBOL(unregister_binfmt);
+EXPORT_SYMBOL(search_binary_handler);
+EXPORT_SYMBOL(prepare_binprm);
+EXPORT_SYMBOL(compute_creds);
+EXPORT_SYMBOL(remove_arg_zero);
+EXPORT_SYMBOL(set_binfmt);
+
+/* execution environment registration */
+EXPORT_SYMBOL(register_exec_domain);
+EXPORT_SYMBOL(unregister_exec_domain);
+EXPORT_SYMBOL(__set_personality);
+
+/* sysctl table registration */
+EXPORT_SYMBOL(register_sysctl_table);
+EXPORT_SYMBOL(unregister_sysctl_table);
+EXPORT_SYMBOL(sysctl_string);
+EXPORT_SYMBOL(sysctl_intvec);
+EXPORT_SYMBOL(sysctl_jiffies);
+EXPORT_SYMBOL(proc_dostring);
+EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+EXPORT_SYMBOL(proc_dointvec_minmax);
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+EXPORT_SYMBOL(proc_doulongvec_minmax);
+
+/* interrupt handling */
+EXPORT_SYMBOL(add_timer);
+EXPORT_SYMBOL(del_timer);
+EXPORT_SYMBOL(request_irq);
+EXPORT_SYMBOL(free_irq);
+#if !defined(CONFIG_ARCH_S390)
+EXPORT_SYMBOL(irq_stat); /* No separate irq_stat for s390, it is part of PSA */
+#endif
+
+/* waitqueue handling */
+EXPORT_SYMBOL(add_wait_queue);
+EXPORT_SYMBOL(add_wait_queue_exclusive);
+EXPORT_SYMBOL(remove_wait_queue);
+
+/* The notion of irq probe/assignment is foreign to S/390 */
+
+#if !defined(CONFIG_ARCH_S390)
+EXPORT_SYMBOL(probe_irq_on);
+EXPORT_SYMBOL(probe_irq_off);
+#endif
+
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(del_timer_sync);
+#endif
+EXPORT_SYMBOL(mod_timer);
+EXPORT_SYMBOL(tq_timer);
+EXPORT_SYMBOL(tq_immediate);
+
+#ifdef CONFIG_SMP
+/* Various random spinlocks we want to export */
+EXPORT_SYMBOL(tqueue_lock);
+
+/* Big-Reader lock implementation */
+EXPORT_SYMBOL(__brlock_array);
+#ifndef __BRLOCK_USE_ATOMICS
+EXPORT_SYMBOL(__br_write_locks);
+#endif
+EXPORT_SYMBOL(__br_write_lock);
+EXPORT_SYMBOL(__br_write_unlock);
+#endif
+
+/* Kiobufs */
+EXPORT_SYMBOL(kiobuf_init);
+
+EXPORT_SYMBOL(alloc_kiovec);
+EXPORT_SYMBOL(free_kiovec);
+EXPORT_SYMBOL(expand_kiobuf);
+
+EXPORT_SYMBOL(map_user_kiobuf);
+EXPORT_SYMBOL(unmap_kiobuf);
+EXPORT_SYMBOL(lock_kiovec);
+EXPORT_SYMBOL(unlock_kiovec);
+EXPORT_SYMBOL(brw_kiovec);
+
+/* dma handling */
+EXPORT_SYMBOL(request_dma);
+EXPORT_SYMBOL(free_dma);
+EXPORT_SYMBOL(dma_spin_lock);
+#ifdef HAVE_DISABLE_HLT
+EXPORT_SYMBOL(disable_hlt);
+EXPORT_SYMBOL(enable_hlt);
+#endif
+
+/* resource handling */
+EXPORT_SYMBOL(request_resource);
+EXPORT_SYMBOL(release_resource);
+EXPORT_SYMBOL(allocate_resource);
+EXPORT_SYMBOL(check_resource);
+EXPORT_SYMBOL(__request_region);
+EXPORT_SYMBOL(__check_region);
+EXPORT_SYMBOL(__release_region);
+EXPORT_SYMBOL(ioport_resource);
+EXPORT_SYMBOL(iomem_resource);
+
+/* process management */
+EXPORT_SYMBOL(up_and_exit);
+EXPORT_SYMBOL(__wake_up);
+EXPORT_SYMBOL(wake_up_process);
+EXPORT_SYMBOL(sleep_on);
+EXPORT_SYMBOL(sleep_on_timeout);
+EXPORT_SYMBOL(interruptible_sleep_on);
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+EXPORT_SYMBOL(schedule);
+EXPORT_SYMBOL(schedule_timeout);
+EXPORT_SYMBOL(jiffies);
+EXPORT_SYMBOL(xtime);
+EXPORT_SYMBOL(do_gettimeofday);
+EXPORT_SYMBOL(do_settimeofday);
+
+#if !defined(__ia64__)
+EXPORT_SYMBOL(loops_per_jiffy);
+#endif
+
+EXPORT_SYMBOL(kstat);
+EXPORT_SYMBOL(nr_running);
+
+/* misc */
+EXPORT_SYMBOL(panic);
+EXPORT_SYMBOL(printk);
+EXPORT_SYMBOL(sprintf);
+EXPORT_SYMBOL(vsprintf);
+EXPORT_SYMBOL(kdevname);
+EXPORT_SYMBOL(bdevname);
+EXPORT_SYMBOL(cdevname);
+EXPORT_SYMBOL(simple_strtoul);
+EXPORT_SYMBOL(system_utsname); /* UTS data */
+EXPORT_SYMBOL(uts_sem); /* UTS semaphore */
+#ifndef __mips__
+EXPORT_SYMBOL(sys_call_table);
+#endif
+EXPORT_SYMBOL(machine_restart);
+EXPORT_SYMBOL(machine_halt);
+EXPORT_SYMBOL(machine_power_off);
+EXPORT_SYMBOL(_ctype);
+EXPORT_SYMBOL(secure_tcp_sequence_number);
+EXPORT_SYMBOL(get_random_bytes);
+EXPORT_SYMBOL(securebits);
+EXPORT_SYMBOL(cap_bset);
+EXPORT_SYMBOL(daemonize);
+
+/* Program loader interfaces */
+EXPORT_SYMBOL(setup_arg_pages);
+EXPORT_SYMBOL(copy_strings_kernel);
+EXPORT_SYMBOL(do_execve);
+EXPORT_SYMBOL(flush_old_exec);
+EXPORT_SYMBOL(kernel_read);
+EXPORT_SYMBOL(open_exec);
+
+/* Miscellaneous access points */
+EXPORT_SYMBOL(si_meminfo);
+
+/* Added to make file system as module */
+EXPORT_SYMBOL(sys_tz);
+EXPORT_SYMBOL(__wait_on_super);
+EXPORT_SYMBOL(file_fsync);
+EXPORT_SYMBOL(fsync_inode_buffers);
+EXPORT_SYMBOL(clear_inode);
+EXPORT_SYMBOL(nr_async_pages);
+EXPORT_SYMBOL(___strtok);
+EXPORT_SYMBOL(init_special_inode);
+EXPORT_SYMBOL(read_ahead);
+EXPORT_SYMBOL(get_hash_table);
+EXPORT_SYMBOL(get_empty_inode);
+EXPORT_SYMBOL(insert_inode_hash);
+EXPORT_SYMBOL(remove_inode_hash);
+EXPORT_SYMBOL(buffer_insert_inode_queue);
+EXPORT_SYMBOL(make_bad_inode);
+EXPORT_SYMBOL(is_bad_inode);
+EXPORT_SYMBOL(event);
+EXPORT_SYMBOL(brw_page);
+
+#ifdef CONFIG_UID16
+EXPORT_SYMBOL(overflowuid);
+EXPORT_SYMBOL(overflowgid);
+#endif
+EXPORT_SYMBOL(fs_overflowuid);
+EXPORT_SYMBOL(fs_overflowgid);
+
+/* all busmice */
+EXPORT_SYMBOL(fasync_helper);
+EXPORT_SYMBOL(kill_fasync);
+
+EXPORT_SYMBOL(disk_name); /* for md.c */
+
+/* binfmt_aout */
+EXPORT_SYMBOL(get_write_access);
+
+/* dynamic registering of consoles */
+EXPORT_SYMBOL(register_console);
+EXPORT_SYMBOL(unregister_console);
+
+/* time */
+EXPORT_SYMBOL(get_fast_time);
+
+/* library functions */
+EXPORT_SYMBOL(strnicmp);
+EXPORT_SYMBOL(strspn);
+EXPORT_SYMBOL(strsep);
+
+/* software interrupts */
+EXPORT_SYMBOL(tasklet_hi_vec);
+EXPORT_SYMBOL(tasklet_vec);
+EXPORT_SYMBOL(bh_task_vec);
+EXPORT_SYMBOL(init_bh);
+EXPORT_SYMBOL(remove_bh);
+EXPORT_SYMBOL(tasklet_init);
+EXPORT_SYMBOL(tasklet_kill);
+EXPORT_SYMBOL(__run_task_queue);
+
+/* init task, for moving kthread roots - ought to export a function ?? */
+
+EXPORT_SYMBOL(init_task_union);
+
+EXPORT_SYMBOL(tasklist_lock);
+EXPORT_SYMBOL(pidhash);
diff --git a/kernel/module.c b/kernel/module.c
new file mode 100644
index 000000000000..dd02b40cd891
--- /dev/null
+++ b/kernel/module.c
@@ -0,0 +1,1235 @@
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <asm/module.h>
+#include <asm/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/smp_lock.h>
+#include <asm/pgalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+
+/*
+ * Originally by Anonymous (as far as I know...)
+ * Linux version by Bas Laarhoven <bas@vimec.nl>
+ * 0.99.14 version by Jon Tombs <jon@gtex02.us.es>,
+ * Heavily modified by Bjorn Ekwall <bj0rn@blox.se> May 1994 (C)
+ * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
+ * Add MOD_INITIALIZING Keith Owens <kaos@ocs.com.au> Nov 1999
+ * Add kallsyms support, Keith Owens <kaos@ocs.com.au> Apr 2000
+ * Add asm/module support, IA64 has special requirements. Keith Owens <kaos@ocs.com.au> Sep 2000
+ * Fix assorted bugs in module verification. Keith Owens <kaos@ocs.com.au> Sep 2000
+ * Fix sys_init_module race, Andrew Morton <andrewm@uow.edu.au> Oct 2000
+ * http://www.uwsg.iu.edu/hypermail/linux/kernel/0008.3/0379.html
+ * Replace xxx_module_symbol with inter_module_xxx. Keith Owens <kaos@ocs.com.au> Oct 2000
+ *
+ * This source is covered by the GNU GPL, the same as all kernel sources.
+ */
+
+#if defined(CONFIG_MODULES) || defined(CONFIG_KALLSYMS)
+
+extern struct module_symbol __start___ksymtab[];
+extern struct module_symbol __stop___ksymtab[];
+
+extern const struct exception_table_entry __start___ex_table[];
+extern const struct exception_table_entry __stop___ex_table[];
+
+extern const char __start___kallsyms[] __attribute__ ((weak));
+extern const char __stop___kallsyms[] __attribute__ ((weak));
+
+static struct module kernel_module =
+{
+ size_of_struct: sizeof(struct module),
+ name: "",
+ uc: {ATOMIC_INIT(1)},
+ flags: MOD_RUNNING,
+ syms: __start___ksymtab,
+ ex_table_start: __start___ex_table,
+ ex_table_end: __stop___ex_table,
+ kallsyms_start: __start___kallsyms,
+ kallsyms_end: __stop___kallsyms,
+};
+
+struct module *module_list = &kernel_module;
+
+#endif /* defined(CONFIG_MODULES) || defined(CONFIG_KALLSYMS) */
+
+/* inter_module functions are always available, even when the kernel is
+ * compiled without modules. Consumers of inter_module_xxx routines
+ * will always work, even when both are built into the kernel; this
+ * approach removes lots of #ifdefs in mainline code.
+ */
+
+static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
+static spinlock_t ime_lock = SPIN_LOCK_UNLOCKED;
+static int kmalloc_failed;
+
+/**
+ * inter_module_register - register a new set of inter module data.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ * @owner: module that is registering the data, always use THIS_MODULE
+ * @userdata: pointer to arbitrary userdata to be registered
+ *
+ * Description: Check that the im_name has not already been registered,
+ * complain if it has. For new data, add it to the inter_module_entry
+ * list.
+ */
+void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
+{
+ struct list_head *tmp;
+ struct inter_module_entry *ime, *ime_new;
+
+ if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
+ /* Overloaded kernel, not fatal */
+ printk(KERN_ERR
+ "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
+ im_name);
+ kmalloc_failed = 1;
+ return;
+ }
+ memset(ime_new, 0, sizeof(*ime_new));
+ ime_new->im_name = im_name;
+ ime_new->owner = owner;
+ ime_new->userdata = userdata;
+
+ spin_lock(&ime_lock);
+ list_for_each(tmp, &ime_list) {
+ ime = list_entry(tmp, struct inter_module_entry, list);
+ if (strcmp(ime->im_name, im_name) == 0) {
+ spin_unlock(&ime_lock);
+ kfree(ime_new);
+ /* Program logic error, fatal */
+ printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
+ BUG();
+ }
+ }
+ list_add(&(ime_new->list), &ime_list);
+ spin_unlock(&ime_lock);
+}
+
+/**
+ * inter_module_unregister - unregister a set of inter module data.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: Check that the im_name has been registered, complain if
+ * it has not. For existing data, remove it from the
+ * inter_module_entry list.
+ */
+void inter_module_unregister(const char *im_name)
+{
+ struct list_head *tmp;
+ struct inter_module_entry *ime;
+
+ spin_lock(&ime_lock);
+ list_for_each(tmp, &ime_list) {
+ ime = list_entry(tmp, struct inter_module_entry, list);
+ if (strcmp(ime->im_name, im_name) == 0) {
+ list_del(&(ime->list));
+ spin_unlock(&ime_lock);
+ kfree(ime);
+ return;
+ }
+ }
+ spin_unlock(&ime_lock);
+ if (kmalloc_failed) {
+ printk(KERN_ERR
+ "inter_module_unregister: no entry for '%s', "
+ "probably caused by previous kmalloc failure\n",
+ im_name);
+ return;
+ }
+ else {
+ /* Program logic error, fatal */
+ printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
+ BUG();
+ }
+}
+
+/**
+ * inter_module_get - return arbitrary userdata from another module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: If the im_name has not been registered, return NULL.
+ * Try to increment the use count on the owning module, if that fails
+ * then return NULL. Otherwise return the userdata.
+ */
+const void *inter_module_get(const char *im_name)
+{
+ struct list_head *tmp;
+ struct inter_module_entry *ime;
+ const void *result = NULL;
+
+ spin_lock(&ime_lock);
+ list_for_each(tmp, &ime_list) {
+ ime = list_entry(tmp, struct inter_module_entry, list);
+ if (strcmp(ime->im_name, im_name) == 0) {
+ if (try_inc_mod_count(ime->owner))
+ result = ime->userdata;
+ break;
+ }
+ }
+ spin_unlock(&ime_lock);
+ return(result);
+}
+
+/**
+ * inter_module_get_request - im get with automatic request_module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ * @modname: module that is expected to register im_name
+ *
+ * Description: If inter_module_get fails, do request_module then retry.
+ */
+const void *inter_module_get_request(const char *im_name, const char *modname)
+{
+ const void *result = inter_module_get(im_name);
+ if (!result) {
+ request_module(modname);
+ result = inter_module_get(im_name);
+ }
+ return(result);
+}
+
+/**
+ * inter_module_put - release use of data from another module.
+ * @im_name: an arbitrary string to identify the data, must be unique
+ *
+ * Description: If the im_name has not been registered, complain,
+ * otherwise decrement the use count on the owning module.
+ */
+void inter_module_put(const char *im_name)
+{
+ struct list_head *tmp;
+ struct inter_module_entry *ime;
+
+ spin_lock(&ime_lock);
+ list_for_each(tmp, &ime_list) {
+ ime = list_entry(tmp, struct inter_module_entry, list);
+ if (strcmp(ime->im_name, im_name) == 0) {
+ if (ime->owner)
+ __MOD_DEC_USE_COUNT(ime->owner);
+ spin_unlock(&ime_lock);
+ return;
+ }
+ }
+ spin_unlock(&ime_lock);
+ printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
+ BUG();
+}
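+
+/*
+ * Illustrative sketch, not part of this changeset: the usual pairing of
+ * the inter_module calls.  "foo_ops", struct foo_operations and the
+ * "foo" module name are hypothetical.
+ */
+#if 0
+/* provider side, e.g. in the foo module's init function */
+static struct foo_operations foo_ops;
+
+static void example_provider_init(void)
+{
+	inter_module_register("foo_ops", THIS_MODULE, &foo_ops);
+}
+
+/* consumer side: a successful get pins the owning module until the put */
+static void example_consumer(void)
+{
+	const struct foo_operations *ops;
+
+	ops = inter_module_get_request("foo_ops", "foo");
+	if (ops) {
+		/* ... use ops ... */
+		inter_module_put("foo_ops");
+	}
+}
+#endif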
+
+
+#if defined(CONFIG_MODULES) /* The rest of the source */
+
+static long get_mod_name(const char *user_name, char **buf);
+static void put_mod_name(char *buf);
+struct module *find_module(const char *name);
+void free_module(struct module *, int tag_freed);
+
+
+/*
+ * Called at boot time
+ */
+
+void __init init_modules(void)
+{
+ kernel_module.nsyms = __stop___ksymtab - __start___ksymtab;
+
+#ifdef __alpha__
+ __asm__("stq $29,%0" : "=m"(kernel_module.gp));
+#endif
+}
+
+/*
+ * Copy the name of a module from user space.
+ */
+
+static inline long
+get_mod_name(const char *user_name, char **buf)
+{
+ unsigned long page;
+ long retval;
+
+ page = __get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ retval = strncpy_from_user((char *)page, user_name, PAGE_SIZE);
+ if (retval > 0) {
+ if (retval < PAGE_SIZE) {
+ *buf = (char *)page;
+ return retval;
+ }
+ retval = -ENAMETOOLONG;
+ } else if (!retval)
+ retval = -EINVAL;
+
+ free_page(page);
+ return retval;
+}
+
+static inline void
+put_mod_name(char *buf)
+{
+ free_page((unsigned long)buf);
+}
+
+/*
+ * Allocate space for a module.
+ */
+
+asmlinkage unsigned long
+sys_create_module(const char *name_user, size_t size)
+{
+ char *name;
+ long namelen, error;
+ struct module *mod;
+
+ if (!capable(CAP_SYS_MODULE))
+ return -EPERM;
+ lock_kernel();
+ if ((namelen = get_mod_name(name_user, &name)) < 0) {
+ error = namelen;
+ goto err0;
+ }
+ if (size < sizeof(struct module)+namelen) {
+ error = -EINVAL;
+ goto err1;
+ }
+ if (find_module(name) != NULL) {
+ error = -EEXIST;
+ goto err1;
+ }
+ if ((mod = (struct module *)module_map(size)) == NULL) {
+ error = -ENOMEM;
+ goto err1;
+ }
+
+ memset(mod, 0, sizeof(*mod));
+ mod->size_of_struct = sizeof(*mod);
+ mod->next = module_list;
+ mod->name = (char *)(mod + 1);
+ mod->size = size;
+ memcpy((char*)(mod+1), name, namelen+1);
+
+ put_mod_name(name);
+
+ module_list = mod; /* link it in */
+
+ error = (long) mod;
+ goto err0;
+err1:
+ put_mod_name(name);
+err0:
+ unlock_kernel();
+ return error;
+}
+
+/*
+ * Initialize a module.
+ */
+
+asmlinkage long
+sys_init_module(const char *name_user, struct module *mod_user)
+{
+ struct module mod_tmp, *mod;
+ char *name, *n_name, *name_tmp = NULL;
+ long namelen, n_namelen, i, error;
+ unsigned long mod_user_size;
+ struct module_ref *dep;
+
+ if (!capable(CAP_SYS_MODULE))
+ return -EPERM;
+ lock_kernel();
+ if ((namelen = get_mod_name(name_user, &name)) < 0) {
+ error = namelen;
+ goto err0;
+ }
+ if ((mod = find_module(name)) == NULL) {
+ error = -ENOENT;
+ goto err1;
+ }
+
+ /* Check module header size. We allow a bit of slop over the
+ size we are familiar with to cope with a version of insmod
+ for a newer kernel. But don't overdo it. */
+ if ((error = get_user(mod_user_size, &mod_user->size_of_struct)) != 0)
+ goto err1;
+ if (mod_user_size < (unsigned long)&((struct module *)0L)->persist_start
+ || mod_user_size > sizeof(struct module) + 16*sizeof(void*)) {
+ printk(KERN_ERR "init_module: Invalid module header size.\n"
+ KERN_ERR "A new version of the modutils is likely "
+ "needed.\n");
+ error = -EINVAL;
+ goto err1;
+ }
+
+ /* Hold the current contents while we play with the user's idea
+ of righteousness. */
+ mod_tmp = *mod;
+ name_tmp = kmalloc(strlen(mod->name) + 1, GFP_KERNEL); /* Where's kstrdup()? */
+ if (name_tmp == NULL) {
+ error = -ENOMEM;
+ goto err1;
+ }
+ strcpy(name_tmp, mod->name);
+
+ error = copy_from_user(mod, mod_user, mod_user_size);
+ if (error) {
+ error = -EFAULT;
+ goto err2;
+ }
+
+ /* Sanity check the size of the module. */
+ error = -EINVAL;
+
+ if (mod->size > mod_tmp.size) {
+ printk(KERN_ERR "init_module: Size of initialized module "
+ "exceeds size of created module.\n");
+ goto err2;
+ }
+
+ /* Make sure all interesting pointers are sane. */
+
+ if (!mod_bound(mod->name, namelen, mod)) {
+ printk(KERN_ERR "init_module: mod->name out of bounds.\n");
+ goto err2;
+ }
+ if (mod->nsyms && !mod_bound(mod->syms, mod->nsyms, mod)) {
+ printk(KERN_ERR "init_module: mod->syms out of bounds.\n");
+ goto err2;
+ }
+ if (mod->ndeps && !mod_bound(mod->deps, mod->ndeps, mod)) {
+ printk(KERN_ERR "init_module: mod->deps out of bounds.\n");
+ goto err2;
+ }
+ if (mod->init && !mod_bound(mod->init, 0, mod)) {
+ printk(KERN_ERR "init_module: mod->init out of bounds.\n");
+ goto err2;
+ }
+ if (mod->cleanup && !mod_bound(mod->cleanup, 0, mod)) {
+ printk(KERN_ERR "init_module: mod->cleanup out of bounds.\n");
+ goto err2;
+ }
+ if (mod->ex_table_start > mod->ex_table_end
+ || (mod->ex_table_start &&
+ !((unsigned long)mod->ex_table_start >= ((unsigned long)mod + mod->size_of_struct)
+ && ((unsigned long)mod->ex_table_end
+ < (unsigned long)mod + mod->size)))
+ || (((unsigned long)mod->ex_table_start
+ - (unsigned long)mod->ex_table_end)
+ % sizeof(struct exception_table_entry))) {
+ printk(KERN_ERR "init_module: mod->ex_table_* invalid.\n");
+ goto err2;
+ }
+ if (mod->flags & ~MOD_AUTOCLEAN) {
+ printk(KERN_ERR "init_module: mod->flags invalid.\n");
+ goto err2;
+ }
+#ifdef __alpha__
+ if (!mod_bound(mod->gp - 0x8000, 0, mod)) {
+ printk(KERN_ERR "init_module: mod->gp out of bounds.\n");
+ goto err2;
+ }
+#endif
+ if (mod_member_present(mod, can_unload)
+ && mod->can_unload && !mod_bound(mod->can_unload, 0, mod)) {
+ printk(KERN_ERR "init_module: mod->can_unload out of bounds.\n");
+ goto err2;
+ }
+ if (mod_member_present(mod, kallsyms_end)) {
+ if (mod->kallsyms_end &&
+ (!mod_bound(mod->kallsyms_start, 0, mod) ||
+ !mod_bound(mod->kallsyms_end, 0, mod))) {
+ printk(KERN_ERR "init_module: mod->kallsyms out of bounds.\n");
+ goto err2;
+ }
+ if (mod->kallsyms_start > mod->kallsyms_end) {
+ printk(KERN_ERR "init_module: mod->kallsyms invalid.\n");
+ goto err2;
+ }
+ }
+ if (mod_member_present(mod, archdata_end)) {
+ if (mod->archdata_end &&
+ (!mod_bound(mod->archdata_start, 0, mod) ||
+ !mod_bound(mod->archdata_end, 0, mod))) {
+ printk(KERN_ERR "init_module: mod->archdata out of bounds.\n");
+ goto err2;
+ }
+ if (mod->archdata_start > mod->archdata_end) {
+ printk(KERN_ERR "init_module: mod->archdata invalid.\n");
+ goto err2;
+ }
+ }
+ if (mod_member_present(mod, kernel_data) && mod->kernel_data) {
+ printk(KERN_ERR "init_module: mod->kernel_data must be zero.\n");
+ goto err2;
+ }
+
+ /* Check that the user isn't doing something silly with the name. */
+
+ if ((n_namelen = get_mod_name(mod->name - (unsigned long)mod
+ + (unsigned long)mod_user,
+ &n_name)) < 0) {
+ printk(KERN_ERR "init_module: get_mod_name failure.\n");
+ error = n_namelen;
+ goto err2;
+ }
+ if (namelen != n_namelen || strcmp(n_name, mod_tmp.name) != 0) {
+ printk(KERN_ERR "init_module: changed module name to "
+ "`%s' from `%s'\n",
+ n_name, mod_tmp.name);
+ goto err3;
+ }
+
+ /* Ok, that's about all the sanity we can stomach; copy the rest. */
+
+ if (copy_from_user((char *)mod+mod_user_size,
+ (char *)mod_user+mod_user_size,
+ mod->size-mod_user_size)) {
+ error = -EFAULT;
+ goto err3;
+ }
+
+ if (module_arch_init(mod))
+ goto err3;
+
+ /* On some machines it is necessary to do something here
+ to make the I and D caches consistent. */
+ flush_icache_range((unsigned long)mod, (unsigned long)mod + mod->size);
+
+ mod->next = mod_tmp.next;
+ mod->refs = NULL;
+
+ /* Sanity check the module's dependencies */
+ for (i = 0, dep = mod->deps; i < mod->ndeps; ++i, ++dep) {
+ struct module *o, *d = dep->dep;
+
+ /* Make sure the indicated dependencies are really modules. */
+ if (d == mod) {
+ printk(KERN_ERR "init_module: self-referential "
+ "dependency in mod->deps.\n");
+ goto err3;
+ }
+
+ /* Scan the current modules for this dependency */
+ for (o = module_list; o != &kernel_module && o != d; o = o->next)
+ ;
+
+ if (o != d) {
+ printk(KERN_ERR "init_module: found dependency that is "
+ "(no longer?) a module.\n");
+ goto err3;
+ }
+ }
+
+ /* Update module references. */
+ for (i = 0, dep = mod->deps; i < mod->ndeps; ++i, ++dep) {
+ struct module *d = dep->dep;
+
+ dep->ref = mod;
+ dep->next_ref = d->refs;
+ d->refs = dep;
+ /* Being referenced by a dependent module counts as a
+ use as far as kmod is concerned. */
+ d->flags |= MOD_USED_ONCE;
+ }
+
+ /* Free our temporary memory. */
+ put_mod_name(n_name);
+ put_mod_name(name);
+
+ /* Initialize the module. */
+ mod->flags |= MOD_INITIALIZING;
+ atomic_set(&mod->uc.usecount,1);
+ if (mod->init && (error = mod->init()) != 0) {
+ atomic_set(&mod->uc.usecount,0);
+ mod->flags &= ~MOD_INITIALIZING;
+ if (error > 0) /* Buggy module */
+ error = -EBUSY;
+ goto err0;
+ }
+ atomic_dec(&mod->uc.usecount);
+
+ /* And set it running. */
+ mod->flags = (mod->flags | MOD_RUNNING) & ~MOD_INITIALIZING;
+ error = 0;
+ goto err0;
+
+err3:
+ put_mod_name(n_name);
+err2:
+ *mod = mod_tmp;
+ strcpy((char *)mod->name, name_tmp); /* We know there is room for this */
+err1:
+ put_mod_name(name);
+err0:
+ unlock_kernel();
+ kfree(name_tmp);
+ return error;
+}
+
+static spinlock_t unload_lock = SPIN_LOCK_UNLOCKED;
+int try_inc_mod_count(struct module *mod)
+{
+ int res = 1;
+ if (mod) {
+ spin_lock(&unload_lock);
+ if (mod->flags & MOD_DELETED)
+ res = 0;
+ else
+ __MOD_INC_USE_COUNT(mod);
+ spin_unlock(&unload_lock);
+ }
+ return res;
+}
+
+asmlinkage long
+sys_delete_module(const char *name_user)
+{
+ struct module *mod, *next;
+ char *name;
+ long error;
+ int something_changed;
+
+ if (!capable(CAP_SYS_MODULE))
+ return -EPERM;
+
+ lock_kernel();
+ if (name_user) {
+ if ((error = get_mod_name(name_user, &name)) < 0)
+ goto out;
+ if (error == 0) {
+ error = -EINVAL;
+ put_mod_name(name);
+ goto out;
+ }
+ error = -ENOENT;
+ if ((mod = find_module(name)) == NULL) {
+ put_mod_name(name);
+ goto out;
+ }
+ put_mod_name(name);
+ error = -EBUSY;
+ if (mod->refs != NULL)
+ goto out;
+
+ spin_lock(&unload_lock);
+ if (!__MOD_IN_USE(mod)) {
+ mod->flags |= MOD_DELETED;
+ spin_unlock(&unload_lock);
+ free_module(mod, 0);
+ error = 0;
+ } else {
+ spin_unlock(&unload_lock);
+ }
+ goto out;
+ }
+
+ /* Do automatic reaping */
+restart:
+ something_changed = 0;
+ for (mod = module_list; mod != &kernel_module; mod = next) {
+ next = mod->next;
+ spin_lock(&unload_lock);
+ if (mod->refs == NULL
+ && (mod->flags & MOD_AUTOCLEAN)
+ && (mod->flags & MOD_RUNNING)
+ && !(mod->flags & MOD_DELETED)
+ && (mod->flags & MOD_USED_ONCE)
+ && !__MOD_IN_USE(mod)) {
+ if ((mod->flags & MOD_VISITED)
+ && !(mod->flags & MOD_JUST_FREED)) {
+ spin_unlock(&unload_lock);
+ mod->flags &= ~MOD_VISITED;
+ } else {
+ mod->flags |= MOD_DELETED;
+ spin_unlock(&unload_lock);
+ free_module(mod, 1);
+ something_changed = 1;
+ }
+ } else {
+ spin_unlock(&unload_lock);
+ }
+ }
+ if (something_changed)
+ goto restart;
+ for (mod = module_list; mod != &kernel_module; mod = mod->next)
+ mod->flags &= ~MOD_JUST_FREED;
+ error = 0;
+out:
+ unlock_kernel();
+ return error;
+}
+
+/* Query various bits about modules. */
+
+static int
+qm_modules(char *buf, size_t bufsize, size_t *ret)
+{
+ struct module *mod;
+ size_t nmod, space, len;
+
+ nmod = space = 0;
+
+ for (mod=module_list; mod != &kernel_module; mod=mod->next, ++nmod) {
+ len = strlen(mod->name)+1;
+ if (len > bufsize)
+ goto calc_space_needed;
+ if (copy_to_user(buf, mod->name, len))
+ return -EFAULT;
+ buf += len;
+ bufsize -= len;
+ space += len;
+ }
+
+ if (put_user(nmod, ret))
+ return -EFAULT;
+ else
+ return 0;
+
+calc_space_needed:
+ space += len;
+ while ((mod = mod->next) != &kernel_module)
+ space += strlen(mod->name)+1;
+
+ if (put_user(space, ret))
+ return -EFAULT;
+ else
+ return -ENOSPC;
+}
+
+static int
+qm_deps(struct module *mod, char *buf, size_t bufsize, size_t *ret)
+{
+ size_t i, space, len;
+
+ if (mod == &kernel_module)
+ return -EINVAL;
+ if (!MOD_CAN_QUERY(mod))
+ if (put_user(0, ret))
+ return -EFAULT;
+ else
+ return 0;
+
+ space = 0;
+ for (i = 0; i < mod->ndeps; ++i) {
+ const char *dep_name = mod->deps[i].dep->name;
+
+ len = strlen(dep_name)+1;
+ if (len > bufsize)
+ goto calc_space_needed;
+ if (copy_to_user(buf, dep_name, len))
+ return -EFAULT;
+ buf += len;
+ bufsize -= len;
+ space += len;
+ }
+
+ if (put_user(i, ret))
+ return -EFAULT;
+ else
+ return 0;
+
+calc_space_needed:
+ space += len;
+ while (++i < mod->ndeps)
+ space += strlen(mod->deps[i].dep->name)+1;
+
+ if (put_user(space, ret))
+ return -EFAULT;
+ else
+ return -ENOSPC;
+}
+
+static int
+qm_refs(struct module *mod, char *buf, size_t bufsize, size_t *ret)
+{
+ size_t nrefs, space, len;
+ struct module_ref *ref;
+
+ if (mod == &kernel_module)
+ return -EINVAL;
+ if (!MOD_CAN_QUERY(mod))
+ if (put_user(0, ret))
+ return -EFAULT;
+ else
+ return 0;
+
+ space = 0;
+ for (nrefs = 0, ref = mod->refs; ref ; ++nrefs, ref = ref->next_ref) {
+ const char *ref_name = ref->ref->name;
+
+ len = strlen(ref_name)+1;
+ if (len > bufsize)
+ goto calc_space_needed;
+ if (copy_to_user(buf, ref_name, len))
+ return -EFAULT;
+ buf += len;
+ bufsize -= len;
+ space += len;
+ }
+
+ if (put_user(nrefs, ret))
+ return -EFAULT;
+ else
+ return 0;
+
+calc_space_needed:
+ space += len;
+ while ((ref = ref->next_ref) != NULL)
+ space += strlen(ref->ref->name)+1;
+
+ if (put_user(space, ret))
+ return -EFAULT;
+ else
+ return -ENOSPC;
+}
+
+static int
+qm_symbols(struct module *mod, char *buf, size_t bufsize, size_t *ret)
+{
+ size_t i, space, len;
+ struct module_symbol *s;
+ char *strings;
+ unsigned long *vals;
+
+ if (!MOD_CAN_QUERY(mod))
+ if (put_user(0, ret))
+ return -EFAULT;
+ else
+ return 0;
+
+ space = mod->nsyms * 2*sizeof(void *);
+
+ i = len = 0;
+ s = mod->syms;
+
+ if (space > bufsize)
+ goto calc_space_needed;
+
+ if (!access_ok(VERIFY_WRITE, buf, space))
+ return -EFAULT;
+
+ bufsize -= space;
+ vals = (unsigned long *)buf;
+ strings = buf+space;
+
+ for (; i < mod->nsyms ; ++i, ++s, vals += 2) {
+ len = strlen(s->name)+1;
+ if (len > bufsize)
+ goto calc_space_needed;
+
+ if (copy_to_user(strings, s->name, len)
+ || __put_user(s->value, vals+0)
+ || __put_user(space, vals+1))
+ return -EFAULT;
+
+ strings += len;
+ bufsize -= len;
+ space += len;
+ }
+
+ if (put_user(i, ret))
+ return -EFAULT;
+ else
+ return 0;
+
+calc_space_needed:
+ for (; i < mod->nsyms; ++i, ++s)
+ space += strlen(s->name)+1;
+
+ if (put_user(space, ret))
+ return -EFAULT;
+ else
+ return -ENOSPC;
+}
+
+static int
+qm_info(struct module *mod, char *buf, size_t bufsize, size_t *ret)
+{
+ int error = 0;
+
+ if (mod == &kernel_module)
+ return -EINVAL;
+
+ if (sizeof(struct module_info) <= bufsize) {
+ struct module_info info;
+ info.addr = (unsigned long)mod;
+ info.size = mod->size;
+ info.flags = mod->flags;
+ info.usecount = (mod_member_present(mod, can_unload)
+ && mod->can_unload ? -1 : atomic_read(&mod->uc.usecount));
+
+ if (copy_to_user(buf, &info, sizeof(struct module_info)))
+ return -EFAULT;
+ } else
+ error = -ENOSPC;
+
+ if (put_user(sizeof(struct module_info), ret))
+ return -EFAULT;
+
+ return error;
+}
+
+asmlinkage long
+sys_query_module(const char *name_user, int which, char *buf, size_t bufsize,
+ size_t *ret)
+{
+ struct module *mod;
+ int err;
+
+ lock_kernel();
+ if (name_user == NULL)
+ mod = &kernel_module;
+ else {
+ long namelen;
+ char *name;
+
+ if ((namelen = get_mod_name(name_user, &name)) < 0) {
+ err = namelen;
+ goto out;
+ }
+ err = -ENOENT;
+ if (namelen == 0)
+ mod = &kernel_module;
+ else if ((mod = find_module(name)) == NULL) {
+ put_mod_name(name);
+ goto out;
+ }
+ put_mod_name(name);
+ }
+
+ switch (which)
+ {
+ case 0:
+ err = 0;
+ break;
+ case QM_MODULES:
+ err = qm_modules(buf, bufsize, ret);
+ break;
+ case QM_DEPS:
+ err = qm_deps(mod, buf, bufsize, ret);
+ break;
+ case QM_REFS:
+ err = qm_refs(mod, buf, bufsize, ret);
+ break;
+ case QM_SYMBOLS:
+ err = qm_symbols(mod, buf, bufsize, ret);
+ break;
+ case QM_INFO:
+ err = qm_info(mod, buf, bufsize, ret);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+out:
+ unlock_kernel();
+ return err;
+}
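+
+/*
+ * Usage sketch (annotation, not part of the original import): user space
+ * reaches this through the query_module(2) syscall; modutils-style tools
+ * list the loaded modules roughly like this:
+ *
+ *	char names[1024];
+ *	size_t n;
+ *	query_module(NULL, QM_MODULES, names, sizeof(names), &n);
+ *	(on success, names holds n NUL-separated module names)
+ */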
+
+/*
+ * Copy the kernel symbol table to user space. If the argument is
+ * NULL, just return the size of the table.
+ *
+ * This call is obsolete. New programs should use query_module+QM_SYMBOLS
+ * which does not arbitrarily limit the length of symbols.
+ */
+
+asmlinkage long
+sys_get_kernel_syms(struct kernel_sym *table)
+{
+ struct module *mod;
+ int i;
+ struct kernel_sym ksym;
+
+ lock_kernel();
+ for (mod = module_list, i = 0; mod; mod = mod->next) {
+ /* include the count for the module name! */
+ i += mod->nsyms + 1;
+ }
+
+ if (table == NULL)
+ goto out;
+
+ /* So that we don't give the user our stack content */
+ memset (&ksym, 0, sizeof (ksym));
+
+ for (mod = module_list, i = 0; mod; mod = mod->next) {
+ struct module_symbol *msym;
+ unsigned int j;
+
+ if (!MOD_CAN_QUERY(mod))
+ continue;
+
+ /* magic: write module info as a pseudo symbol */
+ ksym.value = (unsigned long)mod;
+ ksym.name[0] = '#';
+ strncpy(ksym.name+1, mod->name, sizeof(ksym.name)-1);
+ ksym.name[sizeof(ksym.name)-1] = '\0';
+
+ if (copy_to_user(table, &ksym, sizeof(ksym)) != 0)
+ goto out;
+ ++i, ++table;
+
+ if (mod->nsyms == 0)
+ continue;
+
+ for (j = 0, msym = mod->syms; j < mod->nsyms; ++j, ++msym) {
+ ksym.value = msym->value;
+ strncpy(ksym.name, msym->name, sizeof(ksym.name));
+ ksym.name[sizeof(ksym.name)-1] = '\0';
+
+ if (copy_to_user(table, &ksym, sizeof(ksym)) != 0)
+ goto out;
+ ++i, ++table;
+ }
+ }
+out:
+ unlock_kernel();
+ return i;
+}
+
+/*
+ * Look for a module by name, ignoring modules marked for deletion.
+ */
+
+struct module *
+find_module(const char *name)
+{
+ struct module *mod;
+
+ for (mod = module_list; mod ; mod = mod->next) {
+ if (mod->flags & MOD_DELETED)
+ continue;
+ if (!strcmp(mod->name, name))
+ break;
+ }
+
+ return mod;
+}
+
+/*
+ * Free the given module.
+ */
+
+void
+free_module(struct module *mod, int tag_freed)
+{
+ struct module_ref *dep;
+ unsigned i;
+
+ /* Let the module clean up. */
+
+ if (mod->flags & MOD_RUNNING)
+ {
+ if(mod->cleanup)
+ mod->cleanup();
+ mod->flags &= ~MOD_RUNNING;
+ }
+
+ /* Remove the module from the dependency lists. */
+
+ for (i = 0, dep = mod->deps; i < mod->ndeps; ++i, ++dep) {
+ struct module_ref **pp;
+ for (pp = &dep->dep->refs; *pp != dep; pp = &(*pp)->next_ref)
+ continue;
+ *pp = dep->next_ref;
+ if (tag_freed && dep->dep->refs == NULL)
+ dep->dep->flags |= MOD_JUST_FREED;
+ }
+
+ /* And from the main module list. */
+
+ if (mod == module_list) {
+ module_list = mod->next;
+ } else {
+ struct module *p;
+ for (p = module_list; p->next != mod; p = p->next)
+ continue;
+ p->next = mod->next;
+ }
+
+ /* And free the memory. */
+
+ module_unmap(mod);
+}
+
+/*
+ * Called by the /proc file system to return a current list of modules.
+ */
+
+int get_module_list(char *p)
+{
+ size_t left = PAGE_SIZE;
+ struct module *mod;
+ char tmpstr[64];
+ struct module_ref *ref;
+
+ for (mod = module_list; mod != &kernel_module; mod = mod->next) {
+ long len;
+ const char *q;
+
+#define safe_copy_str(str, len) \
+ do { \
+ if (left < len) \
+ goto fini; \
+ memcpy(p, str, len); p += len, left -= len; \
+ } while (0)
+#define safe_copy_cstr(str) safe_copy_str(str, sizeof(str)-1)
+
+ len = strlen(mod->name);
+ safe_copy_str(mod->name, len);
+
+ if ((len = 20 - len) > 0) {
+ if (left < len)
+ goto fini;
+ memset(p, ' ', len);
+ p += len;
+ left -= len;
+ }
+
+ len = sprintf(tmpstr, "%8lu", mod->size);
+ safe_copy_str(tmpstr, len);
+
+ if (mod->flags & MOD_RUNNING) {
+ len = sprintf(tmpstr, "%4ld",
+ (mod_member_present(mod, can_unload)
+ && mod->can_unload
+ ? -1L : (long)atomic_read(&mod->uc.usecount)));
+ safe_copy_str(tmpstr, len);
+ }
+
+ if (mod->flags & MOD_DELETED)
+ safe_copy_cstr(" (deleted)");
+ else if (mod->flags & MOD_RUNNING) {
+ if (mod->flags & MOD_AUTOCLEAN)
+ safe_copy_cstr(" (autoclean)");
+ if (!(mod->flags & MOD_USED_ONCE))
+ safe_copy_cstr(" (unused)");
+ }
+ else if (mod->flags & MOD_INITIALIZING)
+ safe_copy_cstr(" (initializing)");
+ else
+ safe_copy_cstr(" (uninitialized)");
+
+ if ((ref = mod->refs) != NULL) {
+ safe_copy_cstr(" [");
+ while (1) {
+ q = ref->ref->name;
+ len = strlen(q);
+ safe_copy_str(q, len);
+
+ if ((ref = ref->next_ref) != NULL)
+ safe_copy_cstr(" ");
+ else
+ break;
+ }
+ safe_copy_cstr("]");
+ }
+ safe_copy_cstr("\n");
+
+#undef safe_copy_str
+#undef safe_copy_cstr
+ }
+
+fini:
+ return PAGE_SIZE - left;
+}
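+
+/*
+ * Example of the resulting /proc/modules output (annotation, not part of
+ * the original import; the module names and sizes are made up):
+ *
+ *	lockd                  53632   1 (autoclean) [nfsd]
+ *	nfsd                   70656   8 (autoclean)
+ */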
+
+/*
+ * Called by the /proc file system to return a current list of ksyms.
+ */
+
+int
+get_ksyms_list(char *buf, char **start, off_t offset, int length)
+{
+ struct module *mod;
+ char *p = buf;
+ int len = 0; /* code from net/ipv4/proc.c */
+ off_t pos = 0;
+ off_t begin = 0;
+
+ for (mod = module_list; mod; mod = mod->next) {
+ unsigned i;
+ struct module_symbol *sym;
+
+ if (!MOD_CAN_QUERY(mod))
+ continue;
+
+ for (i = mod->nsyms, sym = mod->syms; i > 0; --i, ++sym) {
+ p = buf + len;
+ if (*mod->name) {
+ len += sprintf(p, "%0*lx %s\t[%s]\n",
+ (int)(2*sizeof(void*)),
+ sym->value, sym->name,
+ mod->name);
+ } else {
+ len += sprintf(p, "%0*lx %s\n",
+ (int)(2*sizeof(void*)),
+ sym->value, sym->name);
+ }
+ pos = begin + len;
+ if (pos < offset) {
+ len = 0;
+ begin = pos;
+ }
+ pos = begin + len;
+ if (pos > offset+length)
+ goto leave_the_loop;
+ }
+ }
+leave_the_loop:
+ *start = buf + (offset - begin);
+ len -= (offset - begin);
+ if (len > length)
+ len = length;
+ return len;
+}
+
+#else /* CONFIG_MODULES */
+
+/* Dummy syscalls for people who don't want modules */
+
+asmlinkage unsigned long
+sys_create_module(const char *name_user, size_t size)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long
+sys_init_module(const char *name_user, struct module *mod_user)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long
+sys_delete_module(const char *name_user)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long
+sys_query_module(const char *name_user, int which, char *buf, size_t bufsize,
+ size_t *ret)
+{
+ /* Let the program know about the new interface. Not that
+ it'll do them much good. */
+ if (which == 0)
+ return 0;
+
+ return -ENOSYS;
+}
+
+asmlinkage long
+sys_get_kernel_syms(struct kernel_sym *table)
+{
+ return -ENOSYS;
+}
+
+int try_inc_mod_count(struct module *mod)
+{
+ return 1;
+}
+
+#endif /* CONFIG_MODULES */
diff --git a/kernel/panic.c b/kernel/panic.c
new file mode 100644
index 000000000000..ac246f74589a
--- /dev/null
+++ b/kernel/panic.c
@@ -0,0 +1,103 @@
+/*
+ * linux/kernel/panic.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * This function is used through-out the kernel (including mm and fs)
+ * to indicate a major problem.
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/sysrq.h>
+#include <linux/interrupt.h>
+
+asmlinkage void sys_sync(void); /* it's really int */
+extern void unblank_console(void);
+
+int panic_timeout;
+
+struct notifier_block *panic_notifier_list;
+
+static int __init panic_setup(char *str)
+{
+ panic_timeout = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+
+__setup("panic=", panic_setup);
+
+/**
+ * panic - halt the system
+ * @fmt: The text string to print
+ *
+ * Display a message, then unblank the console and perform
+ * cleanups. Functions in the panic notifier list are called
+ * after the filesystem cache is flushed (when possible).
+ *
+ * This function never returns.
+ */
+
+NORET_TYPE void panic(const char * fmt, ...)
+{
+ static char buf[1024];
+ va_list args;
+#if defined(CONFIG_ARCH_S390)
+ unsigned long caller = (unsigned long) __builtin_return_address(0);
+#endif
+
+ va_start(args, fmt);
+ vsprintf(buf, fmt, args);
+ va_end(args);
+ printk(KERN_EMERG "Kernel panic: %s\n",buf);
+ if (in_interrupt())
+ printk(KERN_EMERG "In interrupt handler - not syncing\n");
+ else if (!current->pid)
+ printk(KERN_EMERG "In idle task - not syncing\n");
+ else
+ sys_sync();
+
+ unblank_console();
+
+#ifdef CONFIG_SMP
+ smp_send_stop();
+#endif
+
+ notifier_call_chain(&panic_notifier_list, 0, NULL);
+
+ if (panic_timeout > 0)
+ {
+ /*
+ * Delay timeout seconds before rebooting the machine.
+ * We can't use the "normal" timers since we just panicked..
+ */
+ printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
+ mdelay(panic_timeout*1000);
+ /*
+		 * Should we run the reboot notifier? For the moment I'm
+		 * choosing not to. It might crash, be corrupt, or do
+		 * more harm than good for other reasons.
+ */
+ machine_restart(NULL);
+ }
+#ifdef __sparc__
+ {
+ extern int stop_a_enabled;
+ /* Make sure the user can actually press L1-A */
+ stop_a_enabled = 1;
+ printk("Press L1-A to return to the boot prom\n");
+ }
+#endif
+#if defined(CONFIG_ARCH_S390)
+ disabled_wait(caller);
+#endif
+ sti();
+ for(;;) {
+ CHECK_EMERGENCY_SYNC
+ }
+}
diff --git a/kernel/pm.c b/kernel/pm.c
new file mode 100644
index 000000000000..eb7c6f615bd8
--- /dev/null
+++ b/kernel/pm.c
@@ -0,0 +1,245 @@
+/*
+ * pm.c - Power management interface
+ *
+ * Copyright (C) 2000 Andrew Henroid
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+
+int pm_active;
+
+static spinlock_t pm_devs_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(pm_devs);
+
+/**
+ * pm_register - register a device with power management
+ * @type: device type
+ * @id: device ID
+ * @callback: callback function
+ *
+ * Add a device to the list of devices that wish to be notified about
+ * power management events. A &pm_dev structure is returned on success;
+ * on failure the return is %NULL.
+ */
+
+struct pm_dev *pm_register(pm_dev_t type,
+ unsigned long id,
+ pm_callback callback)
+{
+ struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
+ if (dev) {
+ unsigned long flags;
+
+ memset(dev, 0, sizeof(*dev));
+ dev->type = type;
+ dev->id = id;
+ dev->callback = callback;
+
+ spin_lock_irqsave(&pm_devs_lock, flags);
+ list_add(&dev->entry, &pm_devs);
+ spin_unlock_irqrestore(&pm_devs_lock, flags);
+ }
+ return dev;
+}
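+
+/*
+ * Illustrative sketch (annotation, not part of the original import): how a
+ * driver might hook into this interface.  "mydev" is hypothetical; the code
+ * is compiled out via #if 0.
+ */
+#if 0
+static struct pm_dev *mydev_pm;
+
+static int mydev_pm_event(struct pm_dev *dev, pm_request_t rqst, void *data)
+{
+	switch (rqst) {
+	case PM_SUSPEND:
+		/* quiesce the hardware; "data" is the target state */
+		break;
+	case PM_RESUME:
+		/* bring the hardware back up */
+		break;
+	}
+	return 0;	/* a non-zero return would veto a suspend */
+}
+
+static int mydev_init(void)
+{
+	mydev_pm = pm_register(PM_UNKNOWN_DEV, 0, mydev_pm_event);
+	return 0;
+}
+#endif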
+
+/**
+ * pm_unregister - unregister a device with power management
+ * @dev: device to unregister
+ *
+ * Remove a device from the power management notification lists. The
+ * dev passed must be a handle previously returned by pm_register.
+ */
+
+void pm_unregister(struct pm_dev *dev)
+{
+ if (dev) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&pm_devs_lock, flags);
+ list_del(&dev->entry);
+ spin_unlock_irqrestore(&pm_devs_lock, flags);
+
+ kfree(dev);
+ }
+}
+
+/**
+ * pm_unregister_all - unregister all devices with matching callback
+ * @callback: callback function pointer
+ *
+ * Unregister every device that would call the callback passed. This
+ * is primarily meant as a helper function for loadable modules. It
+ * enables a module to give up all its managed devices without keeping
+ * its own private list.
+ */
+
+void pm_unregister_all(pm_callback callback)
+{
+ struct list_head *entry;
+
+ if (!callback)
+ return;
+
+ entry = pm_devs.next;
+ while (entry != &pm_devs) {
+ struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+ entry = entry->next;
+ if (dev->callback == callback)
+ pm_unregister(dev);
+ }
+}
+
+/**
+ * pm_send - send request to a single device
+ * @dev: device to send to
+ * @rqst: power management request
+ * @data: data for the callback
+ *
+ * Issue a power management request to a given device. The
+ * %PM_SUSPEND and %PM_RESUME events are handled specially. The
+ * data field must hold the intended next state. No call is made
+ * if the state matches.
+ *
+ *	BUGS: what stops two power management requests occurring in parallel
+ *	and conflicting?
+ */
+
+int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
+{
+ int status = 0;
+ int prev_state, next_state;
+ switch (rqst) {
+ case PM_SUSPEND:
+ case PM_RESUME:
+ prev_state = dev->state;
+ next_state = (int) data;
+ if (prev_state != next_state) {
+ if (dev->callback)
+ status = (*dev->callback)(dev, rqst, data);
+ if (!status) {
+ dev->state = next_state;
+ dev->prev_state = prev_state;
+ }
+ }
+ else {
+ dev->prev_state = prev_state;
+ }
+ break;
+ default:
+ if (dev->callback)
+ status = (*dev->callback)(dev, rqst, data);
+ break;
+ }
+ return status;
+}
+
+/*
+ * Undo incomplete request
+ */
+static void pm_undo_all(struct pm_dev *last)
+{
+ struct list_head *entry = last->entry.prev;
+ while (entry != &pm_devs) {
+ struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+ if (dev->state != dev->prev_state) {
+			/* If the previous state was 0 (running), resume;
+			 * if it was non-zero (suspended), suspend again.
+			 */
+ pm_request_t undo = (dev->prev_state
+ ? PM_SUSPEND:PM_RESUME);
+ pm_send(dev, undo, (void*) dev->prev_state);
+ }
+ entry = entry->prev;
+ }
+}
+
+/**
+ * pm_send_all - send request to all managed devices
+ * @rqst: power management request
+ * @data: data for the callback
+ *
+ *	Issue a power management request to all devices. The
+ *	%PM_SUSPEND events are handled specially. Any device is
+ *	permitted to fail a suspend by returning a non-zero (error)
+ * value from its callback function. If any device vetoes a
+ * suspend request then all other devices that have suspended
+ * during the processing of this request are restored to their
+ * previous state.
+ *
+ * Zero is returned on success. If a suspend fails then the status
+ * from the device that vetoes the suspend is returned.
+ *
+ *	BUGS: what stops two power management requests occurring in parallel
+ *	and conflicting?
+ */
+
+int pm_send_all(pm_request_t rqst, void *data)
+{
+ struct list_head *entry = pm_devs.next;
+ while (entry != &pm_devs) {
+ struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+ if (dev->callback) {
+ int status = pm_send(dev, rqst, data);
+ if (status) {
+ /* return devices to previous state on
+ * failed suspend request
+ */
+ if (rqst == PM_SUSPEND)
+ pm_undo_all(dev);
+ return status;
+ }
+ }
+ entry = entry->next;
+ }
+ return 0;
+}
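+
+/*
+ * Illustrative sketch (annotation, not part of the original import): a
+ * suspend path in the style of apm/acpi glue code.  The function name and
+ * the target state value 3 are placeholders; compiled out via #if 0.
+ */
+#if 0
+static int suspend_everything(void)
+{
+	int err;
+
+	err = pm_send_all(PM_SUSPEND, (void *)3);
+	if (err)
+		return err;	/* a device vetoed; the others were undone */
+	/* ... put the machine to sleep here ... */
+	pm_send_all(PM_RESUME, (void *)0);
+	return 0;
+}
+#endif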
+
+/**
+ * pm_find - find a device
+ * @type: type of device
+ * @from: where to start looking
+ *
+ * Scan the power management list for devices of a specific type. The
+ * return value for a matching device may be passed to further calls
+ * to this function to find further matches. A %NULL indicates the end
+ * of the list.
+ *
+ * To search from the beginning pass %NULL as the @from value.
+ */
+
+struct pm_dev *pm_find(pm_dev_t type, struct pm_dev *from)
+{
+ struct list_head *entry = from ? from->entry.next:pm_devs.next;
+ while (entry != &pm_devs) {
+ struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
+ if (type == PM_UNKNOWN_DEV || dev->type == type)
+ return dev;
+ entry = entry->next;
+ }
+ return 0;
+}
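+
+/*
+ * Illustrative sketch (annotation, not part of the original import): walking
+ * the list with pm_find().  PM_UNKNOWN_DEV matches devices of any type, as
+ * the test above shows.  Compiled out via #if 0.
+ */
+#if 0
+static void count_pm_devs(void)
+{
+	struct pm_dev *dev = NULL;
+	int n = 0;
+
+	while ((dev = pm_find(PM_UNKNOWN_DEV, dev)) != NULL)
+		n++;
+	printk(KERN_DEBUG "pm: %d devices registered\n", n);
+}
+#endif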
+
+EXPORT_SYMBOL(pm_register);
+EXPORT_SYMBOL(pm_unregister);
+EXPORT_SYMBOL(pm_unregister_all);
+EXPORT_SYMBOL(pm_send);
+EXPORT_SYMBOL(pm_send_all);
+EXPORT_SYMBOL(pm_find);
+EXPORT_SYMBOL(pm_active);
diff --git a/kernel/printk.c b/kernel/printk.c
new file mode 100644
index 000000000000..4a459b6051d8
--- /dev/null
+++ b/kernel/printk.c
@@ -0,0 +1,497 @@
+/*
+ * linux/kernel/printk.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Modified to make sys_syslog() more flexible: added commands to
+ * return the last 4k of kernel messages, regardless of whether
+ * they've been read or not. Added option to suppress kernel printk's
+ * to the console. Added hook for sending the console messages
+ * elsewhere, in preparation for a serial line console (someday).
+ * Ted Ts'o, 2/11/93.
+ * Modified for sysctl support, 1/8/97, Chris Horn.
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul
+ * manfreds@colorfullife.com
+ */
+
+#include <linux/mm.h>
+#include <linux/tty_driver.h>
+#include <linux/smp_lock.h>
+#include <linux/console.h>
+#include <linux/init.h>
+
+#include <asm/uaccess.h>
+
+#define LOG_BUF_LEN (16384)
+#define LOG_BUF_MASK (LOG_BUF_LEN-1)
+
+static char buf[1024];
+
+/* printk's without a loglevel use this.. */
+#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
+
+/* We show everything that is MORE important than this.. */
+#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
+#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
+
+unsigned long log_size;
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
+
+/* Keep together for sysctl support */
+int console_loglevel = DEFAULT_CONSOLE_LOGLEVEL;
+int default_message_loglevel = DEFAULT_MESSAGE_LOGLEVEL;
+int minimum_console_loglevel = MINIMUM_CONSOLE_LOGLEVEL;
+int default_console_loglevel = DEFAULT_CONSOLE_LOGLEVEL;
+
+spinlock_t console_lock = SPIN_LOCK_UNLOCKED;
+
+struct console *console_drivers;
+static char log_buf[LOG_BUF_LEN];
+static unsigned long log_start;
+static unsigned long logged_chars;
+struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
+static int preferred_console = -1;
+
+/*
+ *	Set up a list of consoles. Called from init/main.c.
+ */
+static int __init console_setup(char *str)
+{
+ struct console_cmdline *c;
+ char name[sizeof(c->name)];
+ char *s, *options;
+ int i, idx;
+
+ /*
+ * Decode str into name, index, options.
+ */
+ if (str[0] >= '0' && str[0] <= '9') {
+ strcpy(name, "ttyS");
+ strncpy(name + 4, str, sizeof(name) - 5);
+ } else
+ strncpy(name, str, sizeof(name) - 1);
+ name[sizeof(name) - 1] = 0;
+ if ((options = strchr(str, ',')) != NULL)
+ *(options++) = 0;
+#ifdef __sparc__
+ if (!strcmp(str, "ttya"))
+ strcpy(name, "ttyS0");
+ if (!strcmp(str, "ttyb"))
+ strcpy(name, "ttyS1");
+#endif
+ for(s = name; *s; s++)
+ if (*s >= '0' && *s <= '9')
+ break;
+ idx = simple_strtoul(s, NULL, 10);
+ *s = 0;
+
+ /*
+ * See if this tty is not yet registered, and
+ * if we have a slot free.
+ */
+ for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+ if (strcmp(console_cmdline[i].name, name) == 0 &&
+ console_cmdline[i].index == idx) {
+ preferred_console = i;
+ return 1;
+ }
+ if (i == MAX_CMDLINECONSOLES)
+ return 1;
+ preferred_console = i;
+ c = &console_cmdline[i];
+ memcpy(c->name, name, sizeof(c->name));
+ c->options = options;
+ c->index = idx;
+ return 1;
+}
+
+__setup("console=", console_setup);
+
+/*
+ * Commands to do_syslog:
+ *
+ * 0 -- Close the log. Currently a NOP.
+ * 1 -- Open the log. Currently a NOP.
+ * 2 -- Read from the log.
+ * 3 -- Read all messages remaining in the ring buffer.
+ * 4 -- Read and clear all messages remaining in the ring buffer
+ * 5 -- Clear ring buffer.
+ * 6 -- Disable printk's to console
+ * 7 -- Enable printk's to console
+ * 8 -- Set level of messages printed to console
+ */
+int do_syslog(int type, char * buf, int len)
+{
+ unsigned long i, j, limit, count;
+ int do_clear = 0;
+ char c;
+ int error = -EPERM;
+
+ error = 0;
+ switch (type) {
+ case 0: /* Close log */
+ break;
+ case 1: /* Open log */
+ break;
+ case 2: /* Read from log */
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ error = verify_area(VERIFY_WRITE,buf,len);
+ if (error)
+ goto out;
+ error = wait_event_interruptible(log_wait, log_size);
+ if (error)
+ goto out;
+ i = 0;
+ spin_lock_irq(&console_lock);
+ while (log_size && i < len) {
+ c = log_buf[log_start & LOG_BUF_MASK];
+ log_start++;
+ log_size--;
+ spin_unlock_irq(&console_lock);
+ __put_user(c,buf);
+ buf++;
+ i++;
+ spin_lock_irq(&console_lock);
+ }
+ spin_unlock_irq(&console_lock);
+ error = i;
+ break;
+ case 4: /* Read/clear last kernel messages */
+ do_clear = 1;
+ /* FALL THRU */
+ case 3: /* Read last kernel messages */
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ error = verify_area(VERIFY_WRITE,buf,len);
+ if (error)
+ goto out;
+ count = len;
+ if (count > LOG_BUF_LEN)
+ count = LOG_BUF_LEN;
+ spin_lock_irq(&console_lock);
+ if (count > logged_chars)
+ count = logged_chars;
+ if (do_clear)
+ logged_chars = 0;
+ limit = log_start + log_size;
+ /*
+ * __put_user() could sleep, and while we sleep
+ * printk() could overwrite the messages
+ * we try to copy to user space. Therefore
+ * the messages are copied in reverse. <manfreds>
+ */
+ for(i=0;i < count;i++) {
+ j = limit-1-i;
+ if (j+LOG_BUF_LEN < log_start+log_size)
+ break;
+ c = log_buf[ j & LOG_BUF_MASK ];
+ spin_unlock_irq(&console_lock);
+ __put_user(c,&buf[count-1-i]);
+ spin_lock_irq(&console_lock);
+ }
+ spin_unlock_irq(&console_lock);
+ error = i;
+ if(i != count) {
+ int offset = count-error;
+ /* buffer overflow during copy, correct user buffer. */
+ for(i=0;i<error;i++) {
+ __get_user(c,&buf[i+offset]);
+ __put_user(c,&buf[i]);
+ }
+ }
+
+ break;
+ case 5: /* Clear ring buffer */
+ spin_lock_irq(&console_lock);
+ logged_chars = 0;
+ spin_unlock_irq(&console_lock);
+ break;
+ case 6: /* Disable logging to console */
+ spin_lock_irq(&console_lock);
+ console_loglevel = minimum_console_loglevel;
+ spin_unlock_irq(&console_lock);
+ break;
+ case 7: /* Enable logging to console */
+ spin_lock_irq(&console_lock);
+ console_loglevel = default_console_loglevel;
+ spin_unlock_irq(&console_lock);
+ break;
+ case 8:
+ error = -EINVAL;
+ if (len < 1 || len > 8)
+ goto out;
+ if (len < minimum_console_loglevel)
+ len = minimum_console_loglevel;
+ spin_lock_irq(&console_lock);
+ console_loglevel = len;
+ spin_unlock_irq(&console_lock);
+ error = 0;
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+out:
+ return error;
+}
+
+asmlinkage long sys_syslog(int type, char * buf, int len)
+{
+ if ((type != 3) && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return do_syslog(type, buf, len);
+}
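+
+/*
+ * Usage sketch (annotation, not part of the original import): user space
+ * reaches this through syslog(2), e.g. via glibc's klogctl() wrapper:
+ *
+ *	char buf[4096];
+ *	int n = klogctl(3, buf, sizeof(buf));	// read the ring buffer
+ *	klogctl(8, NULL, 5);			// set console_loglevel to 5
+ */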
+
+asmlinkage int printk(const char *fmt, ...)
+{
+ va_list args;
+ int i;
+ char *msg, *p, *buf_end;
+ int line_feed;
+ static signed char msg_level = -1;
+ long flags;
+
+ spin_lock_irqsave(&console_lock, flags);
+ va_start(args, fmt);
+ i = vsprintf(buf + 3, fmt, args); /* hopefully i < sizeof(buf)-4 */
+ buf_end = buf + 3 + i;
+ va_end(args);
+ for (p = buf + 3; p < buf_end; p++) {
+ msg = p;
+ if (msg_level < 0) {
+ if (
+ p[0] != '<' ||
+ p[1] < '0' ||
+ p[1] > '7' ||
+ p[2] != '>'
+ ) {
+ p -= 3;
+ p[0] = '<';
+ p[1] = default_message_loglevel + '0';
+ p[2] = '>';
+ } else
+ msg += 3;
+ msg_level = p[1] - '0';
+ }
+ line_feed = 0;
+ for (; p < buf_end; p++) {
+ log_buf[(log_start+log_size) & LOG_BUF_MASK] = *p;
+ if (log_size < LOG_BUF_LEN)
+ log_size++;
+ else
+ log_start++;
+
+ logged_chars++;
+ if (*p == '\n') {
+ line_feed = 1;
+ break;
+ }
+ }
+ if (msg_level < console_loglevel && console_drivers) {
+ struct console *c = console_drivers;
+ while(c) {
+ if ((c->flags & CON_ENABLED) && c->write)
+ c->write(c, msg, p - msg + line_feed);
+ c = c->next;
+ }
+ }
+ if (line_feed)
+ msg_level = -1;
+ }
+ spin_unlock_irqrestore(&console_lock, flags);
+ wake_up_interruptible(&log_wait);
+ return i;
+}
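+
+/*
+ * Usage note (annotation, not part of the original import): callers pass the
+ * loglevel as a "<n>" prefix, normally via the KERN_* macros, e.g.
+ *
+ *	printk(KERN_WARNING "mydrv: unexpected status %x\n", status);
+ *
+ * A message without such a prefix has "<4>" (DEFAULT_MESSAGE_LOGLEVEL)
+ * prepended by the code above before it is logged and, loglevel permitting,
+ * written to the registered consoles.
+ */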
+
+void console_print(const char *s)
+{
+ struct console *c;
+ unsigned long flags;
+ int len = strlen(s);
+
+ spin_lock_irqsave(&console_lock, flags);
+ c = console_drivers;
+ while(c) {
+ if ((c->flags & CON_ENABLED) && c->write)
+ c->write(c, s, len);
+ c = c->next;
+ }
+ spin_unlock_irqrestore(&console_lock, flags);
+}
+
+void unblank_console(void)
+{
+ struct console *c;
+ unsigned long flags;
+
+ spin_lock_irqsave(&console_lock, flags);
+ c = console_drivers;
+ while(c) {
+ if ((c->flags & CON_ENABLED) && c->unblank)
+ c->unblank();
+ c = c->next;
+ }
+ spin_unlock_irqrestore(&console_lock, flags);
+}
+
+/*
+ * The console driver calls this routine during kernel initialization
+ * to register the console printing procedure with printk() and to
+ * print any messages that were printed by the kernel before the
+ * console driver was initialized.
+ */
+void register_console(struct console * console)
+{
+ int i, j,len;
+ int p;
+ char buf[16];
+ signed char msg_level = -1;
+ char *q;
+ unsigned long flags;
+
+ /*
+ * See if we want to use this console driver. If we
+ * didn't select a console we take the first one
+ * that registers here.
+ */
+ if (preferred_console < 0) {
+ if (console->index < 0)
+ console->index = 0;
+ if (console->setup == NULL ||
+ console->setup(console, NULL) == 0) {
+ console->flags |= CON_ENABLED | CON_CONSDEV;
+ preferred_console = 0;
+ }
+ }
+
+ /*
+ * See if this console matches one we selected on
+ * the command line.
+ */
+ for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) {
+ if (strcmp(console_cmdline[i].name, console->name) != 0)
+ continue;
+ if (console->index >= 0 &&
+ console->index != console_cmdline[i].index)
+ continue;
+ if (console->index < 0)
+ console->index = console_cmdline[i].index;
+ if (console->setup &&
+ console->setup(console, console_cmdline[i].options) != 0)
+ break;
+ console->flags |= CON_ENABLED;
+ console->index = console_cmdline[i].index;
+ if (i == preferred_console)
+ console->flags |= CON_CONSDEV;
+ break;
+ }
+
+ if (!(console->flags & CON_ENABLED))
+ return;
+
+ /*
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
+ */
+ spin_lock_irqsave(&console_lock, flags);
+ if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
+ console->next = console_drivers;
+ console_drivers = console;
+ } else {
+ console->next = console_drivers->next;
+ console_drivers->next = console;
+ }
+ if ((console->flags & CON_PRINTBUFFER) == 0)
+ goto done;
+ /*
+ * Print out buffered log messages.
+ */
+ p = log_start & LOG_BUF_MASK;
+
+ for (i=0,j=0; i < log_size; i++) {
+ buf[j++] = log_buf[p];
+ p = (p+1) & LOG_BUF_MASK;
+ if (buf[j-1] != '\n' && i < log_size - 1 && j < sizeof(buf)-1)
+ continue;
+ buf[j] = 0;
+ q = buf;
+ len = j;
+ if (msg_level < 0) {
+ if(buf[0] == '<' &&
+ buf[1] >= '0' &&
+ buf[1] <= '7' &&
+ buf[2] == '>') {
+ msg_level = buf[1] - '0';
+ q = buf + 3;
+ len -= 3;
+ } else
+ {
+ msg_level = default_message_loglevel;
+ }
+ }
+ if (msg_level < console_loglevel)
+ console->write(console, q, len);
+ if (buf[j-1] == '\n')
+ msg_level = -1;
+ j = 0;
+ }
+done:
+ spin_unlock_irqrestore(&console_lock, flags);
+}
+
+
+int unregister_console(struct console * console)
+{
+ struct console *a,*b;
+ unsigned long flags;
+ int res = 1;
+
+ spin_lock_irqsave(&console_lock, flags);
+ if (console_drivers == console) {
+ console_drivers=console->next;
+ res = 0;
+ } else
+ {
+ for (a=console_drivers->next, b=console_drivers ;
+ a; b=a, a=b->next) {
+ if (a == console) {
+ b->next = a->next;
+ res = 0;
+ break;
+ }
+ }
+ }
+
+	/* If the last console is removed, we re-enable picking the first
+	 * one that gets registered. Without that, the pmac early boot console
+	 * would prevent fbcon from taking over.
+ */
+ if (console_drivers == NULL)
+ preferred_console = -1;
+
+
+ spin_unlock_irqrestore(&console_lock, flags);
+ return res;
+}
+
+/*
+ * Write a message to a certain tty, not just the console. This is used for
+ * messages that need to be redirected to a specific tty.
+ * We don't put it into the syslog queue right now; maybe in the future,
+ * if really needed.
+ */
+void tty_write_message(struct tty_struct *tty, char *msg)
+{
+ if (tty && tty->driver.write)
+ tty->driver.write(tty, 0, msg, strlen(msg));
+ return;
+}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
new file mode 100644
index 000000000000..410f9de937bc
--- /dev/null
+++ b/kernel/ptrace.c
@@ -0,0 +1,193 @@
+/*
+ * linux/kernel/ptrace.c
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Common interfaces for "ptrace()" which we do not want
+ * to continually duplicate across every architecture.
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+
+/*
+ * Access another process' address space, one page at a time.
+ */
+static int access_one_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write)
+{
+ pgd_t * pgdir;
+ pmd_t * pgmiddle;
+ pte_t * pgtable;
+ char *maddr;
+ struct page *page;
+
+repeat:
+ pgdir = pgd_offset(vma->vm_mm, addr);
+ if (pgd_none(*pgdir))
+ goto fault_in_page;
+ if (pgd_bad(*pgdir))
+ goto bad_pgd;
+ pgmiddle = pmd_offset(pgdir, addr);
+ if (pmd_none(*pgmiddle))
+ goto fault_in_page;
+ if (pmd_bad(*pgmiddle))
+ goto bad_pmd;
+ pgtable = pte_offset(pgmiddle, addr);
+ if (!pte_present(*pgtable))
+ goto fault_in_page;
+ if (write && (!pte_write(*pgtable) || !pte_dirty(*pgtable)))
+ goto fault_in_page;
+ page = pte_page(*pgtable);
+
+ /* ZERO_PAGE is special: reads from it are ok even though it's marked reserved */
+ if (page != ZERO_PAGE(addr) || write) {
+ if ((!VALID_PAGE(page)) || PageReserved(page))
+ return 0;
+ }
+ flush_cache_page(vma, addr);
+
+ if (write) {
+ maddr = kmap(page);
+ memcpy(maddr + (addr & ~PAGE_MASK), buf, len);
+ flush_page_to_ram(page);
+ flush_icache_page(vma, page);
+ kunmap(page);
+ } else {
+ maddr = kmap(page);
+ memcpy(buf, maddr + (addr & ~PAGE_MASK), len);
+ flush_page_to_ram(page);
+ kunmap(page);
+ }
+ return len;
+
+fault_in_page:
+ /* -1: out of memory. 0 - unmapped page */
+ if (handle_mm_fault(mm, vma, addr, write) > 0)
+ goto repeat;
+ return 0;
+
+bad_pgd:
+ pgd_ERROR(*pgdir);
+ return 0;
+
+bad_pmd:
+ pmd_ERROR(*pgmiddle);
+ return 0;
+}
+
+static int access_mm(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write)
+{
+ int copied = 0;
+
+ for (;;) {
+ unsigned long offset = addr & ~PAGE_MASK;
+ int this_len = PAGE_SIZE - offset;
+ int retval;
+
+ if (this_len > len)
+ this_len = len;
+ retval = access_one_page(mm, vma, addr, buf, this_len, write);
+ copied += retval;
+ if (retval != this_len)
+ break;
+
+ len -= retval;
+ if (!len)
+ break;
+
+ addr += retval;
+ buf += retval;
+
+ if (addr < vma->vm_end)
+ continue;
+ if (!vma->vm_next)
+ break;
+ if (vma->vm_next->vm_start != vma->vm_end)
+ break;
+
+ vma = vma->vm_next;
+ }
+ return copied;
+}
+
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ int copied;
+ struct mm_struct *mm;
+ struct vm_area_struct * vma;
+
+ /* Worry about races with exit() */
+ task_lock(tsk);
+ mm = tsk->mm;
+ if (mm)
+ atomic_inc(&mm->mm_users);
+ task_unlock(tsk);
+ if (!mm)
+ return 0;
+
+ down(&mm->mmap_sem);
+ vma = find_extend_vma(mm, addr);
+ copied = 0;
+ if (vma)
+ copied = access_mm(mm, vma, addr, buf, len, write);
+
+ up(&mm->mmap_sem);
+ mmput(mm);
+ return copied;
+}
+
+int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len)
+{
+ int copied = 0;
+
+ while (len > 0) {
+ char buf[128];
+ int this_len, retval;
+
+ this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+ retval = access_process_vm(tsk, src, buf, this_len, 0);
+ if (!retval) {
+ if (copied)
+ break;
+ return -EIO;
+ }
+ if (copy_to_user(dst, buf, retval))
+ return -EFAULT;
+ copied += retval;
+ src += retval;
+ dst += retval;
+ len -= retval;
+ }
+ return copied;
+}
+
+int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len)
+{
+ int copied = 0;
+
+ while (len > 0) {
+ char buf[128];
+ int this_len, retval;
+
+ this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+ if (copy_from_user(buf, src, this_len))
+ return -EFAULT;
+ retval = access_process_vm(tsk, dst, buf, this_len, 1);
+ if (!retval) {
+ if (copied)
+ break;
+ return -EIO;
+ }
+ copied += retval;
+ src += retval;
+ dst += retval;
+ len -= retval;
+ }
+ return copied;
+}
diff --git a/kernel/resource.c b/kernel/resource.c
new file mode 100644
index 000000000000..b553eb0ff2e9
--- /dev/null
+++ b/kernel/resource.c
@@ -0,0 +1,322 @@
+/*
+ * linux/kernel/resource.c
+ *
+ * Copyright (C) 1999 Linus Torvalds
+ * Copyright (C) 1999 Martin Mares <mj@ucw.cz>
+ *
+ * Arbitrary resource management.
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/malloc.h>
+#include <linux/spinlock.h>
+#include <asm/io.h>
+
+struct resource ioport_resource = { "PCI IO", 0x0000, IO_SPACE_LIMIT, IORESOURCE_IO };
+struct resource iomem_resource = { "PCI mem", 0x00000000, 0xffffffff, IORESOURCE_MEM };
+
+static rwlock_t resource_lock = RW_LOCK_UNLOCKED;
+
+/*
+ * This generates reports for /proc/ioports and /proc/iomem
+ */
+static char * do_resource_list(struct resource *entry, const char *fmt, int offset, char *buf, char *end)
+{
+ if (offset < 0)
+ offset = 0;
+
+ while (entry) {
+ const char *name = entry->name;
+ unsigned long from, to;
+
+ if ((int) (end-buf) < 80)
+ return buf;
+
+ from = entry->start;
+ to = entry->end;
+ if (!name)
+ name = "<BAD>";
+
+ buf += sprintf(buf, fmt + offset, from, to, name);
+ if (entry->child)
+ buf = do_resource_list(entry->child, fmt, offset-2, buf, end);
+ entry = entry->sibling;
+ }
+
+ return buf;
+}
+
+int get_resource_list(struct resource *root, char *buf, int size)
+{
+ char *fmt;
+ int retval;
+
+ fmt = " %08lx-%08lx : %s\n";
+ if (root->end < 0x10000)
+ fmt = " %04lx-%04lx : %s\n";
+ read_lock(&resource_lock);
+ retval = do_resource_list(root->child, fmt, 8, buf, buf + size) - buf;
+ read_unlock(&resource_lock);
+ return retval;
+}
+
+/* Return the conflict entry if you can't request it */
+static struct resource * __request_resource(struct resource *root, struct resource *new)
+{
+ unsigned long start = new->start;
+ unsigned long end = new->end;
+ struct resource *tmp, **p;
+
+ if (end < start)
+ return root;
+ if (start < root->start)
+ return root;
+ if (end > root->end)
+ return root;
+ p = &root->child;
+ for (;;) {
+ tmp = *p;
+ if (!tmp || tmp->start > end) {
+ new->sibling = tmp;
+ *p = new;
+ new->parent = root;
+ return NULL;
+ }
+ p = &tmp->sibling;
+ if (tmp->end < start)
+ continue;
+ return tmp;
+ }
+}
+
+static int __release_resource(struct resource *old)
+{
+ struct resource *tmp, **p;
+
+ p = &old->parent->child;
+ for (;;) {
+ tmp = *p;
+ if (!tmp)
+ break;
+ if (tmp == old) {
+ *p = tmp->sibling;
+ old->parent = NULL;
+ return 0;
+ }
+ p = &tmp->sibling;
+ }
+ return -EINVAL;
+}
+
+int request_resource(struct resource *root, struct resource *new)
+{
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+ conflict = __request_resource(root, new);
+ write_unlock(&resource_lock);
+ return conflict ? -EBUSY : 0;
+}
+
+int release_resource(struct resource *old)
+{
+ int retval;
+
+ write_lock(&resource_lock);
+ retval = __release_resource(old);
+ write_unlock(&resource_lock);
+ return retval;
+}
+
+int check_resource(struct resource *root, unsigned long start, unsigned long len)
+{
+ struct resource *conflict, tmp;
+
+ tmp.start = start;
+ tmp.end = start + len - 1;
+ write_lock(&resource_lock);
+ conflict = __request_resource(root, &tmp);
+ if (!conflict)
+ __release_resource(&tmp);
+ write_unlock(&resource_lock);
+ return conflict ? -EBUSY : 0;
+}
+
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ */
+static int find_resource(struct resource *root, struct resource *new,
+ unsigned long size,
+ unsigned long min, unsigned long max,
+ unsigned long align,
+ void (*alignf)(void *, struct resource *, unsigned long),
+ void *alignf_data)
+{
+ struct resource *this = root->child;
+
+ new->start = root->start;
+ for(;;) {
+ if (this)
+ new->end = this->start;
+ else
+ new->end = root->end;
+ if (new->start < min)
+ new->start = min;
+ if (new->end > max)
+ new->end = max;
+ new->start = (new->start + align - 1) & ~(align - 1);
+ if (alignf)
+ alignf(alignf_data, new, size);
+ if (new->start < new->end && new->end - new->start + 1 >= size) {
+ new->end = new->start + size - 1;
+ return 0;
+ }
+ if (!this)
+ break;
+ new->start = this->end + 1;
+ this = this->sibling;
+ }
+ return -EBUSY;
+}
+
+/*
+ * Allocate empty slot in the resource tree given range and alignment.
+ */
+int allocate_resource(struct resource *root, struct resource *new,
+ unsigned long size,
+ unsigned long min, unsigned long max,
+ unsigned long align,
+ void (*alignf)(void *, struct resource *, unsigned long),
+ void *alignf_data)
+{
+ int err;
+
+ write_lock(&resource_lock);
+ err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+ if (err >= 0 && __request_resource(root, new))
+ err = -EBUSY;
+ write_unlock(&resource_lock);
+ return err;
+}
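+
+/*
+ * Illustrative sketch (annotation, not part of the original import): carving
+ * a 4K, 4K-aligned window out of the PCI memory space for a device with no
+ * fixed address.  The resource name and the min/max bounds are placeholders;
+ * compiled out via #if 0.
+ */
+#if 0
+static struct resource mywin = { "mywin", 0, 0, IORESOURCE_MEM };
+
+static int grab_window(void)
+{
+	return allocate_resource(&iomem_resource, &mywin, 0x1000,
+				 0x10000000, 0xffffffff, 0x1000, NULL, NULL);
+}
+#endif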
+
+/*
+ * This is compatibility stuff for IO resources.
+ *
+ * Note how this, unlike the above, knows about
+ * the IO flag meanings (busy etc).
+ *
+ * Request-region creates a new busy region.
+ *
+ * Check-region returns non-zero if the area is already busy
+ *
+ * Release-region releases a matching busy region.
+ */
+struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
+{
+ struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
+
+ if (res) {
+ memset(res, 0, sizeof(*res));
+ res->name = name;
+ res->start = start;
+ res->end = start + n - 1;
+ res->flags = IORESOURCE_BUSY;
+
+ write_lock(&resource_lock);
+
+ for (;;) {
+ struct resource *conflict;
+
+ conflict = __request_resource(parent, res);
+ if (!conflict)
+ break;
+ if (conflict != parent) {
+ parent = conflict;
+ if (!(conflict->flags & IORESOURCE_BUSY))
+ continue;
+ }
+
+ /* Uhhuh, that didn't work out.. */
+ kfree(res);
+ res = NULL;
+ break;
+ }
+ write_unlock(&resource_lock);
+ }
+ return res;
+}
+
+int __check_region(struct resource *parent, unsigned long start, unsigned long n)
+{
+ struct resource * res;
+
+ res = __request_region(parent, start, n, "check-region");
+ if (!res)
+ return -EBUSY;
+
+ release_resource(res);
+ kfree(res);
+ return 0;
+}
+
+void __release_region(struct resource *parent, unsigned long start, unsigned long n)
+{
+ struct resource **p;
+ unsigned long end;
+
+ p = &parent->child;
+ end = start + n - 1;
+
+ for (;;) {
+ struct resource *res = *p;
+
+ if (!res)
+ break;
+ if (res->start <= start && res->end >= end) {
+ if (!(res->flags & IORESOURCE_BUSY)) {
+ p = &res->child;
+ continue;
+ }
+ if (res->start != start || res->end != end)
+ break;
+ *p = res->sibling;
+ kfree(res);
+ return;
+ }
+ p = &res->sibling;
+ }
+ printk("Trying to free nonexistent resource <%08lx-%08lx>\n", start, end);
+}
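+
+/*
+ * Illustrative sketch (annotation, not part of the original import): a driver
+ * claiming its I/O ports through the request_region()/release_region()
+ * wrappers from <linux/ioport.h>, which call the functions above with
+ * &ioport_resource.  The driver name and port range are hypothetical;
+ * compiled out via #if 0.
+ */
+#if 0
+static int __init mydrv_init(void)
+{
+	if (!request_region(0x300, 8, "mydrv"))
+		return -EBUSY;		/* ports already owned by someone else */
+	/* ... probe and set up the hardware ... */
+	return 0;
+}
+
+static void __exit mydrv_exit(void)
+{
+	release_region(0x300, 8);
+}
+#endif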
+
+/*
+ * Called from init/main.c to reserve IO ports.
+ */
+#define MAXRESERVE 4
+static int __init reserve_setup(char *str)
+{
+ int opt = 2, io_start, io_num;
+ static int reserved = 0;
+ static struct resource reserve[MAXRESERVE];
+
+ while (opt==2) {
+ int x = reserved;
+
+ if (get_option (&str, &io_start) != 2) break;
+ if (get_option (&str, &io_num) == 0) break;
+ if (x < MAXRESERVE) {
+ struct resource *res = reserve + x;
+ res->name = "reserved";
+ res->start = io_start;
+ res->end = io_start + io_num - 1;
+ res->child = NULL;
+ if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
+ reserved = x+1;
+ }
+ }
+ return 1;
+}
+
+__setup("reserve=", reserve_setup);
diff --git a/kernel/sched.c b/kernel/sched.c
new file mode 100644
index 000000000000..bc2dcfa70cfc
--- /dev/null
+++ b/kernel/sched.c
@@ -0,0 +1,1269 @@
+/*
+ * linux/kernel/sched.c
+ *
+ * Kernel scheduler and related syscalls
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
+ * make semaphores SMP safe
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
+ */
+
+/*
+ * 'sched.c' is the main kernel file. It contains scheduling primitives
+ * (sleep_on, wakeup, schedule etc) as well as a number of simple system
+ * call functions (type getpid()), which just extract a field from
+ * current-task
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+
+extern void timer_bh(void);
+extern void tqueue_bh(void);
+extern void immediate_bh(void);
+
+/*
+ * scheduler variables
+ */
+
+unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
+
+extern void mem_use(void);
+
+/*
+ * Scheduling quanta.
+ *
+ * NOTE! The unix "nice" value influences how long a process
+ * gets. The nice value ranges from -20 to +19, where a -20
+ * is a "high-priority" task, and a "+10" is a low-priority
+ * task.
+ *
+ * We want the time-slice to be around 50ms or so, so this
+ * calculation depends on the value of HZ.
+ */
+#if HZ < 200
+#define TICK_SCALE(x) ((x) >> 2)
+#elif HZ < 400
+#define TICK_SCALE(x) ((x) >> 1)
+#elif HZ < 800
+#define TICK_SCALE(x) (x)
+#elif HZ < 1600
+#define TICK_SCALE(x) ((x) << 1)
+#else
+#define TICK_SCALE(x) ((x) << 2)
+#endif
+
+#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
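+
+/*
+ * Worked example (annotation, not part of the original import): with HZ=100
+ * (10 ms ticks) TICK_SCALE(x) is x >> 2, so NICE_TO_TICKS(0) = (20 >> 2) + 1
+ * = 6 ticks, i.e. roughly 60 ms per slice; nice +19 gets 1 tick and nice -20
+ * gets 11 ticks.
+ */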
+
+
+/*
+ * Init task must be ok at boot for the ix86 as we will check its signals
+ * via the SMP irq return path.
+ */
+
+struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
+
+/*
+ * The tasklist_lock protects the linked list of processes.
+ *
+ * The runqueue_lock locks the parts that actually access
+ * and change the run-queues, and have to be interrupt-safe.
+ *
+ * If both locks are to be concurrently held, the runqueue_lock
+ * nests inside the tasklist_lock.
+ */
+spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
+rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
+
+static LIST_HEAD(runqueue_head);
+
+/*
+ * We align per-CPU scheduling data on cacheline boundaries,
+ * to prevent cacheline ping-pong.
+ */
+static union {
+ struct schedule_data {
+ struct task_struct * curr;
+ cycles_t last_schedule;
+ } schedule_data;
+ char __pad [SMP_CACHE_BYTES];
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+
+#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
+#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+
+struct kernel_stat kstat;
+
+#ifdef CONFIG_SMP
+
+#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
+#define can_schedule(p,cpu) ((!(p)->has_cpu) && \
+ ((p)->cpus_allowed & (1 << cpu)))
+
+#else
+
+#define idle_task(cpu) (&init_task)
+#define can_schedule(p,cpu) (1)
+
+#endif
+
+void scheduling_functions_start_here(void) { }
+
+/*
+ * This is the function that decides how desirable a process is..
+ * You can weigh different processes against each other depending
+ * on what CPU they've run on lately etc to try to handle cache
+ * and TLB miss penalties.
+ *
+ * Return values:
+ * -1000: never select this
+ * 0: out of time, recalculate counters (but it might still be
+ * selected)
+ * +ve: "goodness" value (the larger, the better)
+ * +1000: realtime process, select this.
+ */
+
+static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+{
+ int weight;
+
+ /*
+ * select the current process after every other
+ * runnable process, but before the idle thread.
+	 * Also, don't trigger a counter recalculation.
+ */
+ weight = -1;
+ if (p->policy & SCHED_YIELD)
+ goto out;
+
+ /*
+ * Non-RT process - normal case first.
+ */
+ if (p->policy == SCHED_OTHER) {
+ /*
+ * Give the process a first-approximation goodness value
+ * according to the number of clock-ticks it has left.
+ *
+ * Don't do any other calculations if the time slice is
+ * over..
+ */
+ weight = p->counter;
+ if (!weight)
+ goto out;
+
+#ifdef CONFIG_SMP
+ /* Give a largish advantage to the same processor... */
+ /* (this is equivalent to penalizing other processors) */
+ if (p->processor == this_cpu)
+ weight += PROC_CHANGE_PENALTY;
+#endif
+
+ /* .. and a slight advantage to the current MM */
+ if (p->mm == this_mm || !p->mm)
+ weight += 1;
+ weight += 20 - p->nice;
+ goto out;
+ }
+
+ /*
+ * Realtime process, select the first one on the
+ * runqueue (taking priorities within processes
+ * into account).
+ */
+ weight = 1000 + p->rt_priority;
+out:
+ return weight;
+}
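+
+/*
+ * Worked example (annotation, not part of the original import): a SCHED_OTHER
+ * process with counter 6 and nice 0 scores 6 + 20 = 26, plus 1 if it shares
+ * the current MM and plus PROC_CHANGE_PENALTY on SMP if it last ran on this
+ * CPU; a SCHED_FIFO process with rt_priority 50 always scores 1050 and so
+ * beats any non-realtime process.
+ */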
+
+/*
+ * the 'goodness value' of replacing a process on a given CPU.
+ * positive value means 'replace', zero or negative means 'dont'.
+ */
+static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
+{
+ return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
+}
+
+/*
+ * This is ugly, but reschedule_idle() is very timing-critical.
+ * We are called with the runqueue spinlock held and we must
+ * not claim the tasklist_lock.
+ */
+static FASTCALL(void reschedule_idle(struct task_struct * p));
+
+static void reschedule_idle(struct task_struct * p)
+{
+#ifdef CONFIG_SMP
+ int this_cpu = smp_processor_id();
+ struct task_struct *tsk, *target_tsk;
+ int cpu, best_cpu, i, max_prio;
+ cycles_t oldest_idle;
+
+ /*
+ * shortcut if the woken up task's last CPU is
+ * idle now.
+ */
+ best_cpu = p->processor;
+ if (can_schedule(p, best_cpu)) {
+ tsk = idle_task(best_cpu);
+ if (cpu_curr(best_cpu) == tsk) {
+ int need_resched;
+send_now_idle:
+ /*
+ * If need_resched == -1 then we can skip sending
+			 * the IPI altogether; tsk->need_resched is
+			 * actively watched by the idle thread.
+ */
+ need_resched = tsk->need_resched;
+ tsk->need_resched = 1;
+ if ((best_cpu != this_cpu) && !need_resched)
+ smp_send_reschedule(best_cpu);
+ return;
+ }
+ }
+
+ /*
+	 * We know that the preferred CPU has a cache-affine current
+	 * process, so let's try to find a new idle CPU for the woken-up
+	 * process. Select the least recently active idle CPU (that
+	 * one will have the least active cache context). Also find
+	 * the executing process which has the lowest priority.
+ */
+ oldest_idle = (cycles_t) -1;
+ target_tsk = NULL;
+ max_prio = 1;
+
+ for (i = 0; i < smp_num_cpus; i++) {
+ cpu = cpu_logical_map(i);
+ if (!can_schedule(p, cpu))
+ continue;
+ tsk = cpu_curr(cpu);
+ /*
+ * We use the first available idle CPU. This creates
+ * a priority list between idle CPUs, but this is not
+ * a problem.
+ */
+ if (tsk == idle_task(cpu)) {
+ if (last_schedule(cpu) < oldest_idle) {
+ oldest_idle = last_schedule(cpu);
+ target_tsk = tsk;
+ }
+ } else {
+ if (oldest_idle == -1ULL) {
+ int prio = preemption_goodness(tsk, p, cpu);
+
+ if (prio > max_prio) {
+ max_prio = prio;
+ target_tsk = tsk;
+ }
+ }
+ }
+ }
+ tsk = target_tsk;
+ if (tsk) {
+ if (oldest_idle != -1ULL) {
+ best_cpu = tsk->processor;
+ goto send_now_idle;
+ }
+ tsk->need_resched = 1;
+ if (tsk->processor != this_cpu)
+ smp_send_reschedule(tsk->processor);
+ }
+ return;
+
+
+#else /* UP */
+ int this_cpu = smp_processor_id();
+ struct task_struct *tsk;
+
+ tsk = cpu_curr(this_cpu);
+ if (preemption_goodness(tsk, p, this_cpu) > 1)
+ tsk->need_resched = 1;
+#endif
+}
+
+/*
+ * Careful!
+ *
+ * This has to add the process to the _beginning_ of the
+ * run-queue, not the end. See the comment about "This is
+ * subtle" in the scheduler proper..
+ */
+static inline void add_to_runqueue(struct task_struct * p)
+{
+ list_add(&p->run_list, &runqueue_head);
+ nr_running++;
+}
+
+static inline void move_last_runqueue(struct task_struct * p)
+{
+ list_del(&p->run_list);
+ list_add_tail(&p->run_list, &runqueue_head);
+}
+
+static inline void move_first_runqueue(struct task_struct * p)
+{
+ list_del(&p->run_list);
+ list_add(&p->run_list, &runqueue_head);
+}
+
+/*
+ * Wake up a process. Put it on the run-queue if it's not
+ * already there. The "current" process is always on the
+ * run-queue (except when the actual re-schedule is in
+ * progress), and as such you're allowed to do the simpler
+ * "current->state = TASK_RUNNING" to mark yourself runnable
+ * without the overhead of this.
+ */
+inline void wake_up_process(struct task_struct * p)
+{
+ unsigned long flags;
+
+ /*
+	 * We want the common case to fall straight through, thus the goto.
+ */
+ spin_lock_irqsave(&runqueue_lock, flags);
+ p->state = TASK_RUNNING;
+ if (task_on_runqueue(p))
+ goto out;
+ add_to_runqueue(p);
+ reschedule_idle(p);
+out:
+ spin_unlock_irqrestore(&runqueue_lock, flags);
+}
+
+static inline void wake_up_process_synchronous(struct task_struct * p)
+{
+ unsigned long flags;
+
+ /*
+	 * We want the common case to fall straight through, thus the goto.
+ */
+ spin_lock_irqsave(&runqueue_lock, flags);
+ p->state = TASK_RUNNING;
+ if (task_on_runqueue(p))
+ goto out;
+ add_to_runqueue(p);
+out:
+ spin_unlock_irqrestore(&runqueue_lock, flags);
+}
+
+static void process_timeout(unsigned long __data)
+{
+ struct task_struct * p = (struct task_struct *) __data;
+
+ wake_up_process(p);
+}
+
+signed long schedule_timeout(signed long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+		 * This special case is a convenience for the caller.
+		 * Nothing more. We could take MAX_SCHEDULE_TIMEOUT
+		 * from one of the negative values, but I'd like to return
+		 * a valid offset (>=0) to allow the caller to do
+		 * everything it wants with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+		 * Another bit of paranoia. Note that the retval will be
+		 * 0 since no piece of the kernel is supposed to check
+		 * for a negative retval of schedule_timeout() (since it
+		 * should never happen anyway). You just have the printk()
+		 * that will tell you if something has gone wrong, and where.
+ */
+ if (timeout < 0)
+ {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx from %p\n", timeout,
+ __builtin_return_address(0));
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ init_timer(&timer);
+ timer.expires = expire;
+ timer.data = (unsigned long) current;
+ timer.function = process_timeout;
+
+ add_timer(&timer);
+ schedule();
+ del_timer_sync(&timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
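+
+/*
+ * Illustrative sketch (annotation, not part of the original import): a
+ * typical caller sleeps for up to a second, waking early on a signal.  The
+ * function name is hypothetical; compiled out via #if 0.
+ */
+#if 0
+static void wait_a_bit(void)
+{
+	signed long left;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	left = schedule_timeout(HZ);	/* HZ jiffies == one second */
+	if (left)
+		printk(KERN_DEBUG "woken up %ld jiffies early\n", left);
+}
+#endif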
+
+/*
+ * schedule_tail() is getting called from the fork return path. This
+ * cleans up all remaining scheduler things, without impacting the
+ * common case.
+ */
+static inline void __schedule_tail(struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ int policy;
+
+ /*
+ * prev->policy can be written from here only before `prev'
+ * can be scheduled (before setting prev->has_cpu to zero).
+ * Of course it must also be read before allowing prev
+ * to be rescheduled, but since the write depends on the read
+ * to complete, wmb() is enough. (the spin_lock() acquired
+ * before setting has_cpu is not enough because the spin_lock()
+ * common code semantics allows code outside the critical section
+ * to enter inside the critical section)
+ */
+ policy = prev->policy;
+ prev->policy = policy & ~SCHED_YIELD;
+ wmb();
+
+ /*
+ * fast path falls through. We have to clear has_cpu before
+ * checking prev->state to avoid a wakeup race - thus we
+ * also have to protect against the task exiting early.
+ */
+ task_lock(prev);
+ prev->has_cpu = 0;
+ mb();
+ if (prev->state == TASK_RUNNING)
+ goto needs_resched;
+
+out_unlock:
+ task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
+ return;
+
+ /*
+ * Slow path - we 'push' the previous process and
+ * reschedule_idle() will attempt to find a new
+ * processor for it. (but it might preempt the
+ * current process as well.) We must take the runqueue
+ * lock and re-check prev->state to be correct. It might
+ * still happen that this process has a preemption
+ * 'in progress' already - but this is not a problem and
+ * might happen in other circumstances as well.
+ */
+needs_resched:
+ {
+ unsigned long flags;
+
+ /*
+ * Avoid taking the runqueue lock in cases where
+		 * no preemption-check is necessary:
+ */
+ if ((prev == idle_task(smp_processor_id())) ||
+ (policy & SCHED_YIELD))
+ goto out_unlock;
+
+ spin_lock_irqsave(&runqueue_lock, flags);
+ if (prev->state == TASK_RUNNING)
+ reschedule_idle(prev);
+ spin_unlock_irqrestore(&runqueue_lock, flags);
+ goto out_unlock;
+ }
+#else
+ prev->policy &= ~SCHED_YIELD;
+#endif /* CONFIG_SMP */
+}
+
+void schedule_tail(struct task_struct *prev)
+{
+ __schedule_tail(prev);
+}
+
+/*
+ * 'schedule()' is the scheduler function. It's a very simple and nice
+ * scheduler: it's not perfect, but certainly works for most things.
+ *
+ * The goto is "interesting".
+ *
+ * NOTE!! Task 0 is the 'idle' task, which gets called when no other
+ * tasks can run. It cannot be killed, and it cannot sleep. The 'state'
+ * information in task[0] is never used.
+ */
+asmlinkage void schedule(void)
+{
+ struct schedule_data * sched_data;
+ struct task_struct *prev, *next, *p;
+ struct list_head *tmp;
+ int this_cpu, c;
+
+ if (!current->active_mm) BUG();
+need_resched_back:
+ prev = current;
+ this_cpu = prev->processor;
+
+ if (in_interrupt())
+ goto scheduling_in_interrupt;
+
+ release_kernel_lock(prev, this_cpu);
+
+ /* Do "administrative" work here while we don't hold any locks */
+ if (softirq_active(this_cpu) & softirq_mask(this_cpu))
+ goto handle_softirq;
+handle_softirq_back:
+
+ /*
+ * 'sched_data' is protected by the fact that we can run
+ * only one process per CPU.
+ */
+ sched_data = & aligned_data[this_cpu].schedule_data;
+
+ spin_lock_irq(&runqueue_lock);
+
+ /* move an exhausted RR process to be last.. */
+ if (prev->policy == SCHED_RR)
+ goto move_rr_last;
+move_rr_back:
+
+ switch (prev->state) {
+ case TASK_INTERRUPTIBLE:
+ if (signal_pending(prev)) {
+ prev->state = TASK_RUNNING;
+ break;
+ }
+ default:
+ del_from_runqueue(prev);
+ case TASK_RUNNING:;
+ }
+ prev->need_resched = 0;
+
+ /*
+ * this is the scheduler proper:
+ */
+
+repeat_schedule:
+ /*
+ * Default process to select..
+ */
+ next = idle_task(this_cpu);
+ c = -1000;
+ if (prev->state == TASK_RUNNING)
+ goto still_running;
+
+still_running_back:
+ list_for_each(tmp, &runqueue_head) {
+ p = list_entry(tmp, struct task_struct, run_list);
+ if (can_schedule(p, this_cpu)) {
+ int weight = goodness(p, this_cpu, prev->active_mm);
+ if (weight > c)
+ c = weight, next = p;
+ }
+ }
+
+ /* Do we need to re-calculate counters? */
+ if (!c)
+ goto recalculate;
+ /*
+ * from this point on nothing can prevent us from
+ * switching to the next task, save this fact in
+ * sched_data.
+ */
+ sched_data->curr = next;
+#ifdef CONFIG_SMP
+ next->has_cpu = 1;
+ next->processor = this_cpu;
+#endif
+ spin_unlock_irq(&runqueue_lock);
+
+ if (prev == next)
+ goto same_process;
+
+#ifdef CONFIG_SMP
+ /*
+ * maintain the per-process 'last schedule' value.
+ * (this has to be recalculated even if we reschedule to
+ * the same process) Currently this is only used on SMP,
+ * and it's approximate, so we do not have to maintain
+ * it while holding the runqueue spinlock.
+ */
+ sched_data->last_schedule = get_cycles();
+
+ /*
+ * We drop the scheduler lock early (it's a global spinlock),
+ * thus we have to lock the previous process from getting
+ * rescheduled during switch_to().
+ */
+
+#endif /* CONFIG_SMP */
+
+ kstat.context_swtch++;
+ /*
+ * there are 3 processes which are affected by a context switch:
+ *
+ * prev == .... ==> (last => next)
+ *
+ * It's the 'much more previous' 'prev' that is on next's stack,
+ * but prev is set to (the just run) 'last' process by switch_to().
+ * This might sound slightly confusing but makes tons of sense.
+ */
+ prepare_to_switch();
+ {
+ struct mm_struct *mm = next->mm;
+ struct mm_struct *oldmm = prev->active_mm;
+ if (!mm) {
+ if (next->active_mm) BUG();
+ next->active_mm = oldmm;
+ atomic_inc(&oldmm->mm_count);
+ enter_lazy_tlb(oldmm, next, this_cpu);
+ } else {
+ if (next->active_mm != mm) BUG();
+ switch_mm(oldmm, mm, next, this_cpu);
+ }
+
+ if (!prev->mm) {
+ prev->active_mm = NULL;
+ mmdrop(oldmm);
+ }
+ }
+
+ /*
+ * This just switches the register state and the
+ * stack.
+ */
+ switch_to(prev, next, prev);
+ __schedule_tail(prev);
+
+same_process:
+ reacquire_kernel_lock(current);
+ if (current->need_resched)
+ goto need_resched_back;
+
+ return;
+
+recalculate:
+ {
+ struct task_struct *p;
+ spin_unlock_irq(&runqueue_lock);
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+ read_unlock(&tasklist_lock);
+ spin_lock_irq(&runqueue_lock);
+ }
+ goto repeat_schedule;
+
+still_running:
+ c = goodness(prev, this_cpu, prev->active_mm);
+ next = prev;
+ goto still_running_back;
+
+handle_softirq:
+ do_softirq();
+ goto handle_softirq_back;
+
+move_rr_last:
+ if (!prev->counter) {
+ prev->counter = NICE_TO_TICKS(prev->nice);
+ move_last_runqueue(prev);
+ }
+ goto move_rr_back;
+
+scheduling_in_interrupt:
+ printk("Scheduling in interrupt\n");
+ BUG();
+ return;
+}
+
+static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
+ unsigned int wq_mode, const int sync)
+{
+ struct list_head *tmp, *head;
+ struct task_struct *p, *best_exclusive;
+ unsigned long flags;
+ int best_cpu, irq;
+
+ if (!q)
+ goto out;
+
+ best_cpu = smp_processor_id();
+ irq = in_interrupt();
+ best_exclusive = NULL;
+ wq_write_lock_irqsave(&q->lock, flags);
+
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC_WQHEAD(q);
+#endif
+
+ head = &q->task_list;
+#if WAITQUEUE_DEBUG
+ if (!head->next || !head->prev)
+ WQ_BUG();
+#endif
+ tmp = head->next;
+ while (tmp != head) {
+ unsigned int state;
+ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+
+ tmp = tmp->next;
+
+#if WAITQUEUE_DEBUG
+ CHECK_MAGIC(curr->__magic);
+#endif
+ p = curr->task;
+ state = p->state;
+ if (state & mode) {
+#if WAITQUEUE_DEBUG
+ curr->__waker = (long)__builtin_return_address(0);
+#endif
+ /*
+ * If waking up from an interrupt context then
+ * prefer processes which are affine to this
+ * CPU.
+ */
+ if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) {
+ if (!best_exclusive)
+ best_exclusive = p;
+ if (p->processor == best_cpu) {
+ best_exclusive = p;
+ break;
+ }
+ } else {
+ if (sync)
+ wake_up_process_synchronous(p);
+ else
+ wake_up_process(p);
+ if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)
+ break;
+ }
+ }
+ }
+ if (best_exclusive) {
+ if (sync)
+ wake_up_process_synchronous(best_exclusive);
+ else
+ wake_up_process(best_exclusive);
+ }
+ wq_write_unlock_irqrestore(&q->lock, flags);
+out:
+ return;
+}
+
+void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
+{
+ __wake_up_common(q, mode, wq_mode, 0);
+}
+
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)
+{
+ __wake_up_common(q, mode, wq_mode, 1);
+}
+
+#define SLEEP_ON_VAR \
+ unsigned long flags; \
+ wait_queue_t wait; \
+ init_waitqueue_entry(&wait, current);
+
+#define SLEEP_ON_HEAD \
+ wq_write_lock_irqsave(&q->lock,flags); \
+ __add_wait_queue(q, &wait); \
+ wq_write_unlock(&q->lock);
+
+#define SLEEP_ON_TAIL \
+ wq_write_lock_irq(&q->lock); \
+ __remove_wait_queue(q, &wait); \
+ wq_write_unlock_irqrestore(&q->lock,flags);
+
+void interruptible_sleep_on(wait_queue_head_t *q)
+{
+ SLEEP_ON_VAR
+
+ current->state = TASK_INTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ schedule();
+ SLEEP_ON_TAIL
+}
+
+long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+ SLEEP_ON_VAR
+
+ current->state = TASK_INTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ timeout = schedule_timeout(timeout);
+ SLEEP_ON_TAIL
+
+ return timeout;
+}
+
+void sleep_on(wait_queue_head_t *q)
+{
+ SLEEP_ON_VAR
+
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ schedule();
+ SLEEP_ON_TAIL
+}
+
+long sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+ SLEEP_ON_VAR
+
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ SLEEP_ON_HEAD
+ timeout = schedule_timeout(timeout);
+ SLEEP_ON_TAIL
+
+ return timeout;
+}
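+
+/*
+ * Sketch of the classic pairing of the sleep_on helpers with __wake_up();
+ * my_wq and my_event are hypothetical names, and the sleep_on family is
+ * racy unless callers provide their own serialization.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(my_wq);
+static volatile int my_event;
+
+static void my_consumer(void)
+{
+ while (!my_event)
+ interruptible_sleep_on_timeout(&my_wq, HZ);
+}
+
+static void my_producer(void)
+{
+ my_event = 1;
+ wake_up(&my_wq); /* ends up in __wake_up() above */
+}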
+
+void scheduling_functions_end_here(void) { }
+
+#ifndef __alpha__
+
+/*
+ * This has been replaced by sys_setpriority. Maybe it should be
+ * moved into the arch dependent tree for those ports that require
+ * it for backward compatibility?
+ */
+
+asmlinkage long sys_nice(int increment)
+{
+ long newprio;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ if (increment < 0) {
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+ if (increment < -40)
+ increment = -40;
+ }
+ if (increment > 40)
+ increment = 40;
+
+ newprio = current->nice + increment;
+ if (newprio < -20)
+ newprio = -20;
+ if (newprio > 19)
+ newprio = 19;
+ current->nice = newprio;
+ return 0;
+}
+
+#endif
+
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+ struct task_struct *tsk = current;
+
+ if (pid)
+ tsk = find_task_by_pid(pid);
+ return tsk;
+}
+
+static int setscheduler(pid_t pid, int policy,
+ struct sched_param *param)
+{
+ struct sched_param lp;
+ struct task_struct *p;
+ int retval;
+
+ retval = -EINVAL;
+ if (!param || pid < 0)
+ goto out_nounlock;
+
+ retval = -EFAULT;
+ if (copy_from_user(&lp, param, sizeof(struct sched_param)))
+ goto out_nounlock;
+
+ /*
+ * We play safe to avoid deadlocks.
+ */
+ read_lock_irq(&tasklist_lock);
+ spin_lock(&runqueue_lock);
+
+ p = find_process_by_pid(pid);
+
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ if (policy < 0)
+ policy = p->policy;
+ else {
+ retval = -EINVAL;
+ if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_OTHER)
+ goto out_unlock;
+ }
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
+ * priority for SCHED_OTHER is 0.
+ */
+ retval = -EINVAL;
+ if (lp.sched_priority < 0 || lp.sched_priority > 99)
+ goto out_unlock;
+ if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
+ goto out_unlock;
+
+ retval = -EPERM;
+ if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+ !capable(CAP_SYS_NICE))
+ goto out_unlock;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_NICE))
+ goto out_unlock;
+
+ retval = 0;
+ p->policy = policy;
+ p->rt_priority = lp.sched_priority;
+ if (task_on_runqueue(p))
+ move_first_runqueue(p);
+
+ current->need_resched = 1;
+
+out_unlock:
+ spin_unlock(&runqueue_lock);
+ read_unlock_irq(&tasklist_lock);
+
+out_nounlock:
+ return retval;
+}
+
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+ struct sched_param *param)
+{
+ return setscheduler(pid, policy, param);
+}
+
+asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param)
+{
+ return setscheduler(pid, -1, param);
+}
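+
+/*
+ * User-space sketch of the interface implemented above (through the glibc
+ * wrapper): request SCHED_RR with a priority in the valid 1..99 range for
+ * the calling process (pid 0).  Purely illustrative.
+ *
+ *	#include <sched.h>
+ *
+ *	struct sched_param sp;
+ *	sp.sched_priority = 50;
+ *	if (sched_setscheduler(0, SCHED_RR, &sp) < 0)
+ *		return -1;
+ */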
+
+asmlinkage long sys_sched_getscheduler(pid_t pid)
+{
+ struct task_struct *p;
+ int retval;
+
+ retval = -EINVAL;
+ if (pid < 0)
+ goto out_nounlock;
+
+ retval = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (p)
+ retval = p->policy & ~SCHED_YIELD;
+ read_unlock(&tasklist_lock);
+
+out_nounlock:
+ return retval;
+}
+
+asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
+{
+ struct task_struct *p;
+ struct sched_param lp;
+ int retval;
+
+ retval = -EINVAL;
+ if (!param || pid < 0)
+ goto out_nounlock;
+
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+ lp.sched_priority = p->rt_priority;
+ read_unlock(&tasklist_lock);
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+ return retval;
+
+out_unlock:
+ read_unlock(&tasklist_lock);
+ return retval;
+}
+
+asmlinkage long sys_sched_yield(void)
+{
+ /*
+ * Trick. sched_yield() first counts the number of truly
+ * 'pending' runnable processes, then returns if it's
+ * only the current process. (This test does not have
+ * to be atomic.) In threaded applications this optimization
+ * gets triggered quite often.
+ */
+
+ int nr_pending = nr_running;
+
+#if CONFIG_SMP
+ int i;
+
+ // Subtract non-idle processes running on other CPUs.
+ for (i = 0; i < smp_num_cpus; i++)
+ if (aligned_data[i].schedule_data.curr != idle_task(i))
+ nr_pending--;
+#else
+ // on UP this process is on the runqueue as well
+ nr_pending--;
+#endif
+ if (nr_pending) {
+ /*
+ * This process can only be rescheduled by us,
+ * so this is safe without any locking.
+ */
+ if (current->policy == SCHED_OTHER)
+ current->policy |= SCHED_YIELD;
+ current->need_resched = 1;
+ }
+ return 0;
+}
+
+asmlinkage long sys_sched_get_priority_max(int policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 99;
+ break;
+ case SCHED_OTHER:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+asmlinkage long sys_sched_get_priority_min(int policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_OTHER:
+ ret = 0;
+ }
+ return ret;
+}
+
+asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
+{
+ struct timespec t;
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (pid < 0)
+ goto out_nounlock;
+
+ retval = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_process_by_pid(pid);
+ if (p)
+ jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
+ &t);
+ read_unlock(&tasklist_lock);
+ if (p)
+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+out_nounlock:
+ return retval;
+}
+
+static void show_task(struct task_struct * p)
+{
+ unsigned long free = 0;
+ int state;
+ static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" };
+
+ printk("%-8s ", p->comm);
+ state = p->state ? ffz(~p->state) + 1 : 0;
+ if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
+ printk(stat_nam[state]);
+ else
+ printk(" ");
+#if (BITS_PER_LONG == 32)
+ if (p == current)
+ printk(" current ");
+ else
+ printk(" %08lX ", thread_saved_pc(&p->thread));
+#else
+ if (p == current)
+ printk(" current task ");
+ else
+ printk(" %016lx ", thread_saved_pc(&p->thread));
+#endif
+ {
+ unsigned long * n = (unsigned long *) (p+1);
+ while (!*n)
+ n++;
+ free = (unsigned long) n - (unsigned long)(p+1);
+ }
+ printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
+ if (p->p_cptr)
+ printk("%5d ", p->p_cptr->pid);
+ else
+ printk(" ");
+ if (!p->mm)
+ printk(" (L-TLB) ");
+ else
+ printk(" (NOTLB) ");
+ if (p->p_ysptr)
+ printk("%7d", p->p_ysptr->pid);
+ else
+ printk(" ");
+ if (p->p_osptr)
+ printk(" %5d\n", p->p_osptr->pid);
+ else
+ printk("\n");
+
+#ifdef CONFIG_X86
+/* This is very useful, but only works on x86 right now */
+ {
+ extern void show_trace(unsigned long);
+ show_trace(p->thread.esp);
+ }
+#endif
+}
+
+char * render_sigset_t(sigset_t *set, char *buffer)
+{
+ int i = _NSIG, x;
+ do {
+ i -= 4, x = 0;
+ if (sigismember(set, i+1)) x |= 1;
+ if (sigismember(set, i+2)) x |= 2;
+ if (sigismember(set, i+3)) x |= 4;
+ if (sigismember(set, i+4)) x |= 8;
+ *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
+ } while (i >= 4);
+ *buffer = 0;
+ return buffer;
+}
+
+void show_state(void)
+{
+ struct task_struct *p;
+
+#if (BITS_PER_LONG == 32)
+ printk("\n"
+ " free sibling\n");
+ printk(" task PC stack pid father child younger older\n");
+#else
+ printk("\n"
+ " free sibling\n");
+ printk(" task PC stack pid father child younger older\n");
+#endif
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ show_task(p);
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Put all the gunge required to become a kernel thread without
+ * attached user resources in one place where it belongs.
+ */
+
+void daemonize(void)
+{
+ struct fs_struct *fs;
+
+
+ /*
+ * If we were started as a result of loading a module, close all of the
+ * user space pages. We don't need them, and if we didn't close them
+ * they would be locked into memory.
+ */
+ exit_mm(current);
+
+ current->session = 1;
+ current->pgrp = 1;
+
+ /* Become as one with the init task */
+
+ exit_fs(current); /* current->fs->count--; */
+ fs = init_task.fs;
+ current->fs = fs;
+ atomic_inc(&fs->count);
+ exit_files(current);
+ current->files = init_task.files;
+ atomic_inc(&current->files->count);
+}
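+
+/*
+ * Sketch of the usual pairing of daemonize() with kernel_thread(); the
+ * name my_worker is hypothetical.
+ */
+static int my_worker(void *unused)
+{
+ daemonize();
+ strcpy(current->comm, "my_worker");
+
+ while (!signal_pending(current)) {
+ /* ... do the deferred work, then nap for a second ... */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ);
+ }
+ return 0;
+}
+
+/* started from module/driver init code with kernel_thread(my_worker, NULL, 0) */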
+
+void __init init_idle(void)
+{
+ struct schedule_data * sched_data;
+ sched_data = &aligned_data[smp_processor_id()].schedule_data;
+
+ if (current != &init_task && task_on_runqueue(current)) {
+ printk("UGH! (%d:%d) was on the runqueue, removing.\n",
+ smp_processor_id(), current->pid);
+ del_from_runqueue(current);
+ }
+ sched_data->curr = current;
+ sched_data->last_schedule = get_cycles();
+}
+
+extern void init_timervecs (void);
+
+void __init sched_init(void)
+{
+ /*
+ * We have to do a little magic to get the first
+ * process right in SMP mode.
+ */
+ int cpu = smp_processor_id();
+ int nr;
+
+ init_task.processor = cpu;
+
+ for(nr = 0; nr < PIDHASH_SZ; nr++)
+ pidhash[nr] = NULL;
+
+ init_timervecs();
+
+ init_bh(TIMER_BH, timer_bh);
+ init_bh(TQUEUE_BH, tqueue_bh);
+ init_bh(IMMEDIATE_BH, immediate_bh);
+
+ /*
+ * The boot idle thread does lazy MMU switching as well:
+ */
+ atomic_inc(&init_mm.mm_count);
+ enter_lazy_tlb(&init_mm, current, cpu);
+}
diff --git a/kernel/signal.c b/kernel/signal.c
new file mode 100644
index 000000000000..db22b0057738
--- /dev/null
+++ b/kernel/signal.c
@@ -0,0 +1,1260 @@
+/*
+ * linux/kernel/signal.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson
+ */
+
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/unistd.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * SLAB caches for signal bits.
+ */
+
+#define DEBUG_SIG 0
+
+#if DEBUG_SIG
+#define SIG_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
+#else
+#define SIG_SLAB_DEBUG 0
+#endif
+
+static kmem_cache_t *sigqueue_cachep;
+
+atomic_t nr_queued_signals;
+int max_queued_signals = 1024;
+
+void __init signals_init(void)
+{
+ sigqueue_cachep =
+ kmem_cache_create("sigqueue",
+ sizeof(struct sigqueue),
+ __alignof__(struct sigqueue),
+ SIG_SLAB_DEBUG, NULL, NULL);
+ if (!sigqueue_cachep)
+ panic("signals_init(): cannot create sigqueue SLAB cache");
+}
+
+
+/* Given the mask, find the first available signal that should be serviced. */
+
+static int
+next_signal(struct task_struct *tsk, sigset_t *mask)
+{
+ unsigned long i, *s, *m, x;
+ int sig = 0;
+
+ s = tsk->pending.signal.sig;
+ m = mask->sig;
+ switch (_NSIG_WORDS) {
+ default:
+ for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m)
+ if ((x = *s &~ *m) != 0) {
+ sig = ffz(~x) + i*_NSIG_BPW + 1;
+ break;
+ }
+ break;
+
+ case 2: if ((x = s[0] &~ m[0]) != 0)
+ sig = 1;
+ else if ((x = s[1] &~ m[1]) != 0)
+ sig = _NSIG_BPW + 1;
+ else
+ break;
+ sig += ffz(~x);
+ break;
+
+ case 1: if ((x = *s &~ *m) != 0)
+ sig = ffz(~x) + 1;
+ break;
+ }
+
+ return sig;
+}
+
+static void flush_sigqueue(struct sigpending *queue)
+{
+ struct sigqueue *q, *n;
+
+ sigemptyset(&queue->signal);
+ q = queue->head;
+ queue->head = NULL;
+ queue->tail = &queue->head;
+
+ while (q) {
+ n = q->next;
+ kmem_cache_free(sigqueue_cachep, q);
+ atomic_dec(&nr_queued_signals);
+ q = n;
+ }
+}
+
+/*
+ * Flush all pending signals for a task.
+ */
+
+void
+flush_signals(struct task_struct *t)
+{
+ t->sigpending = 0;
+ flush_sigqueue(&t->pending);
+}
+
+void exit_sighand(struct task_struct *tsk)
+{
+ struct signal_struct * sig = tsk->sig;
+
+ spin_lock_irq(&tsk->sigmask_lock);
+ if (sig) {
+ tsk->sig = NULL;
+ if (atomic_dec_and_test(&sig->count))
+ kmem_cache_free(sigact_cachep, sig);
+ }
+ tsk->sigpending = 0;
+ flush_sigqueue(&tsk->pending);
+ spin_unlock_irq(&tsk->sigmask_lock);
+}
+
+/*
+ * Flush all handlers for a task.
+ */
+
+void
+flush_signal_handlers(struct task_struct *t)
+{
+ int i;
+ struct k_sigaction *ka = &t->sig->action[0];
+ for (i = _NSIG ; i != 0 ; i--) {
+ if (ka->sa.sa_handler != SIG_IGN)
+ ka->sa.sa_handler = SIG_DFL;
+ ka->sa.sa_flags = 0;
+ sigemptyset(&ka->sa.sa_mask);
+ ka++;
+ }
+}
+
+/* Notify the system that a driver wants to block all signals for this
+ * process, and wants to be notified if any signals at all were to be
+ * sent/acted upon. If the notifier routine returns non-zero, then the
+ * signal will be acted upon after all. If the notifier routine returns 0,
+ * then the signal will be blocked. Only one block per process is
+ * allowed. priv is a pointer to private data that the notifier routine
+ * can use to determine if the signal should be blocked or not. */
+
+void
+block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sigmask_lock, flags);
+ current->notifier_mask = mask;
+ current->notifier_data = priv;
+ current->notifier = notifier;
+ spin_unlock_irqrestore(&current->sigmask_lock, flags);
+}
+
+/* Notify the system that blocking has ended. */
+
+void
+unblock_all_signals(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sigmask_lock, flags);
+ current->notifier = NULL;
+ current->notifier_data = NULL;
+ recalc_sigpending(current);
+ spin_unlock_irqrestore(&current->sigmask_lock, flags);
+}
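+
+/*
+ * Sketch of the notifier interface above, as a driver might use it; the
+ * my_* names are hypothetical.  The notifier returns 0 to block the
+ * signal and non-zero to let it be acted upon.
+ */
+struct my_state {
+ int allow_signals;
+};
+
+static int my_signal_notifier(void *priv)
+{
+ struct my_state *s = priv;
+
+ return s->allow_signals;
+}
+
+static void my_critical_region(struct my_state *s, sigset_t *mask)
+{
+ block_all_signals(my_signal_notifier, s, mask);
+ /* ... section during which signals in *mask go through the notifier ... */
+ unblock_all_signals();
+}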
+
+static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+{
+ if (sigismember(&list->signal, sig)) {
+ /* Collect the siginfo appropriate to this signal. */
+ struct sigqueue *q, **pp;
+ pp = &list->head;
+ while ((q = *pp) != NULL) {
+ if (q->info.si_signo == sig)
+ goto found_it;
+ pp = &q->next;
+ }
+
+ /* Ok, it wasn't in the queue. We must have
+ been out of queue space. So zero out the
+ info. */
+ sigdelset(&list->signal, sig);
+ info->si_signo = sig;
+ info->si_errno = 0;
+ info->si_code = 0;
+ info->si_pid = 0;
+ info->si_uid = 0;
+ return 1;
+
+found_it:
+ if ((*pp = q->next) == NULL)
+ list->tail = pp;
+
+ /* Copy the sigqueue information and free the queue entry */
+ copy_siginfo(info, &q->info);
+ kmem_cache_free(sigqueue_cachep,q);
+ atomic_dec(&nr_queued_signals);
+
+ /* RT signals can be queued multiple times; keep the pending bit if more remain */
+ if (sig >= SIGRTMIN) {
+ while ((q = *pp) != NULL) {
+ if (q->info.si_signo == sig)
+ goto found_another;
+ pp = &q->next;
+ }
+ }
+
+ sigdelset(&list->signal, sig);
+found_another:
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Dequeue a signal and return the element to the caller, which is
+ * expected to free it.
+ *
+ * All callers must be holding current->sigmask_lock.
+ */
+
+int
+dequeue_signal(sigset_t *mask, siginfo_t *info)
+{
+ int sig = 0;
+
+#if DEBUG_SIG
+printk("SIG dequeue (%s:%d): %d ", current->comm, current->pid,
+ signal_pending(current));
+#endif
+
+ sig = next_signal(current, mask);
+ if (current->notifier) {
+ if (sigismember(current->notifier_mask, sig)) {
+ if (!(current->notifier)(current->notifier_data)) {
+ current->sigpending = 0;
+ return 0;
+ }
+ }
+ }
+
+ if (sig) {
+ if (!collect_signal(sig, &current->pending, info))
+ sig = 0;
+
+ /* XXX: Once POSIX.1b timers are in, if si_code == SI_TIMER,
+ we need to xchg out the timer overrun values. */
+ }
+ recalc_sigpending(current);
+
+#if DEBUG_SIG
+printk(" %d -> %d\n", signal_pending(current), sig);
+#endif
+
+ return sig;
+}
+
+static int rm_from_queue(int sig, struct sigpending *s)
+{
+ struct sigqueue *q, **pp;
+
+ if (!sigismember(&s->signal, sig))
+ return 0;
+
+ sigdelset(&s->signal, sig);
+
+ pp = &s->head;
+
+ while ((q = *pp) != NULL) {
+ if (q->info.si_signo == sig) {
+ if ((*pp = q->next) == NULL)
+ s->tail = pp;
+ kmem_cache_free(sigqueue_cachep,q);
+ atomic_dec(&nr_queued_signals);
+ continue;
+ }
+ pp = &q->next;
+ }
+ return 1;
+}
+
+/*
+ * Remove signal sig from t->pending.
+ * Returns 1 if sig was found.
+ *
+ * All callers must be holding t->sigmask_lock.
+ */
+static int rm_sig_from_queue(int sig, struct task_struct *t)
+{
+ return rm_from_queue(sig, &t->pending);
+}
+
+/*
+ * Bad permissions for sending the signal
+ */
+int bad_signal(int sig, struct siginfo *info, struct task_struct *t)
+{
+ return (!info || ((unsigned long)info != 1 && SI_FROMUSER(info)))
+ && ((sig != SIGCONT) || (current->session != t->session))
+ && (current->euid ^ t->suid) && (current->euid ^ t->uid)
+ && (current->uid ^ t->suid) && (current->uid ^ t->uid)
+ && !capable(CAP_KILL);
+}
+
+/*
+ * Signal type:
+ * < 0 : global action (kill - spread to all non-blocked threads)
+ * = 0 : ignored
+ * > 0 : wake up.
+ */
+static int signal_type(int sig, struct signal_struct *signals)
+{
+ unsigned long handler;
+
+ if (!signals)
+ return 0;
+
+ handler = (unsigned long) signals->action[sig-1].sa.sa_handler;
+ if (handler > 1)
+ return 1;
+
+ /* "Ignore" handler.. Illogical, but that has an implicit handler for SIGCHLD */
+ if (handler == 1)
+ return sig == SIGCHLD;
+
+ /* Default handler. Normally lethal, but.. */
+ switch (sig) {
+
+ /* Ignored */
+ case SIGCONT: case SIGWINCH:
+ case SIGCHLD: case SIGURG:
+ return 0;
+
+ /* Implicit behaviour */
+ case SIGTSTP: case SIGTTIN: case SIGTTOU:
+ return 1;
+
+ /* Implicit actions (kill or do special stuff) */
+ default:
+ return -1;
+ }
+}
+
+
+/*
+ * Determine whether a signal should be posted or not.
+ *
+ * Signals with SIG_IGN can be ignored, except for the
+ * special case of a SIGCHLD.
+ *
+ * Some signals with SIG_DFL default to a non-action.
+ */
+static int ignored_signal(int sig, struct task_struct *t)
+{
+ /* Don't ignore traced or blocked signals */
+ if ((t->ptrace & PT_PTRACED) || sigismember(&t->blocked, sig))
+ return 0;
+
+ return signal_type(sig, t->sig) == 0;
+}
+
+/*
+ * Handle TASK_STOPPED cases etc implicit behaviour
+ * of certain magical signals.
+ *
+ * SIGKILL gets spread out to every thread.
+ */
+static void handle_stop_signal(int sig, struct task_struct *t)
+{
+ switch (sig) {
+ case SIGKILL: case SIGCONT:
+ /* Wake up the process if stopped. */
+ if (t->state == TASK_STOPPED)
+ wake_up_process(t);
+ t->exit_code = 0;
+ rm_sig_from_queue(SIGSTOP, t);
+ rm_sig_from_queue(SIGTSTP, t);
+ rm_sig_from_queue(SIGTTOU, t);
+ rm_sig_from_queue(SIGTTIN, t);
+ break;
+
+ case SIGSTOP: case SIGTSTP:
+ case SIGTTIN: case SIGTTOU:
+ /* If we're stopping again, cancel SIGCONT */
+ rm_sig_from_queue(SIGCONT, t);
+ break;
+ }
+}
+
+static int send_signal(int sig, struct siginfo *info, struct sigpending *signals)
+{
+ struct sigqueue * q = NULL;
+
+ /* Real-time signals must be queued if sent by sigqueue, or
+ some other real-time mechanism. It is implementation
+ defined whether kill() does so. We attempt to do so, on
+ the principle of least surprise, but since kill is not
+ allowed to fail with EAGAIN when low on memory we just
+ make sure at least one signal gets delivered and don't
+ pass on the info struct. */
+
+ if (atomic_read(&nr_queued_signals) < max_queued_signals) {
+ q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
+ }
+
+ if (q) {
+ atomic_inc(&nr_queued_signals);
+ q->next = NULL;
+ *signals->tail = q;
+ signals->tail = &q->next;
+ switch ((unsigned long) info) {
+ case 0:
+ q->info.si_signo = sig;
+ q->info.si_errno = 0;
+ q->info.si_code = SI_USER;
+ q->info.si_pid = current->pid;
+ q->info.si_uid = current->uid;
+ break;
+ case 1:
+ q->info.si_signo = sig;
+ q->info.si_errno = 0;
+ q->info.si_code = SI_KERNEL;
+ q->info.si_pid = 0;
+ q->info.si_uid = 0;
+ break;
+ default:
+ copy_siginfo(&q->info, info);
+ break;
+ }
+ } else if (sig >= SIGRTMIN && info && (unsigned long)info != 1
+ && info->si_code != SI_USER) {
+ /*
+ * Queue overflow, abort. We may abort if the signal was rt
+ * and sent by user using something other than kill().
+ */
+ return -EAGAIN;
+ }
+
+ sigaddset(&signals->signal, sig);
+ return 0;
+}
+
+/*
+ * Tell a process that it has a new active signal..
+ *
+ * NOTE! we rely on the previous spin_lock to
+ * lock interrupts for us! We can only be called with
+ * "sigmask_lock" held, and the local interrupt must
+ * have been disabled when that got acquired!
+ *
+ * No need to set need_resched since signal event passing
+ * goes through ->blocked
+ */
+static inline void signal_wake_up(struct task_struct *t)
+{
+ t->sigpending = 1;
+
+ if (t->state & TASK_INTERRUPTIBLE) {
+ wake_up_process(t);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * If the task is running on a different CPU
+ * force a reschedule on the other CPU to make
+ * it notice the new signal quickly.
+ *
+ * The code below is a tad loose and might occasionally
+ * kick the wrong CPU if we catch the process in the
+ * process of changing - but no harm is done by that
+ * other than doing an extra (lightweight) IPI interrupt.
+ */
+ spin_lock(&runqueue_lock);
+ if (t->has_cpu && t->processor != smp_processor_id())
+ smp_send_reschedule(t->processor);
+ spin_unlock(&runqueue_lock);
+#endif /* CONFIG_SMP */
+}
+
+static int deliver_signal(int sig, struct siginfo *info, struct task_struct *t)
+{
+ int retval = send_signal(sig, info, &t->pending);
+
+ if (!retval && !sigismember(&t->blocked, sig))
+ signal_wake_up(t);
+
+ return retval;
+}
+
+int
+send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+ unsigned long flags;
+ int ret;
+
+
+#if DEBUG_SIG
+printk("SIG queue (%s:%d): %d ", t->comm, t->pid, sig);
+#endif
+
+ ret = -EINVAL;
+ if (sig < 0 || sig > _NSIG)
+ goto out_nolock;
+ /* The somewhat baroque permissions check... */
+ ret = -EPERM;
+ if (bad_signal(sig, info, t))
+ goto out_nolock;
+
+ /* The null signal is a permissions and process existence probe.
+ No signal is actually delivered. Same goes for zombies. */
+ ret = 0;
+ if (!sig || !t->sig)
+ goto out_nolock;
+
+ spin_lock_irqsave(&t->sigmask_lock, flags);
+ handle_stop_signal(sig, t);
+
+ /* Optimize away the signal, if it's a signal that can be
+ handled immediately (ie non-blocked and untraced) and
+ that is ignored (either explicitly or by default). */
+
+ if (ignored_signal(sig, t))
+ goto out;
+
+ /* Support queueing exactly one non-rt signal, so that we
+ can get more detailed information about the cause of
+ the signal. */
+ if (sig < SIGRTMIN && sigismember(&t->pending.signal, sig))
+ goto out;
+
+ ret = deliver_signal(sig, info, t);
+out:
+ spin_unlock_irqrestore(&t->sigmask_lock, flags);
+ if ((t->state & TASK_INTERRUPTIBLE) && signal_pending(t))
+ wake_up_process(t);
+out_nolock:
+#if DEBUG_SIG
+printk(" %d -> %d\n", signal_pending(t), ret);
+#endif
+
+ return ret;
+}
+
+/*
+ * Force a signal that the process can't ignore: if necessary
+ * we unblock the signal and change any SIG_IGN to SIG_DFL.
+ */
+
+int
+force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+ unsigned long int flags;
+
+ spin_lock_irqsave(&t->sigmask_lock, flags);
+ if (t->sig == NULL) {
+ spin_unlock_irqrestore(&t->sigmask_lock, flags);
+ return -ESRCH;
+ }
+
+ if (t->sig->action[sig-1].sa.sa_handler == SIG_IGN)
+ t->sig->action[sig-1].sa.sa_handler = SIG_DFL;
+ sigdelset(&t->blocked, sig);
+ recalc_sigpending(t);
+ spin_unlock_irqrestore(&t->sigmask_lock, flags);
+
+ return send_sig_info(sig, info, t);
+}
+
+/*
+ * kill_pg_info() sends a signal to a process group: this is what the tty
+ * control characters do (^C, ^Z etc)
+ */
+
+int
+kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+{
+ int retval = -EINVAL;
+ if (pgrp > 0) {
+ struct task_struct *p;
+
+ retval = -ESRCH;
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if (p->pgrp == pgrp) {
+ int err = send_sig_info(sig, info, p);
+ if (retval)
+ retval = err;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ }
+ return retval;
+}
+
+/*
+ * kill_sl_info() sends a signal to the session leader: this is used
+ * to send SIGHUP to the controlling process of a terminal when
+ * the connection is lost.
+ */
+
+int
+kill_sl_info(int sig, struct siginfo *info, pid_t sess)
+{
+ int retval = -EINVAL;
+ if (sess > 0) {
+ struct task_struct *p;
+
+ retval = -ESRCH;
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if (p->leader && p->session == sess) {
+ int err = send_sig_info(sig, info, p);
+ if (retval)
+ retval = err;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ }
+ return retval;
+}
+
+inline int
+kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+{
+ int error;
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ error = -ESRCH;
+ if (p)
+ error = send_sig_info(sig, info, p);
+ read_unlock(&tasklist_lock);
+ return error;
+}
+
+
+/*
+ * kill_something_info() interprets pid in interesting ways just like kill(2).
+ *
+ * POSIX specifies that kill(-1,sig) is unspecified, but what we have
+ * is probably wrong. Should make it like BSD or SYSV.
+ */
+
+static int kill_something_info(int sig, struct siginfo *info, int pid)
+{
+ if (!pid) {
+ return kill_pg_info(sig, info, current->pgrp);
+ } else if (pid == -1) {
+ int retval = 0, count = 0;
+ struct task_struct * p;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if (p->pid > 1 && p != current) {
+ int err = send_sig_info(sig, info, p);
+ ++count;
+ if (err != -EPERM)
+ retval = err;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ return count ? retval : -ESRCH;
+ } else if (pid < 0) {
+ return kill_pg_info(sig, info, -pid);
+ } else {
+ return kill_proc_info(sig, info, pid);
+ }
+}
+
+/*
+ * These are for backward compatibility with the rest of the kernel source.
+ */
+
+int
+send_sig(int sig, struct task_struct *p, int priv)
+{
+ return send_sig_info(sig, (void*)(long)(priv != 0), p);
+}
+
+void
+force_sig(int sig, struct task_struct *p)
+{
+ force_sig_info(sig, (void*)1L, p);
+}
+
+int
+kill_pg(pid_t pgrp, int sig, int priv)
+{
+ return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp);
+}
+
+int
+kill_sl(pid_t sess, int sig, int priv)
+{
+ return kill_sl_info(sig, (void *)(long)(priv != 0), sess);
+}
+
+int
+kill_proc(pid_t pid, int sig, int priv)
+{
+ return kill_proc_info(sig, (void *)(long)(priv != 0), pid);
+}
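+
+/*
+ * Sketch of how kernel code typically uses the wrappers above; the function
+ * and its arguments are illustrative.  A non-zero "priv" marks the signal
+ * as kernel-generated (SI_KERNEL), which also bypasses the bad_signal()
+ * permission check.
+ */
+static void my_nudge(struct task_struct *p, pid_t pgrp)
+{
+ send_sig(SIGHUP, p, 1); /* one task, kernel-private */
+ kill_pg(pgrp, SIGTERM, 1); /* a whole process group */
+}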
+
+/*
+ * Joy. Or not. Pthread wants us to wake up every thread
+ * in our parent group.
+ */
+static void wake_up_parent(struct task_struct *parent)
+{
+ struct task_struct *tsk = parent;
+
+ do {
+ wake_up_interruptible(&tsk->wait_chldexit);
+ tsk = next_thread(tsk);
+ } while (tsk != parent);
+}
+
+/*
+ * Let a parent know about a status change of a child.
+ */
+
+void do_notify_parent(struct task_struct *tsk, int sig)
+{
+ struct siginfo info;
+ int why, status;
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_pid = tsk->pid;
+ info.si_uid = tsk->uid;
+
+ /* FIXME: find out whether or not this is supposed to be c*time. */
+ info.si_utime = tsk->times.tms_utime;
+ info.si_stime = tsk->times.tms_stime;
+
+ status = tsk->exit_code & 0x7f;
+ why = SI_KERNEL; /* shouldn't happen */
+ switch (tsk->state) {
+ case TASK_STOPPED:
+ /* FIXME -- can we deduce CLD_TRAPPED or CLD_CONTINUED? */
+ if (tsk->ptrace & PT_PTRACED)
+ why = CLD_TRAPPED;
+ else
+ why = CLD_STOPPED;
+ break;
+
+ default:
+ if (tsk->exit_code & 0x80)
+ why = CLD_DUMPED;
+ else if (tsk->exit_code & 0x7f)
+ why = CLD_KILLED;
+ else {
+ why = CLD_EXITED;
+ status = tsk->exit_code >> 8;
+ }
+ break;
+ }
+ info.si_code = why;
+ info.si_status = status;
+
+ send_sig_info(sig, &info, tsk->p_pptr);
+ wake_up_parent(tsk->p_pptr);
+}
+
+
+/*
+ * We need the tasklist lock because it's the only
+ * thing that protects our "parent" pointer.
+ *
+ * exit.c calls "do_notify_parent()" directly, because
+ * it already has the tasklist lock.
+ */
+void
+notify_parent(struct task_struct *tsk, int sig)
+{
+ read_lock(&tasklist_lock);
+ do_notify_parent(tsk, sig);
+ read_unlock(&tasklist_lock);
+}
+
+EXPORT_SYMBOL(dequeue_signal);
+EXPORT_SYMBOL(flush_signals);
+EXPORT_SYMBOL(force_sig);
+EXPORT_SYMBOL(force_sig_info);
+EXPORT_SYMBOL(kill_pg);
+EXPORT_SYMBOL(kill_pg_info);
+EXPORT_SYMBOL(kill_proc);
+EXPORT_SYMBOL(kill_proc_info);
+EXPORT_SYMBOL(kill_sl);
+EXPORT_SYMBOL(kill_sl_info);
+EXPORT_SYMBOL(notify_parent);
+EXPORT_SYMBOL(recalc_sigpending);
+EXPORT_SYMBOL(send_sig);
+EXPORT_SYMBOL(send_sig_info);
+EXPORT_SYMBOL(block_all_signals);
+EXPORT_SYMBOL(unblock_all_signals);
+
+
+/*
+ * System call entry points.
+ */
+
+/*
+ * We don't need to get the kernel lock - this is all local to this
+ * particular thread.. (and that's good, because this is _heavily_
+ * used by various programs)
+ */
+
+asmlinkage long
+sys_rt_sigprocmask(int how, sigset_t *set, sigset_t *oset, size_t sigsetsize)
+{
+ int error = -EINVAL;
+ sigset_t old_set, new_set;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ goto out;
+
+ if (set) {
+ error = -EFAULT;
+ if (copy_from_user(&new_set, set, sizeof(*set)))
+ goto out;
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ spin_lock_irq(&current->sigmask_lock);
+ old_set = current->blocked;
+
+ error = 0;
+ switch (how) {
+ default:
+ error = -EINVAL;
+ break;
+ case SIG_BLOCK:
+ sigorsets(&new_set, &old_set, &new_set);
+ break;
+ case SIG_UNBLOCK:
+ signandsets(&new_set, &old_set, &new_set);
+ break;
+ case SIG_SETMASK:
+ break;
+ }
+
+ current->blocked = new_set;
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+ if (error)
+ goto out;
+ if (oset)
+ goto set_old;
+ } else if (oset) {
+ spin_lock_irq(&current->sigmask_lock);
+ old_set = current->blocked;
+ spin_unlock_irq(&current->sigmask_lock);
+
+ set_old:
+ error = -EFAULT;
+ if (copy_to_user(oset, &old_set, sizeof(*oset)))
+ goto out;
+ }
+ error = 0;
+out:
+ return error;
+}
+
+long do_sigpending(void *set, unsigned long sigsetsize)
+{
+ long error = -EINVAL;
+ sigset_t pending;
+
+ if (sigsetsize > sizeof(sigset_t))
+ goto out;
+
+ spin_lock_irq(&current->sigmask_lock);
+ sigandsets(&pending, &current->blocked, &current->pending.signal);
+ spin_unlock_irq(&current->sigmask_lock);
+
+ error = -EFAULT;
+ if (!copy_to_user(set, &pending, sigsetsize))
+ error = 0;
+out:
+ return error;
+}
+
+asmlinkage long
+sys_rt_sigpending(sigset_t *set, size_t sigsetsize)
+{
+ return do_sigpending(set, sigsetsize);
+}
+
+asmlinkage long
+sys_rt_sigtimedwait(const sigset_t *uthese, siginfo_t *uinfo,
+ const struct timespec *uts, size_t sigsetsize)
+{
+ int ret, sig;
+ sigset_t these;
+ struct timespec ts;
+ siginfo_t info;
+ long timeout = 0;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&these, uthese, sizeof(these)))
+ return -EFAULT;
+
+ /*
+ * Invert the set of allowed signals to get those we
+ * want to block.
+ */
+ sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ signotset(&these);
+
+ if (uts) {
+ if (copy_from_user(&ts, uts, sizeof(ts)))
+ return -EFAULT;
+ if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
+ || ts.tv_sec < 0)
+ return -EINVAL;
+ }
+
+ spin_lock_irq(&current->sigmask_lock);
+ sig = dequeue_signal(&these, &info);
+ if (!sig) {
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ if (uts)
+ timeout = (timespec_to_jiffies(&ts)
+ + (ts.tv_sec || ts.tv_nsec));
+
+ if (timeout) {
+ /* None ready -- temporarily unblock those we're
+ * interested in while we are sleeping, so that we'll
+ * be awakened when they arrive. */
+ sigset_t oldblocked = current->blocked;
+ sigandsets(&current->blocked, &current->blocked, &these);
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+
+ current->state = TASK_INTERRUPTIBLE;
+ timeout = schedule_timeout(timeout);
+
+ spin_lock_irq(&current->sigmask_lock);
+ sig = dequeue_signal(&these, &info);
+ current->blocked = oldblocked;
+ recalc_sigpending(current);
+ }
+ }
+ spin_unlock_irq(&current->sigmask_lock);
+
+ if (sig) {
+ ret = sig;
+ if (uinfo) {
+ if (copy_siginfo_to_user(uinfo, &info))
+ ret = -EFAULT;
+ }
+ } else {
+ ret = -EAGAIN;
+ if (timeout)
+ ret = -EINTR;
+ }
+
+ return ret;
+}
+
+asmlinkage long
+sys_kill(int pid, int sig)
+{
+ struct siginfo info;
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+ info.si_pid = current->pid;
+ info.si_uid = current->uid;
+
+ return kill_something_info(sig, &info, pid);
+}
+
+asmlinkage long
+sys_rt_sigqueueinfo(int pid, int sig, siginfo_t *uinfo)
+{
+ siginfo_t info;
+
+ if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+ return -EFAULT;
+
+ /* Not even root can pretend to send signals from the kernel.
+ Nor can they impersonate a kill(), which adds source info. */
+ if (info.si_code >= 0)
+ return -EPERM;
+ info.si_signo = sig;
+
+ /* POSIX.1b doesn't mention process groups. */
+ return kill_proc_info(sig, &info, pid);
+}
+
+int
+do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
+{
+ struct k_sigaction *k;
+
+ if (sig < 1 || sig > _NSIG ||
+ (act && (sig == SIGKILL || sig == SIGSTOP)))
+ return -EINVAL;
+
+ k = &current->sig->action[sig-1];
+
+ spin_lock(&current->sig->siglock);
+
+ if (oact)
+ *oact = *k;
+
+ if (act) {
+ *k = *act;
+ sigdelsetmask(&k->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ /*
+ * POSIX 3.3.1.3:
+ * "Setting a signal action to SIG_IGN for a signal that is
+ * pending shall cause the pending signal to be discarded,
+ * whether or not it is blocked."
+ *
+ * "Setting a signal action to SIG_DFL for a signal that is
+ * pending and whose default action is to ignore the signal
+ * (for example, SIGCHLD), shall cause the pending signal to
+ * be discarded, whether or not it is blocked"
+ *
+ * Note the silly behaviour of SIGCHLD: SIG_IGN means that the
+ * signal isn't actually ignored, but does automatic child
+ * reaping, while SIG_DFL is explicitly said by POSIX to force
+ * the signal to be ignored.
+ */
+
+ if (k->sa.sa_handler == SIG_IGN
+ || (k->sa.sa_handler == SIG_DFL
+ && (sig == SIGCONT ||
+ sig == SIGCHLD ||
+ sig == SIGWINCH))) {
+ spin_lock_irq(&current->sigmask_lock);
+ if (rm_sig_from_queue(sig, current))
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+ }
+ }
+
+ spin_unlock(&current->sig->siglock);
+ return 0;
+}
+
+int
+do_sigaltstack (const stack_t *uss, stack_t *uoss, unsigned long sp)
+{
+ stack_t oss;
+ int error;
+
+ if (uoss) {
+ oss.ss_sp = (void *) current->sas_ss_sp;
+ oss.ss_size = current->sas_ss_size;
+ oss.ss_flags = sas_ss_flags(sp);
+ }
+
+ if (uss) {
+ void *ss_sp;
+ size_t ss_size;
+ int ss_flags;
+
+ error = -EFAULT;
+ if (verify_area(VERIFY_READ, uss, sizeof(*uss))
+ || __get_user(ss_sp, &uss->ss_sp)
+ || __get_user(ss_flags, &uss->ss_flags)
+ || __get_user(ss_size, &uss->ss_size))
+ goto out;
+
+ error = -EPERM;
+ if (on_sig_stack (sp))
+ goto out;
+
+ error = -EINVAL;
+ /*
+ *
+ * Note - this code used to test ss_flags incorrectly.
+ * Old code may have been written using ss_flags==0
+ * to mean ss_flags==SS_ONSTACK (as this was the only
+ * way that worked), so this fix preserves that older
+ * mechanism.
+ */
+ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
+ goto out;
+
+ if (ss_flags == SS_DISABLE) {
+ ss_size = 0;
+ ss_sp = NULL;
+ } else {
+ error = -ENOMEM;
+ if (ss_size < MINSIGSTKSZ)
+ goto out;
+ }
+
+ current->sas_ss_sp = (unsigned long) ss_sp;
+ current->sas_ss_size = ss_size;
+ }
+
+ if (uoss) {
+ error = -EFAULT;
+ if (copy_to_user(uoss, &oss, sizeof(oss)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+asmlinkage long
+sys_sigpending(old_sigset_t *set)
+{
+ return do_sigpending(set, sizeof(*set));
+}
+
+#if !defined(__alpha__)
+/* Alpha has its own versions with special arguments. */
+
+asmlinkage long
+sys_sigprocmask(int how, old_sigset_t *set, old_sigset_t *oset)
+{
+ int error;
+ old_sigset_t old_set, new_set;
+
+ if (set) {
+ error = -EFAULT;
+ if (copy_from_user(&new_set, set, sizeof(*set)))
+ goto out;
+ new_set &= ~(sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ spin_lock_irq(&current->sigmask_lock);
+ old_set = current->blocked.sig[0];
+
+ error = 0;
+ switch (how) {
+ default:
+ error = -EINVAL;
+ break;
+ case SIG_BLOCK:
+ sigaddsetmask(&current->blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&current->blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ current->blocked.sig[0] = new_set;
+ break;
+ }
+
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+ if (error)
+ goto out;
+ if (oset)
+ goto set_old;
+ } else if (oset) {
+ old_set = current->blocked.sig[0];
+ set_old:
+ error = -EFAULT;
+ if (copy_to_user(oset, &old_set, sizeof(*oset)))
+ goto out;
+ }
+ error = 0;
+out:
+ return error;
+}
+
+#ifndef __sparc__
+asmlinkage long
+sys_rt_sigaction(int sig, const struct sigaction *act, struct sigaction *oact,
+ size_t sigsetsize)
+{
+ struct k_sigaction new_sa, old_sa;
+ int ret = -EINVAL;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ goto out;
+
+ if (act) {
+ if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
+ return -EFAULT;
+ }
+
+ ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
+
+ if (!ret && oact) {
+ if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
+ return -EFAULT;
+ }
+out:
+ return ret;
+}
+#endif /* __sparc__ */
+#endif
+
+#if !defined(__alpha__) && !defined(__ia64__)
+/*
+ * For backwards compatibility. Functionality superseded by sigprocmask.
+ */
+asmlinkage long
+sys_sgetmask(void)
+{
+ /* SMP safe */
+ return current->blocked.sig[0];
+}
+
+asmlinkage long
+sys_ssetmask(int newmask)
+{
+ int old;
+
+ spin_lock_irq(&current->sigmask_lock);
+ old = current->blocked.sig[0];
+
+ siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)|
+ sigmask(SIGSTOP)));
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+
+ return old;
+}
+#endif /* !defined(__alpha__) */
+
+#if !defined(__alpha__) && !defined(__ia64__) && !defined(__mips__)
+/*
+ * For backwards compatibility. Functionality superseded by sigaction.
+ */
+asmlinkage unsigned long
+sys_signal(int sig, __sighandler_t handler)
+{
+ struct k_sigaction new_sa, old_sa;
+ int ret;
+
+ new_sa.sa.sa_handler = handler;
+ new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
+
+ ret = do_sigaction(sig, &new_sa, &old_sa);
+
+ return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
+}
+#endif /* !alpha && !__ia64__ && !defined(__mips__) */
diff --git a/kernel/softirq.c b/kernel/softirq.c
new file mode 100644
index 000000000000..fe066399dafa
--- /dev/null
+++ b/kernel/softirq.c
@@ -0,0 +1,317 @@
+/*
+ * linux/kernel/softirq.c
+ *
+ * Copyright (C) 1992 Linus Torvalds
+ *
+ * Fixed a disable_bh()/enable_bh() race (was causing a console lockup)
+ * due to non-atomic handling of bh_mask_count. Copyright (C) 1998 Andrea Arcangeli
+ *
+ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/tqueue.h>
+
+/*
+ - No shared variables, all the data are CPU local.
+ - If a softirq needs serialization, let it serialize itself
+ by its own spinlocks.
+ - Even if a softirq is serialized, only the local cpu is marked for
+   execution, so we get a sort of weak cpu binding. It is still not
+   clear whether this results in better locality or not.
+ - These softirqs are not masked by global cli() and start_bh_atomic()
+   (for obvious reasons). Hence, old parts of the code still using global
+   locks MUST NOT use softirqs directly, but must insert interfacing
+   routines that acquire the global locks. See the BH implementation for
+   an example.
+
+ Examples:
+ - NET RX softirq. It is multithreaded and does not require
+ any global serialization.
+ - NET TX softirq. It kicks software netdevice queues, hence
+ it is logically serialized per device, but this serialization
+ is invisible to common code.
+ - Tasklets: serialized with respect to themselves.
+ - Bottom halves: globally serialized, grr...
+ */
+
+/* No separate irq_stat for s390, it is part of PSA */
+#if !defined(CONFIG_ARCH_S390)
+irq_cpustat_t irq_stat[NR_CPUS];
+#endif /* CONFIG_ARCH_S390 */
+
+static struct softirq_action softirq_vec[32] __cacheline_aligned;
+
+asmlinkage void do_softirq()
+{
+ int cpu = smp_processor_id();
+ __u32 active, mask;
+
+ if (in_interrupt())
+ return;
+
+ local_bh_disable();
+
+ local_irq_disable();
+ mask = softirq_mask(cpu);
+ active = softirq_active(cpu) & mask;
+
+ if (active) {
+ struct softirq_action *h;
+
+restart:
+ /* Reset active bitmask before enabling irqs */
+ softirq_active(cpu) &= ~active;
+
+ local_irq_enable();
+
+ h = softirq_vec;
+ mask &= ~active;
+
+ do {
+ if (active & 1)
+ h->action(h);
+ h++;
+ active >>= 1;
+ } while (active);
+
+ local_irq_disable();
+
+ active = softirq_active(cpu);
+ if ((active &= mask) != 0)
+ goto retry;
+ }
+
+ local_bh_enable();
+
+ /* Leave with hard irqs locally disabled. It is critical to close
+ * the window for infinite recursion: while we held the local bh count
+ * it protected us; now we are defenceless.
+ */
+ return;
+
+retry:
+ goto restart;
+}
+
+
+static spinlock_t softirq_mask_lock = SPIN_LOCK_UNLOCKED;
+
+void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
+{
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&softirq_mask_lock, flags);
+ softirq_vec[nr].data = data;
+ softirq_vec[nr].action = action;
+
+ for (i=0; i<NR_CPUS; i++)
+ softirq_mask(i) |= (1<<nr);
+ spin_unlock_irqrestore(&softirq_mask_lock, flags);
+}
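+
+/*
+ * Sketch of registering and raising a softirq with the interface above.
+ * MY_SOFTIRQ and my_softirq_handler are hypothetical; real softirq numbers
+ * come from the fixed HI_SOFTIRQ/NET_*_SOFTIRQ/TASKLET_SOFTIRQ enum.
+ */
+static void my_softirq_handler(struct softirq_action *h)
+{
+ /* runs with hard irqs enabled; may run concurrently on other CPUs,
+  so serialize with your own spinlocks if needed */
+}
+
+static void my_softirq_setup(void)
+{
+ open_softirq(MY_SOFTIRQ, my_softirq_handler, NULL);
+ __cpu_raise_softirq(smp_processor_id(), MY_SOFTIRQ); /* mark it pending */
+}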
+
+
+/* Tasklets */
+
+struct tasklet_head tasklet_vec[NR_CPUS] __cacheline_aligned;
+
+static void tasklet_action(struct softirq_action *a)
+{
+ int cpu = smp_processor_id();
+ struct tasklet_struct *list;
+
+ local_irq_disable();
+ list = tasklet_vec[cpu].list;
+ tasklet_vec[cpu].list = NULL;
+ local_irq_enable();
+
+ while (list != NULL) {
+ struct tasklet_struct *t = list;
+
+ list = list->next;
+
+ if (tasklet_trylock(t)) {
+ if (atomic_read(&t->count) == 0) {
+ clear_bit(TASKLET_STATE_SCHED, &t->state);
+
+ t->func(t->data);
+ /*
+ * tasklet_trylock() uses test_and_set_bit, which implies
+ * an mb when it returns zero, thus we need the explicit
+ * mb only here, when closing the critical section.
+ */
+#ifdef CONFIG_SMP
+ smp_mb__before_clear_bit();
+#endif
+ tasklet_unlock(t);
+ continue;
+ }
+ tasklet_unlock(t);
+ }
+ local_irq_disable();
+ t->next = tasklet_vec[cpu].list;
+ tasklet_vec[cpu].list = t;
+ __cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
+ local_irq_enable();
+ }
+}
+
+
+
+struct tasklet_head tasklet_hi_vec[NR_CPUS] __cacheline_aligned;
+
+static void tasklet_hi_action(struct softirq_action *a)
+{
+ int cpu = smp_processor_id();
+ struct tasklet_struct *list;
+
+ local_irq_disable();
+ list = tasklet_hi_vec[cpu].list;
+ tasklet_hi_vec[cpu].list = NULL;
+ local_irq_enable();
+
+ while (list != NULL) {
+ struct tasklet_struct *t = list;
+
+ list = list->next;
+
+ if (tasklet_trylock(t)) {
+ if (atomic_read(&t->count) == 0) {
+ clear_bit(TASKLET_STATE_SCHED, &t->state);
+
+ t->func(t->data);
+ tasklet_unlock(t);
+ continue;
+ }
+ tasklet_unlock(t);
+ }
+ local_irq_disable();
+ t->next = tasklet_hi_vec[cpu].list;
+ tasklet_hi_vec[cpu].list = t;
+ __cpu_raise_softirq(cpu, HI_SOFTIRQ);
+ local_irq_enable();
+ }
+}
+
+
+void tasklet_init(struct tasklet_struct *t,
+ void (*func)(unsigned long), unsigned long data)
+{
+ t->func = func;
+ t->data = data;
+ t->state = 0;
+ atomic_set(&t->count, 0);
+}
+
+void tasklet_kill(struct tasklet_struct *t)
+{
+ if (in_interrupt())
+ printk("Attempt to kill tasklet from interrupt\n");
+
+ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
+ current->state = TASK_RUNNING;
+ do {
+ current->policy |= SCHED_YIELD;
+ schedule();
+ } while (test_bit(TASKLET_STATE_SCHED, &t->state));
+ }
+ tasklet_unlock_wait(t);
+ clear_bit(TASKLET_STATE_SCHED, &t->state);
+}
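+
+/*
+ * Sketch of typical tasklet usage built on the helpers above; my_tasklet
+ * and my_tasklet_func are hypothetical.  tasklet_schedule() (declared in
+ * <linux/interrupt.h>) is what feeds tasklet_action().
+ */
+static void my_tasklet_func(unsigned long data)
+{
+ /* deferred work, runs in softirq context */
+}
+
+static struct tasklet_struct my_tasklet;
+
+static void my_init(void)
+{
+ tasklet_init(&my_tasklet, my_tasklet_func, 0);
+}
+
+static void my_interrupt_handler(void)
+{
+ tasklet_schedule(&my_tasklet); /* queues onto tasklet_vec[] above */
+}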
+
+
+
+/* Old style BHs */
+
+static void (*bh_base[32])(void);
+struct tasklet_struct bh_task_vec[32];
+
+/* BHs are serialized by the spinlock global_bh_lock.
+
+   It would still be possible to implement synchronize_bh() as
+   spin_unlock_wait(&global_bh_lock). That operation is not used
+   by the kernel at the moment, so the only reason this lock is
+   not made private is wait_on_irq().
+
+   It can be removed only after auditing all the BHs.
+ */
+spinlock_t global_bh_lock = SPIN_LOCK_UNLOCKED;
+
+static void bh_action(unsigned long nr)
+{
+ int cpu = smp_processor_id();
+
+ if (!spin_trylock(&global_bh_lock))
+ goto resched;
+
+ if (!hardirq_trylock(cpu))
+ goto resched_unlock;
+
+ if (bh_base[nr])
+ bh_base[nr]();
+
+ hardirq_endlock(cpu);
+ spin_unlock(&global_bh_lock);
+ return;
+
+resched_unlock:
+ spin_unlock(&global_bh_lock);
+resched:
+ mark_bh(nr);
+}
+
+void init_bh(int nr, void (*routine)(void))
+{
+ bh_base[nr] = routine;
+ mb();
+}
+
+void remove_bh(int nr)
+{
+ tasklet_kill(bh_task_vec+nr);
+ bh_base[nr] = NULL;
+}
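+
+/*
+ * Sketch of the old-style BH interface above.  FOO_BH stands for one of
+ * the fixed slots in the BH enum (illustrative); my_bh_routine is
+ * hypothetical.
+ */
+static void my_bh_routine(void)
+{
+ /* runs globally serialized under global_bh_lock */
+}
+
+/*
+ * setup:    init_bh(FOO_BH, my_bh_routine);
+ * trigger:  mark_bh(FOO_BH);        (typically from an interrupt handler)
+ * teardown: remove_bh(FOO_BH);
+ */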
+
+void __init softirq_init()
+{
+ int i;
+
+ for (i=0; i<32; i++)
+ tasklet_init(bh_task_vec+i, bh_action, i);
+
+ open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
+ open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
+}
+
+void __run_task_queue(task_queue *list)
+{
+ struct list_head head, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tqueue_lock, flags);
+ list_add(&head, list);
+ list_del_init(list);
+ spin_unlock_irqrestore(&tqueue_lock, flags);
+
+ next = head.next;
+ while (next != &head) {
+ void (*f) (void *);
+ struct tq_struct *p;
+ void *data;
+
+ p = list_entry(next, struct tq_struct, list);
+ next = next->next;
+ f = p->routine;
+ data = p->data;
+ wmb();
+ p->sync = 0;
+ if (f)
+ f(data);
+ }
+}
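+
+/*
+ * Sketch of the producer side that pairs with __run_task_queue(); the
+ * my_* names are hypothetical.  tq_immediate plus mark_bh(IMMEDIATE_BH)
+ * is the long-standing combination.
+ */
+static void my_deferred_routine(void *data)
+{
+ /* deferred work */
+}
+
+static struct tq_struct my_task = { routine: my_deferred_routine };
+
+static void my_defer_work(void)
+{
+ queue_task(&my_task, &tq_immediate);
+ mark_bh(IMMEDIATE_BH); /* the IMMEDIATE_BH handler drains the queue */
+}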
diff --git a/kernel/sys.c b/kernel/sys.c
new file mode 100644
index 000000000000..38eb5dee9ce8
--- /dev/null
+++ b/kernel/sys.c
@@ -0,0 +1,1219 @@
+/*
+ * linux/kernel/sys.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/utsname.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+/*
+ * this is where the system-wide overflow UID and GID are defined, for
+ * architectures that now have 32-bit UID/GID but didn't in the past
+ */
+
+int overflowuid = DEFAULT_OVERFLOWUID;
+int overflowgid = DEFAULT_OVERFLOWGID;
+
+/*
+ * the same as above, but for filesystems which can only store a 16-bit
+ * UID and GID. as such, this is needed on all architectures
+ */
+
+int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
+int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
+
+/*
+ * this indicates whether you can reboot with ctrl-alt-del: the default is yes
+ */
+
+int C_A_D = 1;
+
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * at shutdown. This is used to stop any idling DMA operations
+ * and the like.
+ */
+
+static struct notifier_block *reboot_notifier_list;
+rwlock_t notifier_lock = RW_LOCK_UNLOCKED;
+
+/**
+ * notifier_chain_register - Add notifier to a notifier chain
+ * @list: Pointer to root list pointer
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a notifier chain.
+ *
+ * Currently always returns zero.
+ */
+
+int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
+{
+ write_lock(&notifier_lock);
+ while(*list)
+ {
+ if(n->priority > (*list)->priority)
+ break;
+ list= &((*list)->next);
+ }
+ n->next = *list;
+ *list=n;
+ write_unlock(&notifier_lock);
+ return 0;
+}
+
+/**
+ * notifier_chain_unregister - Remove notifier from a notifier chain
+ * @nl: Pointer to root list pointer
+ * @n: New entry in notifier chain
+ *
+ * Removes a notifier from a notifier chain.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+
+int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n)
+{
+ write_lock(&notifier_lock);
+ while((*nl)!=NULL)
+ {
+ if((*nl)==n)
+ {
+ *nl=n->next;
+ write_unlock(&notifier_lock);
+ return 0;
+ }
+ nl=&((*nl)->next);
+ }
+ write_unlock(&notifier_lock);
+ return -ENOENT;
+}
+
+/**
+ * notifier_call_chain - Call functions in a notifier chain
+ * @n: Pointer to root pointer of notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in a notifier chain in turn.
+ *
+ * If the return value of the notifier can be and'd
+ * with %NOTIFY_STOP_MASK, then notifier_call_chain
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise, the return value is the return value
+ * of the last notifier function called.
+ */
+
+int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
+{
+ int ret=NOTIFY_DONE;
+ struct notifier_block *nb = *n;
+
+ while(nb)
+ {
+ ret=nb->notifier_call(nb,val,v);
+ if(ret&NOTIFY_STOP_MASK)
+ {
+ return ret;
+ }
+ nb=nb->next;
+ }
+ return ret;
+}
+
+/**
+ * register_reboot_notifier - Register function to be called at reboot time
+ * @nb: Info about notifier function to be called
+ *
+ * Registers a function with the list of functions
+ * to be called at reboot time.
+ *
+ * Currently always returns zero, as notifier_chain_register
+ * always returns zero.
+ */
+
+int register_reboot_notifier(struct notifier_block * nb)
+{
+ return notifier_chain_register(&reboot_notifier_list, nb);
+}
+
+/**
+ * unregister_reboot_notifier - Unregister previously registered reboot notifier
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered reboot
+ * notifier function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+
+int unregister_reboot_notifier(struct notifier_block * nb)
+{
+ return notifier_chain_unregister(&reboot_notifier_list, nb);
+}
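+
+/*
+ * Sketch of a reboot notifier built on the chain helpers above;
+ * my_reboot_event and my_reboot_nb are hypothetical.
+ */
+static int my_reboot_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ /* event is SYS_RESTART, SYS_HALT or SYS_POWER_OFF */
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block my_reboot_nb = {
+ notifier_call: my_reboot_event,
+};
+
+/* registered once at init time with register_reboot_notifier(&my_reboot_nb) */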
+
+asmlinkage long sys_ni_syscall(void)
+{
+ return -ENOSYS;
+}
+
+static int proc_sel(struct task_struct *p, int which, int who)
+{
+ if(p->pid)
+ {
+ switch (which) {
+ case PRIO_PROCESS:
+ if (!who && p == current)
+ return 1;
+ return(p->pid == who);
+ case PRIO_PGRP:
+ if (!who)
+ who = current->pgrp;
+ return(p->pgrp == who);
+ case PRIO_USER:
+ if (!who)
+ who = current->uid;
+ return(p->uid == who);
+ }
+ }
+ return 0;
+}
+
+asmlinkage long sys_setpriority(int which, int who, int niceval)
+{
+ struct task_struct *p;
+ int error;
+
+ if (which > 2 || which < 0)
+ return -EINVAL;
+
+ /* normalize: avoid signed division (rounding problems) */
+ error = -ESRCH;
+ if (niceval < -20)
+ niceval = -20;
+ if (niceval > 19)
+ niceval = 19;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if (!proc_sel(p, which, who))
+ continue;
+ if (p->uid != current->euid &&
+ p->uid != current->uid && !capable(CAP_SYS_NICE)) {
+ error = -EPERM;
+ continue;
+ }
+ if (error == -ESRCH)
+ error = 0;
+ if (niceval < p->nice && !capable(CAP_SYS_NICE))
+ error = -EACCES;
+ else
+ p->nice = niceval;
+ }
+ read_unlock(&tasklist_lock);
+
+ return error;
+}
+
+/*
+ * Ugh. To avoid negative return values, "getpriority()" will
+ * not return the normal nice-value, but a negated value that
+ * has been offset by 20 (ie it returns 40..1 instead of -20..19)
+ * to stay compatible.
+ */
+asmlinkage long sys_getpriority(int which, int who)
+{
+ struct task_struct *p;
+ long retval = -ESRCH;
+
+ if (which > 2 || which < 0)
+ return -EINVAL;
+
+ read_lock(&tasklist_lock);
+ for_each_task (p) {
+ long niceval;
+ if (!proc_sel(p, which, who))
+ continue;
+ niceval = 20 - p->nice;
+ if (niceval > retval)
+ retval = niceval;
+ }
+ read_unlock(&tasklist_lock);
+
+ return retval;
+}
+
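+/*
+ * For illustration, a user-level caller that wants the conventional nice
+ * value back has to undo the offset itself (the C library wrapper is
+ * normally expected to do this):
+ *
+ *	long ret = sys_getpriority(PRIO_PROCESS, 0);
+ *	int nice = 20 - ret;		// maps 40..1 back to -20..19
+ */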
+
+/*
+ * Reboot system call: for obvious reasons only root may call it,
+ * and even root needs to set up some magic numbers in the registers
+ * so that some mistake won't make this reboot the whole machine.
+ * You can also set the meaning of the ctrl-alt-del-key here.
+ *
+ * reboot doesn't sync: do that yourself before calling this.
+ */
+asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void * arg)
+{
+ char buffer[256];
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ /* For safety, we require "magic" arguments. */
+ if (magic1 != LINUX_REBOOT_MAGIC1 ||
+ (magic2 != LINUX_REBOOT_MAGIC2 && magic2 != LINUX_REBOOT_MAGIC2A &&
+ magic2 != LINUX_REBOOT_MAGIC2B))
+ return -EINVAL;
+
+ lock_kernel();
+ switch (cmd) {
+ case LINUX_REBOOT_CMD_RESTART:
+ notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+ printk(KERN_EMERG "Restarting system.\n");
+ machine_restart(NULL);
+ break;
+
+ case LINUX_REBOOT_CMD_CAD_ON:
+ C_A_D = 1;
+ break;
+
+ case LINUX_REBOOT_CMD_CAD_OFF:
+ C_A_D = 0;
+ break;
+
+ case LINUX_REBOOT_CMD_HALT:
+ notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
+ printk(KERN_EMERG "System halted.\n");
+ machine_halt();
+ do_exit(0);
+ break;
+
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
+ printk(KERN_EMERG "Power down.\n");
+ machine_power_off();
+ do_exit(0);
+ break;
+
+ case LINUX_REBOOT_CMD_RESTART2:
+ if (strncpy_from_user(&buffer[0], (char *)arg, sizeof(buffer) - 1) < 0) {
+ unlock_kernel();
+ return -EFAULT;
+ }
+ buffer[sizeof(buffer) - 1] = '\0';
+
+ notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
+ printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
+ machine_restart(buffer);
+ break;
+
+ default:
+ unlock_kernel();
+ return -EINVAL;
+ }
+ unlock_kernel();
+ return 0;
+}
+
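+/*
+ * A sketch of the calling convention from user level (the LINUX_REBOOT_*
+ * constants come from <linux/reboot.h>; how the C library exposes the
+ * call may vary). E.g. to turn off ctrl-alt-del handling:
+ *
+ *	sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
+ *		   LINUX_REBOOT_CMD_CAD_OFF, NULL);
+ *
+ * Without CAP_SYS_BOOT, or with wrong magic numbers, the call fails with
+ * -EPERM or -EINVAL as enforced above.
+ */
+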
+/*
+ * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
+ * As it's called within an interrupt, it may NOT sync: the only choice
+ * is whether to reboot at once, or just ignore the ctrl-alt-del.
+ */
+void ctrl_alt_del(void)
+{
+ if (C_A_D) {
+ notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+ machine_restart(NULL);
+ } else
+ kill_proc(1, SIGINT, 1);
+}
+
+
+/*
+ * Unprivileged users may change the real gid to the effective gid
+ * or vice versa. (BSD-style)
+ *
+ * If you set the real gid at all, or set the effective gid to a value not
+ * equal to the real gid, then the saved gid is set to the new effective gid.
+ *
+ * This makes it possible for a setgid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit of a program.
+ *
+ * The general idea is that a program which uses just setregid() will be
+ * 100% compatible with BSD. A program which uses just setgid() will be
+ * 100% compatible with POSIX with saved IDs.
+ *
+ * SMP: There are no races; the GIDs are checked only by filesystem
+ * operations (as far as semantic preservation is concerned).
+ */
+asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
+{
+ int old_rgid = current->gid;
+ int old_egid = current->egid;
+
+ if (rgid != (gid_t) -1) {
+ if ((old_rgid == rgid) ||
+ (current->egid==rgid) ||
+ capable(CAP_SETGID))
+ current->gid = rgid;
+ else
+ return -EPERM;
+ }
+ if (egid != (gid_t) -1) {
+ if ((old_rgid == egid) ||
+ (current->egid == egid) ||
+ (current->sgid == egid) ||
+ capable(CAP_SETGID))
+ current->fsgid = current->egid = egid;
+ else {
+ current->gid = old_rgid;
+ return -EPERM;
+ }
+ }
+ if (rgid != (gid_t) -1 ||
+ (egid != (gid_t) -1 && egid != old_rgid))
+ current->sgid = current->egid;
+ current->fsgid = current->egid;
+ if (current->egid != old_egid)
+ current->dumpable = 0;
+ return 0;
+}
+
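+/*
+ * Illustrative sketch of the "completely drop its privileges" case above
+ * for a setgid program: setting both the real and the effective gid also
+ * overwrites the saved gid, so the privileged group cannot be regained:
+ *
+ *	gid_t rgid = getgid();
+ *	setregid(rgid, rgid);		// rgid == egid == sgid from now on
+ */
+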
+/*
+ * setgid() is implemented like SysV w/ SAVED_IDS
+ *
+ * SMP: Same implicit races as above.
+ */
+asmlinkage long sys_setgid(gid_t gid)
+{
+ int old_egid = current->egid;
+
+ if (capable(CAP_SETGID))
+ current->gid = current->egid = current->sgid = current->fsgid = gid;
+ else if ((gid == current->gid) || (gid == current->sgid))
+ current->egid = current->fsgid = gid;
+ else
+ return -EPERM;
+
+ if (current->egid != old_egid)
+ current->dumpable = 0;
+ return 0;
+}
+
+/*
+ * cap_emulate_setxuid() fixes the effective / permitted capabilities of
+ * a process after a call to setuid, setreuid, or setresuid.
+ *
+ * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
+ * {r,e,s}uid != 0, the permitted and effective capabilities are
+ * cleared.
+ *
+ * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
+ * capabilities of the process are cleared.
+ *
+ * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
+ * capabilities are set to the permitted capabilities.
+ *
+ * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid != 0 should
+ * never happen.
+ *
+ * -astor
+ *
+ * cevans - New behaviour, Oct '99
+ * A process may, via prctl(), elect to keep its capabilities when it
+ * calls setuid() and switches away from uid==0. Both permitted and
+ * effective sets will be retained.
+ * Without this change, it was impossible for a daemon to drop only some
+ * of its privilege. The call to setuid(!=0) would drop all privileges!
+ * Keeping uid 0 is not an option because uid 0 owns too many vital
+ * files..
+ * Thanks to Olaf Kirch and Peter Benie for spotting this.
+ */
+extern inline void cap_emulate_setxuid(int old_ruid, int old_euid,
+ int old_suid)
+{
+ if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) &&
+ (current->uid != 0 && current->euid != 0 && current->suid != 0) &&
+ !current->keep_capabilities) {
+ cap_clear(current->cap_permitted);
+ cap_clear(current->cap_effective);
+ }
+ if (old_euid == 0 && current->euid != 0) {
+ cap_clear(current->cap_effective);
+ }
+ if (old_euid != 0 && current->euid == 0) {
+ current->cap_effective = current->cap_permitted;
+ }
+}
+
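+/*
+ * Sketch of the behaviour described above (daemon_uid is a placeholder
+ * for whatever unprivileged uid the daemon runs as):
+ *
+ *	prctl(PR_SET_KEEPCAPS, 1);	// skip the clearing in rule 1
+ *	setuid(daemon_uid);
+ *	// ...then drop the capabilities that are not needed via capset()
+ */
+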
+static int set_user(uid_t new_ruid)
+{
+ struct user_struct *new_user, *old_user;
+
+ /* What if a process setreuid()'s and this brings the
+ * new uid over his NPROC rlimit? We can check this now
+ * cheaply with the new uid cache, so if it matters
+ * we should be checking for it. -DaveM
+ */
+ new_user = alloc_uid(new_ruid);
+ if (!new_user)
+ return -EAGAIN;
+ old_user = current->user;
+ atomic_dec(&old_user->processes);
+ atomic_inc(&new_user->processes);
+
+ current->uid = new_ruid;
+ current->user = new_user;
+ free_uid(old_user);
+ return 0;
+}
+
+/*
+ * Unprivileged users may change the real uid to the effective uid
+ * or vice versa. (BSD-style)
+ *
+ * If you set the real uid at all, or set the effective uid to a value not
+ * equal to the real uid, then the saved uid is set to the new effective uid.
+ *
+ * This makes it possible for a setuid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit of a program.
+ *
+ * The general idea is that a program which uses just setreuid() will be
+ * 100% compatible with BSD. A program which uses just setuid() will be
+ * 100% compatible with POSIX with saved IDs.
+ */
+asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
+{
+ int old_ruid, old_euid, old_suid, new_ruid, new_euid;
+
+ new_ruid = old_ruid = current->uid;
+ new_euid = old_euid = current->euid;
+ old_suid = current->suid;
+
+ if (ruid != (uid_t) -1) {
+ new_ruid = ruid;
+ if ((old_ruid != ruid) &&
+ (current->euid != ruid) &&
+ !capable(CAP_SETUID))
+ return -EPERM;
+ }
+
+ if (euid != (uid_t) -1) {
+ new_euid = euid;
+ if ((old_ruid != euid) &&
+ (current->euid != euid) &&
+ (current->suid != euid) &&
+ !capable(CAP_SETUID))
+ return -EPERM;
+ }
+
+ if (new_ruid != old_ruid && set_user(new_ruid) < 0)
+ return -EAGAIN;
+
+ current->fsuid = current->euid = new_euid;
+ if (ruid != (uid_t) -1 ||
+ (euid != (uid_t) -1 && euid != old_ruid))
+ current->suid = current->euid;
+ current->fsuid = current->euid;
+ if (current->euid != old_euid)
+ current->dumpable = 0;
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ cap_emulate_setxuid(old_ruid, old_euid, old_suid);
+ }
+
+ return 0;
+}
+
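+/*
+ * Illustrative sketch of the "completely drop its privileges" case above
+ * for a setuid-root program:
+ *
+ *	uid_t ruid = getuid();
+ *	setreuid(ruid, ruid);		// ruid == euid == suid afterwards
+ */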
+
+
+/*
+ * setuid() is implemented like SysV with SAVED_IDS
+ *
+ * Note that SAVED_IDS is deficient in that a setuid root program
+ * like sendmail, for example, cannot set its uid to be a normal
+ * user and then switch back, because if you're root, setuid() sets
+ * the saved uid too. If you don't like this, blame the bright people
+ * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
+ * will allow a root program to temporarily drop privileges and be able to
+ * regain them by swapping the real and effective uid.
+ */
+asmlinkage long sys_setuid(uid_t uid)
+{
+ int old_euid = current->euid;
+ int old_ruid, old_suid, new_ruid;
+
+ old_ruid = new_ruid = current->uid;
+ old_suid = current->suid;
+ if (capable(CAP_SETUID)) {
+ if (uid != old_ruid && set_user(uid) < 0)
+ return -EAGAIN;
+ current->suid = uid;
+ } else if ((uid != current->uid) && (uid != current->suid))
+ return -EPERM;
+
+ current->fsuid = current->euid = uid;
+
+ if (old_euid != uid)
+ current->dumpable = 0;
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ cap_emulate_setxuid(old_ruid, old_euid, old_suid);
+ }
+
+ return 0;
+}
+
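+/*
+ * The "temporarily drop privileges and regain them" pattern mentioned
+ * above, sketched for a setuid-root program using BSD-style setreuid():
+ *
+ *	setreuid(geteuid(), getuid());	// swap: run with the user's euid
+ *	// ...do unprivileged work...
+ *	setreuid(geteuid(), getuid());	// swap back: euid 0 again
+ */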
+
+/*
+ * This function implements a generic ability to update ruid, euid,
+ * and suid. This allows you to implement the 4.4 compatible seteuid().
+ */
+asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
+{
+ int old_ruid = current->uid;
+ int old_euid = current->euid;
+ int old_suid = current->suid;
+
+ if (!capable(CAP_SETUID)) {
+ if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
+ (ruid != current->euid) && (ruid != current->suid))
+ return -EPERM;
+ if ((euid != (uid_t) -1) && (euid != current->uid) &&
+ (euid != current->euid) && (euid != current->suid))
+ return -EPERM;
+ if ((suid != (uid_t) -1) && (suid != current->uid) &&
+ (suid != current->euid) && (suid != current->suid))
+ return -EPERM;
+ }
+ if (ruid != (uid_t) -1) {
+ if (ruid != current->uid && set_user(ruid) < 0)
+ return -EAGAIN;
+ }
+ if (euid != (uid_t) -1) {
+ if (euid != current->euid)
+ current->dumpable = 0;
+ current->euid = euid;
+ current->fsuid = euid;
+ }
+ if (suid != (uid_t) -1)
+ current->suid = suid;
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ cap_emulate_setxuid(old_ruid, old_euid, old_suid);
+ }
+
+ return 0;
+}
+
+asmlinkage long sys_getresuid(uid_t *ruid, uid_t *euid, uid_t *suid)
+{
+ int retval;
+
+ if (!(retval = put_user(current->uid, ruid)) &&
+ !(retval = put_user(current->euid, euid)))
+ retval = put_user(current->suid, suid);
+
+ return retval;
+}
+
+/*
+ * Same as above, but for rgid, egid, sgid.
+ */
+asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
+{
+ if (!capable(CAP_SETGID)) {
+ if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
+ (rgid != current->egid) && (rgid != current->sgid))
+ return -EPERM;
+ if ((egid != (gid_t) -1) && (egid != current->gid) &&
+ (egid != current->egid) && (egid != current->sgid))
+ return -EPERM;
+ if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
+ (sgid != current->egid) && (sgid != current->sgid))
+ return -EPERM;
+ }
+ if (rgid != (gid_t) -1)
+ current->gid = rgid;
+ if (egid != (gid_t) -1) {
+ if (egid != current->egid)
+ current->dumpable = 0;
+ current->egid = egid;
+ current->fsgid = egid;
+ }
+ if (sgid != (gid_t) -1)
+ current->sgid = sgid;
+ return 0;
+}
+
+asmlinkage long sys_getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid)
+{
+ int retval;
+
+ if (!(retval = put_user(current->gid, rgid)) &&
+ !(retval = put_user(current->egid, egid)))
+ retval = put_user(current->sgid, sgid);
+
+ return retval;
+}
+
+
+/*
+ * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
+ * is used for "access()" and for the NFS daemon (letting nfsd stay at
+ * whatever uid it wants to). It normally shadows "euid", except when
+ * explicitly set by setfsuid() or for access..
+ */
+asmlinkage long sys_setfsuid(uid_t uid)
+{
+ int old_fsuid;
+
+ old_fsuid = current->fsuid;
+ if (uid == current->uid || uid == current->euid ||
+ uid == current->suid || uid == current->fsuid ||
+ capable(CAP_SETUID))
+ current->fsuid = uid;
+ if (current->fsuid != old_fsuid)
+ current->dumpable = 0;
+
+	/* We emulate fsuid by essentially doing a scaled-down version
+	 * of what we did in setresuid and friends. However, we only
+	 * operate on the fs-specific bits of the process' effective
+	 * capabilities.
+	 *
+	 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
+	 * If not, we might be a bit too harsh here.
+	 */
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ if (old_fsuid == 0 && current->fsuid != 0) {
+ cap_t(current->cap_effective) &= ~CAP_FS_MASK;
+ }
+ if (old_fsuid != 0 && current->fsuid == 0) {
+ cap_t(current->cap_effective) |=
+ (cap_t(current->cap_permitted) & CAP_FS_MASK);
+ }
+ }
+
+ return old_fsuid;
+}
+
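+/*
+ * Sketch of the intended nfsd-style use: take on a client's uid for
+ * filesystem permission checks only, without changing the euid:
+ *
+ *	old_fsuid = setfsuid(client_uid);	// returns the previous fsuid
+ *	// ...access the file on the client's behalf...
+ *	setfsuid(old_fsuid);
+ */
+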
+/*
+ * Same as setfsuid() above, but for the filesystem gid.
+ */
+asmlinkage long sys_setfsgid(gid_t gid)
+{
+ int old_fsgid;
+
+ old_fsgid = current->fsgid;
+ if (gid == current->gid || gid == current->egid ||
+ gid == current->sgid || gid == current->fsgid ||
+ capable(CAP_SETGID))
+ current->fsgid = gid;
+ if (current->fsgid != old_fsgid)
+ current->dumpable = 0;
+
+ return old_fsgid;
+}
+
+asmlinkage long sys_times(struct tms * tbuf)
+{
+ /*
+ * In the SMP world we might just be unlucky and have one of
+ * the times increment as we use it. Since the value is an
+	 * atomically safe type this is just fine. Conceptually it's
+ * as if the syscall took an instant longer to occur.
+ */
+ if (tbuf)
+ if (copy_to_user(tbuf, &current->times, sizeof(struct tms)))
+ return -EFAULT;
+ return jiffies;
+}
+
+/*
+ * This needs some heavy checking ...
+ * I just haven't the stomach for it. I also don't fully
+ * understand sessions/pgrp etc. Let somebody who does explain it.
+ *
+ * OK, I think I have the protection semantics right.... this is really
+ * only important on a multi-user system anyway, to make sure one user
+ * can't send a signal to a process owned by another. -TYT, 12/12/91
+ *
+ * Ouch. Had to add the 'did_exec' flag to conform completely to POSIX.
+ * LBT 04.03.94
+ */
+
+asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
+{
+ struct task_struct * p;
+ int err = -EINVAL;
+
+ if (!pid)
+ pid = current->pid;
+ if (!pgid)
+ pgid = pid;
+ if (pgid < 0)
+ return -EINVAL;
+
+ /* From this point forward we keep holding onto the tasklist lock
+ * so that our parent does not change from under us. -DaveM
+ */
+ read_lock(&tasklist_lock);
+
+ err = -ESRCH;
+ p = find_task_by_pid(pid);
+ if (!p)
+ goto out;
+
+ if (p->p_pptr == current || p->p_opptr == current) {
+ err = -EPERM;
+ if (p->session != current->session)
+ goto out;
+ err = -EACCES;
+ if (p->did_exec)
+ goto out;
+ } else if (p != current)
+ goto out;
+ err = -EPERM;
+ if (p->leader)
+ goto out;
+ if (pgid != pid) {
+ struct task_struct * tmp;
+ for_each_task (tmp) {
+ if (tmp->pgrp == pgid &&
+ tmp->session == current->session)
+ goto ok_pgid;
+ }
+ goto out;
+ }
+
+ok_pgid:
+ p->pgrp = pgid;
+ err = 0;
+out:
+ /* All paths lead to here, thus we are safe. -DaveM */
+ read_unlock(&tasklist_lock);
+ return err;
+}
+
+asmlinkage long sys_getpgid(pid_t pid)
+{
+ if (!pid) {
+ return current->pgrp;
+ } else {
+ int retval;
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+
+ retval = -ESRCH;
+ if (p)
+ retval = p->pgrp;
+ read_unlock(&tasklist_lock);
+ return retval;
+ }
+}
+
+asmlinkage long sys_getpgrp(void)
+{
+ /* SMP - assuming writes are word atomic this is fine */
+ return current->pgrp;
+}
+
+asmlinkage long sys_getsid(pid_t pid)
+{
+ if (!pid) {
+ return current->session;
+ } else {
+ int retval;
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+
+ retval = -ESRCH;
+ if(p)
+ retval = p->session;
+ read_unlock(&tasklist_lock);
+ return retval;
+ }
+}
+
+asmlinkage long sys_setsid(void)
+{
+ struct task_struct * p;
+ int err = -EPERM;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if (p->pgrp == current->pid)
+ goto out;
+ }
+
+ current->leader = 1;
+ current->session = current->pgrp = current->pid;
+ current->tty = NULL;
+ current->tty_old_pgrp = 0;
+ err = current->pgrp;
+out:
+ read_unlock(&tasklist_lock);
+ return err;
+}
+
+/*
+ * Supplementary group IDs
+ */
+asmlinkage long sys_getgroups(int gidsetsize, gid_t *grouplist)
+{
+ int i;
+
+ /*
+ * SMP: Nobody else can change our grouplist. Thus we are
+ * safe.
+ */
+
+ if (gidsetsize < 0)
+ return -EINVAL;
+ i = current->ngroups;
+ if (gidsetsize) {
+ if (i > gidsetsize)
+ return -EINVAL;
+ if (copy_to_user(grouplist, current->groups, sizeof(gid_t)*i))
+ return -EFAULT;
+ }
+ return i;
+}
+
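+/*
+ * As the code above implies, a gidsetsize of zero just reports the number
+ * of supplementary groups, so the usual user-level pattern is two calls:
+ *
+ *	int n = getgroups(0, NULL);
+ *	gid_t *list = malloc(n * sizeof(gid_t));
+ *	getgroups(n, list);
+ */
+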
+/*
+ * SMP: Our groups are not shared. We can copy to/from them safely
+ * without another task interfering.
+ */
+
+asmlinkage long sys_setgroups(int gidsetsize, gid_t *grouplist)
+{
+ if (!capable(CAP_SETGID))
+ return -EPERM;
+ if ((unsigned) gidsetsize > NGROUPS)
+ return -EINVAL;
+ if(copy_from_user(current->groups, grouplist, gidsetsize * sizeof(gid_t)))
+ return -EFAULT;
+ current->ngroups = gidsetsize;
+ return 0;
+}
+
+static int supplemental_group_member(gid_t grp)
+{
+ int i = current->ngroups;
+
+ if (i) {
+ gid_t *groups = current->groups;
+ do {
+ if (*groups == grp)
+ return 1;
+ groups++;
+ i--;
+ } while (i);
+ }
+ return 0;
+}
+
+/*
+ * Check whether we're fsgid/egid or in the supplemental group..
+ */
+int in_group_p(gid_t grp)
+{
+ int retval = 1;
+ if (grp != current->fsgid)
+ retval = supplemental_group_member(grp);
+ return retval;
+}
+
+int in_egroup_p(gid_t grp)
+{
+ int retval = 1;
+ if (grp != current->egid)
+ retval = supplemental_group_member(grp);
+ return retval;
+}
+
+DECLARE_RWSEM(uts_sem);
+
+asmlinkage long sys_newuname(struct new_utsname * name)
+{
+ int errno = 0;
+
+ down_read(&uts_sem);
+ if (copy_to_user(name,&system_utsname,sizeof *name))
+ errno = -EFAULT;
+ up_read(&uts_sem);
+ return errno;
+}
+
+asmlinkage long sys_sethostname(char *name, int len)
+{
+ int errno;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (len < 0 || len > __NEW_UTS_LEN)
+ return -EINVAL;
+ down_write(&uts_sem);
+ errno = -EFAULT;
+ if (!copy_from_user(system_utsname.nodename, name, len)) {
+ system_utsname.nodename[len] = 0;
+ errno = 0;
+ }
+ up_write(&uts_sem);
+ return errno;
+}
+
+asmlinkage long sys_gethostname(char *name, int len)
+{
+ int i, errno;
+
+ if (len < 0)
+ return -EINVAL;
+ down_read(&uts_sem);
+ i = 1 + strlen(system_utsname.nodename);
+ if (i > len)
+ i = len;
+ errno = 0;
+ if (copy_to_user(name, system_utsname.nodename, i))
+ errno = -EFAULT;
+ up_read(&uts_sem);
+ return errno;
+}
+
+/*
+ * Only setdomainname; getdomainname can be implemented by calling
+ * uname()
+ */
+asmlinkage long sys_setdomainname(char *name, int len)
+{
+ int errno;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (len < 0 || len > __NEW_UTS_LEN)
+ return -EINVAL;
+
+ down_write(&uts_sem);
+ errno = -EFAULT;
+ if (!copy_from_user(system_utsname.domainname, name, len)) {
+ errno = 0;
+ system_utsname.domainname[len] = 0;
+ }
+ up_write(&uts_sem);
+ return errno;
+}
+
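+/*
+ * User-level sketch: the getdomainname() counterpart hinted at above can
+ * indeed be built on uname(), since the new_utsname copied out by
+ * sys_newuname() already carries the domainname (with glibc it is the
+ * GNU-specific domainname member of struct utsname):
+ *
+ *	struct new_utsname u;
+ *	sys_newuname(&u);
+ *	// u.domainname now holds the NIS/YP domain name
+ */
+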
+asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit *rlim)
+{
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ else
+ return copy_to_user(rlim, current->rlim + resource, sizeof(*rlim))
+ ? -EFAULT : 0;
+}
+
+#if !defined(__ia64__) && !defined(__s390__)
+
+/*
+ * Backwards compatibility for getrlimit(). Needed for some apps.
+ */
+
+asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit *rlim)
+{
+ struct rlimit x;
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+
+ memcpy(&x, current->rlim + resource, sizeof(*rlim));
+ if(x.rlim_cur > 0x7FFFFFFF)
+ x.rlim_cur = 0x7FFFFFFF;
+ if(x.rlim_max > 0x7FFFFFFF)
+ x.rlim_max = 0x7FFFFFFF;
+ return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+}
+
+#endif
+
+asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit *rlim)
+{
+ struct rlimit new_rlim, *old_rlim;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ if(copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+ return -EFAULT;
+ if (new_rlim.rlim_cur < 0 || new_rlim.rlim_max < 0)
+ return -EINVAL;
+ old_rlim = current->rlim + resource;
+ if (((new_rlim.rlim_cur > old_rlim->rlim_max) ||
+ (new_rlim.rlim_max > old_rlim->rlim_max)) &&
+ !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+ if (resource == RLIMIT_NOFILE) {
+ if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
+ return -EPERM;
+ }
+ *old_rlim = new_rlim;
+ return 0;
+}
+
+/*
+ * It would make sense to put struct rusage in the task_struct,
+ * except that would make the task_struct be *really big*. After
+ * task_struct gets moved into malloc'ed memory, it would
+ * make sense to do this. It will make moving the rest of the information
+ * a lot simpler! (Which we're not doing right now because we're not
+ * measuring them yet).
+ *
+ * This is SMP safe. Either we are called from sys_getrusage on ourselves
+ * below (we know we aren't going to exit/disappear and only we change our
+ * rusage counters), or we are called from wait4() on a process which is
+ * either stopped or zombied. In the zombied case the task won't get
+ * reaped till shortly after the call to getrusage(). In both cases the
+ * task being examined is in a frozen state so the counters won't change.
+ *
+ * FIXME! Get the fault counts properly!
+ */
+int getrusage(struct task_struct *p, int who, struct rusage *ru)
+{
+ struct rusage r;
+
+ memset((char *) &r, 0, sizeof(r));
+ switch (who) {
+ case RUSAGE_SELF:
+ r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_utime);
+ r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_utime);
+ r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_stime);
+ r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime);
+ r.ru_minflt = p->min_flt;
+ r.ru_majflt = p->maj_flt;
+ r.ru_nswap = p->nswap;
+ break;
+ case RUSAGE_CHILDREN:
+ r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_cutime);
+ r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_cutime);
+ r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_cstime);
+ r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_cstime);
+ r.ru_minflt = p->cmin_flt;
+ r.ru_majflt = p->cmaj_flt;
+ r.ru_nswap = p->cnswap;
+ break;
+ default:
+ r.ru_utime.tv_sec = CT_TO_SECS(p->times.tms_utime + p->times.tms_cutime);
+ r.ru_utime.tv_usec = CT_TO_USECS(p->times.tms_utime + p->times.tms_cutime);
+ r.ru_stime.tv_sec = CT_TO_SECS(p->times.tms_stime + p->times.tms_cstime);
+ r.ru_stime.tv_usec = CT_TO_USECS(p->times.tms_stime + p->times.tms_cstime);
+ r.ru_minflt = p->min_flt + p->cmin_flt;
+ r.ru_majflt = p->maj_flt + p->cmaj_flt;
+ r.ru_nswap = p->nswap + p->cnswap;
+ break;
+ }
+ return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_getrusage(int who, struct rusage *ru)
+{
+ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
+ return -EINVAL;
+ return getrusage(current, who, ru);
+}
+
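+/*
+ * User-level sketch: the times come back as struct timeval members,
+ * already converted from ticks by CT_TO_SECS/CT_TO_USECS above:
+ *
+ *	struct rusage ru;
+ *	getrusage(RUSAGE_SELF, &ru);
+ *	// ru.ru_utime / ru.ru_stime hold user and system CPU time
+ */
+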
+asmlinkage long sys_umask(int mask)
+{
+ mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
+ return mask;
+}
+
+asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ int error = 0;
+ int sig;
+
+ switch (option) {
+ case PR_SET_PDEATHSIG:
+ sig = arg2;
+ if (sig > _NSIG) {
+ error = -EINVAL;
+ break;
+ }
+ current->pdeath_signal = sig;
+ break;
+ case PR_GET_PDEATHSIG:
+ error = put_user(current->pdeath_signal, (int *)arg2);
+ break;
+ case PR_GET_DUMPABLE:
+ if (current->dumpable)
+ error = 1;
+ break;
+ case PR_SET_DUMPABLE:
+ if (arg2 != 0 && arg2 != 1) {
+ error = -EINVAL;
+ break;
+ }
+ current->dumpable = arg2;
+ break;
+ case PR_SET_UNALIGN:
+#ifdef SET_UNALIGN_CTL
+ error = SET_UNALIGN_CTL(current, arg2);
+#else
+ error = -EINVAL;
+#endif
+ break;
+
+ case PR_GET_UNALIGN:
+#ifdef GET_UNALIGN_CTL
+ error = GET_UNALIGN_CTL(current, arg2);
+#else
+ error = -EINVAL;
+#endif
+ break;
+
+ case PR_GET_KEEPCAPS:
+ if (current->keep_capabilities)
+ error = 1;
+ break;
+ case PR_SET_KEEPCAPS:
+ if (arg2 != 0 && arg2 != 1) {
+ error = -EINVAL;
+ break;
+ }
+ current->keep_capabilities = arg2;
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+ return error;
+}
+
+EXPORT_SYMBOL(notifier_chain_register);
+EXPORT_SYMBOL(notifier_chain_unregister);
+EXPORT_SYMBOL(notifier_call_chain);
+EXPORT_SYMBOL(register_reboot_notifier);
+EXPORT_SYMBOL(unregister_reboot_notifier);
+EXPORT_SYMBOL(in_group_p);
+EXPORT_SYMBOL(in_egroup_p);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
new file mode 100644
index 000000000000..1c22d7838dd2
--- /dev/null
+++ b/kernel/sysctl.c
@@ -0,0 +1,1309 @@
+/*
+ * sysctl.c: General linux system control interface
+ *
+ * Begun 24 March 1995, Stephen Tweedie
+ * Added /proc support, Dec 1995
+ * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
+ * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
+ * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
+ * Dynamic registration fixes, Stephen Tweedie.
+ * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
+ * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris Horn.
+ * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
+ * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
+ * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
+ * Wendling.
+ * The list_for_each() macro wasn't appropriate for the sysctl loop.
+ * Removed it and replaced it with older style, 03/23/00, Bill Wendling
+ */
+
+#include <linux/config.h>
+#include <linux/malloc.h>
+#include <linux/sysctl.h>
+#include <linux/swapctl.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/utsname.h>
+#include <linux/capability.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/sysrq.h>
+#include <linux/highuid.h>
+
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_ROOT_NFS
+#include <linux/nfs_fs.h>
+#endif
+
+#if defined(CONFIG_SYSCTL)
+
+/* External variables not in a header file. */
+extern int panic_timeout;
+extern int C_A_D;
+extern int bdf_prm[], bdflush_min[], bdflush_max[];
+extern int sysctl_overcommit_memory;
+extern int max_threads;
+extern int nr_queued_signals, max_queued_signals;
+extern int sysrq_enabled;
+
+/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+static int maxolduid = 65535;
+static int minolduid;
+
+#ifdef CONFIG_KMOD
+extern char modprobe_path[];
+#endif
+#ifdef CONFIG_HOTPLUG
+extern char hotplug_path[];
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+extern int sg_big_buff;
+#endif
+#ifdef CONFIG_SYSVIPC
+extern size_t shm_ctlmax;
+extern size_t shm_ctlall;
+extern int shm_ctlmni;
+extern int msg_ctlmax;
+extern int msg_ctlmnb;
+extern int msg_ctlmni;
+extern int sem_ctls[];
+#endif
+
+#ifdef __sparc__
+extern char reboot_command [];
+extern int stop_a_enabled;
+#endif
+#ifdef __powerpc__
+extern unsigned long htab_reclaim_on, zero_paged_on, powersave_nap;
+int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp);
+#endif
+
+#ifdef CONFIG_BSD_PROCESS_ACCT
+extern int acct_parm[];
+#endif
+
+extern int pgt_cache_water[];
+
+static int parse_table(int *, int, void *, size_t *, void *, size_t,
+ ctl_table *, void **);
+static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp);
+
+static ctl_table root_table[];
+static struct ctl_table_header root_table_header =
+ { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
+
+static ctl_table kern_table[];
+static ctl_table vm_table[];
+#ifdef CONFIG_NET
+extern ctl_table net_table[];
+#endif
+static ctl_table proc_table[];
+static ctl_table fs_table[];
+static ctl_table debug_table[];
+static ctl_table dev_table[];
+extern ctl_table random_table[];
+
+/* /proc declarations: */
+
+#ifdef CONFIG_PROC_FS
+
+static ssize_t proc_readsys(struct file *, char *, size_t, loff_t *);
+static ssize_t proc_writesys(struct file *, const char *, size_t, loff_t *);
+static int proc_sys_permission(struct inode *, int);
+
+struct file_operations proc_sys_file_operations = {
+ read: proc_readsys,
+ write: proc_writesys,
+};
+
+static struct inode_operations proc_sys_inode_operations = {
+ permission: proc_sys_permission,
+};
+
+extern struct proc_dir_entry *proc_sys_root;
+
+static void register_proc_table(ctl_table *, struct proc_dir_entry *);
+static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
+#endif
+
+extern int inodes_stat[];
+extern int dentry_stat[];
+
+/* The default sysctl tables: */
+
+static ctl_table root_table[] = {
+ {CTL_KERN, "kernel", NULL, 0, 0555, kern_table},
+ {CTL_VM, "vm", NULL, 0, 0555, vm_table},
+#ifdef CONFIG_NET
+ {CTL_NET, "net", NULL, 0, 0555, net_table},
+#endif
+ {CTL_PROC, "proc", NULL, 0, 0555, proc_table},
+ {CTL_FS, "fs", NULL, 0, 0555, fs_table},
+ {CTL_DEBUG, "debug", NULL, 0, 0555, debug_table},
+ {CTL_DEV, "dev", NULL, 0, 0555, dev_table},
+ {0}
+};
+
+static ctl_table kern_table[] = {
+ {KERN_OSTYPE, "ostype", system_utsname.sysname, 64,
+ 0444, NULL, &proc_doutsstring, &sysctl_string},
+ {KERN_OSRELEASE, "osrelease", system_utsname.release, 64,
+ 0444, NULL, &proc_doutsstring, &sysctl_string},
+ {KERN_VERSION, "version", system_utsname.version, 64,
+ 0444, NULL, &proc_doutsstring, &sysctl_string},
+ {KERN_NODENAME, "hostname", system_utsname.nodename, 64,
+ 0644, NULL, &proc_doutsstring, &sysctl_string},
+ {KERN_DOMAINNAME, "domainname", system_utsname.domainname, 64,
+ 0644, NULL, &proc_doutsstring, &sysctl_string},
+ {KERN_PANIC, "panic", &panic_timeout, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_CAP_BSET, "cap-bound", &cap_bset, sizeof(kernel_cap_t),
+ 0600, NULL, &proc_dointvec_bset},
+#ifdef CONFIG_BLK_DEV_INITRD
+ {KERN_REALROOTDEV, "real-root-dev", &real_root_dev, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+#endif
+#ifdef __sparc__
+ {KERN_SPARC_REBOOT, "reboot-cmd", reboot_command,
+ 256, 0644, NULL, &proc_dostring, &sysctl_string },
+ {KERN_SPARC_STOP_A, "stop-a", &stop_a_enabled, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+#endif
+#ifdef __powerpc__
+ {KERN_PPC_HTABRECLAIM, "htab-reclaim", &htab_reclaim_on, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PPC_ZEROPAGED, "zero-paged", &zero_paged_on, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PPC_POWERSAVE_NAP, "powersave-nap", &powersave_nap, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PPC_L2CR, "l2cr", NULL, 0,
+ 0644, NULL, &proc_dol2crvec},
+#endif
+ {KERN_CTLALTDEL, "ctrl-alt-del", &C_A_D, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_PRINTK, "printk", &console_loglevel, 4*sizeof(int),
+ 0644, NULL, &proc_dointvec},
+#ifdef CONFIG_KMOD
+ {KERN_MODPROBE, "modprobe", &modprobe_path, 256,
+ 0644, NULL, &proc_dostring, &sysctl_string },
+#endif
+#ifdef CONFIG_HOTPLUG
+ {KERN_HOTPLUG, "hotplug", &hotplug_path, 256,
+ 0644, NULL, &proc_dostring, &sysctl_string },
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+ {KERN_SG_BIG_BUFF, "sg-big-buff", &sg_big_buff, sizeof (int),
+ 0444, NULL, &proc_dointvec},
+#endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+ {KERN_ACCT, "acct", &acct_parm, 3*sizeof(int),
+ 0644, NULL, &proc_dointvec},
+#endif
+ {KERN_RTSIGNR, "rtsig-nr", &nr_queued_signals, sizeof(int),
+ 0444, NULL, &proc_dointvec},
+ {KERN_RTSIGMAX, "rtsig-max", &max_queued_signals, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+#ifdef CONFIG_SYSVIPC
+ {KERN_SHMMAX, "shmmax", &shm_ctlmax, sizeof (size_t),
+ 0644, NULL, &proc_doulongvec_minmax},
+ {KERN_SHMALL, "shmall", &shm_ctlall, sizeof (size_t),
+ 0644, NULL, &proc_doulongvec_minmax},
+ {KERN_SHMMNI, "shmmni", &shm_ctlmni, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_MSGMAX, "msgmax", &msg_ctlmax, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_MSGMNI, "msgmni", &msg_ctlmni, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_MSGMNB, "msgmnb", &msg_ctlmnb, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_SEM, "sem", &sem_ctls, 4*sizeof (int),
+ 0644, NULL, &proc_dointvec},
+#endif
+#ifdef CONFIG_MAGIC_SYSRQ
+ {KERN_SYSRQ, "sysrq", &sysrq_enabled, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+#endif
+ {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {KERN_RANDOM, "random", NULL, 0, 0555, random_table},
+ {KERN_OVERFLOWUID, "overflowuid", &overflowuid, sizeof(int), 0644, NULL,
+ &proc_dointvec_minmax, &sysctl_intvec, NULL,
+ &minolduid, &maxolduid},
+ {KERN_OVERFLOWGID, "overflowgid", &overflowgid, sizeof(int), 0644, NULL,
+ &proc_dointvec_minmax, &sysctl_intvec, NULL,
+ &minolduid, &maxolduid},
+ {0}
+};
+
+static ctl_table vm_table[] = {
+ {VM_FREEPG, "freepages",
+ &freepages, sizeof(freepages_t), 0444, NULL, &proc_dointvec},
+ {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
+ &proc_dointvec_minmax, &sysctl_intvec, NULL,
+ &bdflush_min, &bdflush_max},
+ {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
+ sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
+ {VM_BUFFERMEM, "buffermem",
+ &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
+ {VM_PAGECACHE, "pagecache",
+ &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
+ {VM_PAGERDAEMON, "kswapd",
+ &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
+ {VM_PGT_CACHE, "pagetable_cache",
+ &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec},
+ {VM_PAGE_CLUSTER, "page-cluster",
+ &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec},
+ {0}
+};
+
+static ctl_table proc_table[] = {
+ {0}
+};
+
+static ctl_table fs_table[] = {
+ {FS_NRINODE, "inode-nr", &inodes_stat, 2*sizeof(int),
+ 0444, NULL, &proc_dointvec},
+ {FS_STATINODE, "inode-state", &inodes_stat, 7*sizeof(int),
+ 0444, NULL, &proc_dointvec},
+ {FS_NRFILE, "file-nr", &files_stat, 3*sizeof(int),
+ 0444, NULL, &proc_dointvec},
+ {FS_MAXFILE, "file-max", &files_stat.max_files, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {FS_NRSUPER, "super-nr", &nr_super_blocks, sizeof(int),
+ 0444, NULL, &proc_dointvec},
+ {FS_MAXSUPER, "super-max", &max_super_blocks, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {FS_NRDQUOT, "dquot-nr", &nr_dquots, 2*sizeof(int),
+ 0444, NULL, &proc_dointvec},
+ {FS_MAXDQUOT, "dquot-max", &max_dquots, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {FS_DENTRY, "dentry-state", &dentry_stat, 6*sizeof(int),
+ 0444, NULL, &proc_dointvec},
+ {FS_OVERFLOWUID, "overflowuid", &fs_overflowuid, sizeof(int), 0644, NULL,
+ &proc_dointvec_minmax, &sysctl_intvec, NULL,
+ &minolduid, &maxolduid},
+ {FS_OVERFLOWGID, "overflowgid", &fs_overflowgid, sizeof(int), 0644, NULL,
+ &proc_dointvec_minmax, &sysctl_intvec, NULL,
+ &minolduid, &maxolduid},
+ {FS_LEASES, "leases-enable", &leases_enable, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {FS_DIR_NOTIFY, "dir-notify-enable", &dir_notify_enable,
+ sizeof(int), 0644, NULL, &proc_dointvec},
+ {FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int),
+ 0644, NULL, &proc_dointvec},
+ {0}
+};
+
+static ctl_table debug_table[] = {
+ {0}
+};
+
+static ctl_table dev_table[] = {
+ {0}
+};
+
+extern void init_irq_proc (void);
+
+void __init sysctl_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ register_proc_table(root_table, proc_sys_root);
+ init_irq_proc();
+#endif
+}
+
+int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen)
+{
+ struct list_head *tmp;
+
+ if (nlen == 0 || nlen >= CTL_MAXNAME)
+ return -ENOTDIR;
+ if (oldval) {
+ int old_len;
+ if (!oldlenp || get_user(old_len, oldlenp))
+ return -EFAULT;
+ }
+ tmp = &root_table_header.ctl_entry;
+ do {
+ struct ctl_table_header *head =
+ list_entry(tmp, struct ctl_table_header, ctl_entry);
+ void *context = NULL;
+ int error = parse_table(name, nlen, oldval, oldlenp,
+ newval, newlen, head->ctl_table,
+ &context);
+ if (context)
+ kfree(context);
+ if (error != -ENOTDIR)
+ return error;
+ tmp = tmp->next;
+ } while (tmp != &root_table_header.ctl_entry);
+ return -ENOTDIR;
+}
+
+extern asmlinkage long sys_sysctl(struct __sysctl_args *args)
+{
+ struct __sysctl_args tmp;
+ int error;
+
+ if (copy_from_user(&tmp, args, sizeof(tmp)))
+ return -EFAULT;
+
+ lock_kernel();
+ error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
+ tmp.newval, tmp.newlen);
+ unlock_kernel();
+ return error;
+}
+
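+/*
+ * User-level sketch of the binary interface (shown through the C
+ * library's sysctl() wrapper, which packs the arguments into a
+ * struct __sysctl_args for this system call):
+ *
+ *	int name[] = { CTL_KERN, KERN_OSTYPE };
+ *	char buf[64];
+ *	size_t len = sizeof(buf);
+ *	sysctl(name, 2, buf, &len, NULL, 0);	// read kernel.ostype
+ */
+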
+/*
+ * ctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+ if (!current->euid)
+ mode >>= 6;
+ else if (in_egroup_p(0))
+ mode >>= 3;
+ if ((mode & op & 0007) == op)
+ return 0;
+ return -EACCES;
+}
+
+static inline int ctl_perm(ctl_table *table, int op)
+{
+ return test_perm(table->mode, op);
+}
+
+static int parse_table(int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen,
+ ctl_table *table, void **context)
+{
+ int n;
+repeat:
+ if (!nlen)
+ return -ENOTDIR;
+ if (get_user(n, name))
+ return -EFAULT;
+ for ( ; table->ctl_name; table++) {
+ if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
+ int error;
+ if (table->child) {
+ if (ctl_perm(table, 001))
+ return -EPERM;
+ if (table->strategy) {
+ error = table->strategy(
+ table, name, nlen,
+ oldval, oldlenp,
+ newval, newlen, context);
+ if (error)
+ return error;
+ }
+ name++;
+ nlen--;
+ table = table->child;
+ goto repeat;
+ }
+ error = do_sysctl_strategy(table, name, nlen,
+ oldval, oldlenp,
+ newval, newlen, context);
+ return error;
+ }
+ }
+ return -ENOTDIR;
+}
+
+/* Perform the actual read/write of a sysctl table entry. */
+int do_sysctl_strategy (ctl_table *table,
+ int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen, void **context)
+{
+ int op = 0, rc, len;
+
+ if (oldval)
+ op |= 004;
+ if (newval)
+ op |= 002;
+ if (ctl_perm(table, op))
+ return -EPERM;
+
+ if (table->strategy) {
+ rc = table->strategy(table, name, nlen, oldval, oldlenp,
+ newval, newlen, context);
+ if (rc < 0)
+ return rc;
+ if (rc > 0)
+ return 0;
+ }
+
+ /* If there is no strategy routine, or if the strategy returns
+ * zero, proceed with automatic r/w */
+ if (table->data && table->maxlen) {
+ if (oldval && oldlenp) {
+ get_user(len, oldlenp);
+ if (len) {
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if(copy_to_user(oldval, table->data, len))
+ return -EFAULT;
+ if(put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+ if (newval && newlen) {
+ len = newlen;
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if(copy_from_user(table->data, newval, len))
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+struct ctl_table_header *register_sysctl_table(ctl_table * table,
+ int insert_at_head)
+{
+ struct ctl_table_header *tmp;
+ tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
+ if (!tmp)
+ return 0;
+ tmp->ctl_table = table;
+ INIT_LIST_HEAD(&tmp->ctl_entry);
+ if (insert_at_head)
+ list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
+ else
+ list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+#ifdef CONFIG_PROC_FS
+ register_proc_table(table, proc_sys_root);
+#endif
+ return tmp;
+}
+
+/*
+ * Unlink and free a ctl_table.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+ list_del(&header->ctl_entry);
+#ifdef CONFIG_PROC_FS
+ unregister_proc_table(header->ctl_table, proc_sys_root);
+#endif
+ kfree(header);
+}
+
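+/*
+ * Typical registration from a subsystem (sketch only; the foo_* names and
+ * the FOO_VALUE number are placeholders): build a table rooted under one
+ * of the directories in root_table[] and hand it to
+ * register_sysctl_table(), keeping the returned header for
+ * unregister_sysctl_table() later:
+ *
+ *	static ctl_table foo_table[] = {
+ *		{FOO_VALUE, "value", &foo_value, sizeof(int), 0644, NULL,
+ *		 &proc_dointvec},
+ *		{0}
+ *	};
+ *	static ctl_table foo_dir[] = {
+ *		{CTL_DEV, "dev", NULL, 0, 0555, foo_table},
+ *		{0}
+ *	};
+ *
+ *	foo_header = register_sysctl_table(foo_dir, 0);
+ */
+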
+/*
+ * /proc/sys support
+ */
+
+#ifdef CONFIG_PROC_FS
+
+/* Scan the sysctl entries in table and add them all into /proc */
+static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
+{
+ struct proc_dir_entry *de;
+ int len;
+ mode_t mode;
+
+ for (; table->ctl_name; table++) {
+ /* Can't do anything without a proc name. */
+ if (!table->procname)
+ continue;
+ /* Maybe we can't do anything with it... */
+ if (!table->proc_handler && !table->child) {
+ printk(KERN_WARNING "SYSCTL: Can't register %s\n",
+ table->procname);
+ continue;
+ }
+
+ len = strlen(table->procname);
+ mode = table->mode;
+
+ de = NULL;
+ if (table->proc_handler)
+ mode |= S_IFREG;
+ else {
+ mode |= S_IFDIR;
+ for (de = root->subdir; de; de = de->next) {
+ if (proc_match(len, table->procname, de))
+ break;
+ }
+ /* If the subdir exists already, de is non-NULL */
+ }
+
+ if (!de) {
+ de = create_proc_entry(table->procname, mode, root);
+ if (!de)
+ continue;
+ de->data = (void *) table;
+ if (table->proc_handler) {
+ de->proc_fops = &proc_sys_file_operations;
+ de->proc_iops = &proc_sys_inode_operations;
+ }
+ }
+ table->de = de;
+ if (de->mode & S_IFDIR)
+ register_proc_table(table->child, de);
+ }
+}
+
+/*
+ * Unregister a /proc sysctl table and any subdirectories.
+ */
+static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
+{
+ struct proc_dir_entry *de;
+ for (; table->ctl_name; table++) {
+ if (!(de = table->de))
+ continue;
+ if (de->mode & S_IFDIR) {
+ if (!table->child) {
+ printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
+ continue;
+ }
+ unregister_proc_table(table->child, de);
+
+ /* Don't unregister directories which still have entries.. */
+ if (de->subdir)
+ continue;
+ }
+
+ /* Don't unregister proc entries that are still being used.. */
+ if (atomic_read(&de->count))
+ continue;
+
+ table->de = NULL;
+ remove_proc_entry(table->procname, root);
+ }
+}
+
+static ssize_t do_rw_proc(int write, struct file * file, char * buf,
+ size_t count, loff_t *ppos)
+{
+ int op;
+ struct proc_dir_entry *de;
+ struct ctl_table *table;
+ size_t res;
+ ssize_t error;
+
+ de = (struct proc_dir_entry*) file->f_dentry->d_inode->u.generic_ip;
+ if (!de || !de->data)
+ return -ENOTDIR;
+ table = (struct ctl_table *) de->data;
+ if (!table || !table->proc_handler)
+ return -ENOTDIR;
+ op = (write ? 002 : 004);
+ if (ctl_perm(table, op))
+ return -EPERM;
+
+ res = count;
+
+ /*
+ * FIXME: we need to pass on ppos to the handler.
+ */
+
+ error = (*table->proc_handler) (table, write, file, buf, &res);
+ if (error)
+ return error;
+ return res;
+}
+
+static ssize_t proc_readsys(struct file * file, char * buf,
+ size_t count, loff_t *ppos)
+{
+ return do_rw_proc(0, file, buf, count, ppos);
+}
+
+static ssize_t proc_writesys(struct file * file, const char * buf,
+ size_t count, loff_t *ppos)
+{
+ return do_rw_proc(1, file, (char *) buf, count, ppos);
+}
+
+static int proc_sys_permission(struct inode *inode, int op)
+{
+ return test_perm(inode->i_mode, op);
+}
+
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ int len;
+ char *p, c;
+
+ if (!table->data || !table->maxlen || !*lenp ||
+ (filp->f_pos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ len = 0;
+ p = buffer;
+ while (len < *lenp) {
+ if(get_user(c, p++))
+ return -EFAULT;
+ if (c == 0 || c == '\n')
+ break;
+ len++;
+ }
+ if (len >= table->maxlen)
+ len = table->maxlen-1;
+ if(copy_from_user(table->data, buffer, len))
+ return -EFAULT;
+ ((char *) table->data)[len] = 0;
+ filp->f_pos += *lenp;
+ } else {
+ len = strlen(table->data);
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ if(copy_to_user(buffer, table->data, len))
+ return -EFAULT;
+ if (len < *lenp) {
+ if(put_user('\n', ((char *) buffer) + len))
+ return -EFAULT;
+ len++;
+ }
+ *lenp = len;
+ filp->f_pos += len;
+ }
+ return 0;
+}
+
+/*
+ * Special case of dostring for the UTS structure. It has to observe
+ * the uts_sem locking. Should this be in kernel/sys.c ????
+ */
+
+static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ int r;
+
+ if (!write) {
+ down_read(&uts_sem);
+ r=proc_dostring(table,0,filp,buffer,lenp);
+ up_read(&uts_sem);
+ } else {
+ down_write(&uts_sem);
+ r=proc_dostring(table,1,filp,buffer,lenp);
+ up_write(&uts_sem);
+ }
+ return r;
+}
+
+#define OP_SET 0
+#define OP_AND 1
+#define OP_OR 2
+#define OP_MAX 3
+#define OP_MIN 4
+
+static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp, int conv, int op)
+{
+ int *i, vleft, first=1, len, left, neg, val;
+ #define TMPBUFLEN 20
+ char buf[TMPBUFLEN], *p;
+
+ if (!table->data || !table->maxlen || !*lenp ||
+ (filp->f_pos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (int *) table->data;
+ vleft = table->maxlen / sizeof(int);
+ left = *lenp;
+
+ for (; left && vleft--; i++, first=0) {
+ if (write) {
+ while (left) {
+ char c;
+ if(get_user(c,(char *) buffer))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ ((char *) buffer)++;
+ }
+ if (!left)
+ break;
+ neg = 0;
+ len = left;
+ if (len > TMPBUFLEN-1)
+ len = TMPBUFLEN-1;
+ if(copy_from_user(buf, buffer, len))
+ return -EFAULT;
+ buf[len] = 0;
+ p = buf;
+ if (*p == '-' && left > 1) {
+ neg = 1;
+ left--, p++;
+ }
+ if (*p < '0' || *p > '9')
+ break;
+ val = simple_strtoul(p, &p, 0) * conv;
+ len = p-buf;
+ if ((len < left) && *p && !isspace(*p))
+ break;
+ if (neg)
+ val = -val;
+ buffer += len;
+ left -= len;
+ switch(op) {
+ case OP_SET: *i = val; break;
+ case OP_AND: *i &= val; break;
+ case OP_OR: *i |= val; break;
+ case OP_MAX: if(*i < val)
+ *i = val;
+ break;
+ case OP_MIN: if(*i > val)
+ *i = val;
+ break;
+ }
+ } else {
+ p = buf;
+ if (!first)
+ *p++ = '\t';
+ sprintf(p, "%d", (*i) / conv);
+ len = strlen(buf);
+ if (len > left)
+ len = left;
+ if(copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ left -= len;
+ buffer += len;
+ }
+ }
+
+ if (!write && !first && left) {
+ if(put_user('\n', (char *) buffer))
+ return -EFAULT;
+ left--, buffer++;
+ }
+ if (write) {
+ p = (char *) buffer;
+ while (left) {
+ char c;
+ if(get_user(c, p++))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ }
+ }
+ if (write && first)
+ return -EINVAL;
+ *lenp -= left;
+ filp->f_pos += *lenp;
+ return 0;
+}
+
+int proc_dointvec(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return do_proc_dointvec(table,write,filp,buffer,lenp,1,OP_SET);
+}
+
+/*
+ * Only init (pid 1) may raise bits in the capability bounding set; any
+ * other caller's write is and-ed into the current set, so bits can only
+ * be cleared.
+ */
+
+int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ if (!capable(CAP_SYS_MODULE)) {
+ return -EPERM;
+ }
+ return do_proc_dointvec(table,write,filp,buffer,lenp,1,
+ (current->pid == 1) ? OP_SET : OP_AND);
+}
+
+int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ int *i, *min, *max, vleft, first=1, len, left, neg, val;
+ #define TMPBUFLEN 20
+ char buf[TMPBUFLEN], *p;
+
+ if (!table->data || !table->maxlen || !*lenp ||
+ (filp->f_pos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (int *) table->data;
+ min = (int *) table->extra1;
+ max = (int *) table->extra2;
+ vleft = table->maxlen / sizeof(int);
+ left = *lenp;
+
+ for (; left && vleft--; i++, first=0) {
+ if (write) {
+ while (left) {
+ char c;
+ if(get_user(c, (char *) buffer))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ ((char *) buffer)++;
+ }
+ if (!left)
+ break;
+ neg = 0;
+ len = left;
+ if (len > TMPBUFLEN-1)
+ len = TMPBUFLEN-1;
+ if(copy_from_user(buf, buffer, len))
+ return -EFAULT;
+ buf[len] = 0;
+ p = buf;
+ if (*p == '-' && left > 1) {
+ neg = 1;
+ left--, p++;
+ }
+ if (*p < '0' || *p > '9')
+ break;
+ val = simple_strtoul(p, &p, 0);
+ len = p-buf;
+ if ((len < left) && *p && !isspace(*p))
+ break;
+ if (neg)
+ val = -val;
+ buffer += len;
+ left -= len;
+
+ if (min && val < *min++)
+ continue;
+ if (max && val > *max++)
+ continue;
+ *i = val;
+ } else {
+ p = buf;
+ if (!first)
+ *p++ = '\t';
+ sprintf(p, "%d", *i);
+ len = strlen(buf);
+ if (len > left)
+ len = left;
+ if(copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ left -= len;
+ buffer += len;
+ }
+ }
+
+ if (!write && !first && left) {
+ if(put_user('\n', (char *) buffer))
+ return -EFAULT;
+ left--, buffer++;
+ }
+ if (write) {
+ p = (char *) buffer;
+ while (left) {
+ char c;
+ if(get_user(c, p++))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ }
+ }
+ if (write && first)
+ return -EINVAL;
+ *lenp -= left;
+ filp->f_pos += *lenp;
+ return 0;
+}
+
+/*
+ * An unsigned long version of do_proc_dointvec(), with an extra
+ * multiply/divide conversion factor.
+ */
+
+static int do_proc_doulongvec_minmax(ctl_table *table, int write,
+ struct file *filp,
+ void *buffer, size_t *lenp,
+ unsigned long convmul,
+ unsigned long convdiv)
+{
+#define TMPBUFLEN 20
+ unsigned long *i, *min, *max, val;
+ int vleft, first=1, len, left, neg;
+ char buf[TMPBUFLEN], *p;
+
+ if (!table->data || !table->maxlen || !*lenp ||
+ (filp->f_pos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned long *) table->data;
+ min = (unsigned long *) table->extra1;
+ max = (unsigned long *) table->extra2;
+ vleft = table->maxlen / sizeof(unsigned long);
+ left = *lenp;
+
+ for (; left && vleft--; i++, first=0) {
+ if (write) {
+ while (left) {
+ char c;
+ if(get_user(c, (char *) buffer))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ ((char *) buffer)++;
+ }
+ if (!left)
+ break;
+ neg = 0;
+ len = left;
+ if (len > TMPBUFLEN-1)
+ len = TMPBUFLEN-1;
+ if(copy_from_user(buf, buffer, len))
+ return -EFAULT;
+ buf[len] = 0;
+ p = buf;
+ if (*p == '-' && left > 1) {
+ neg = 1;
+ left--, p++;
+ }
+ if (*p < '0' || *p > '9')
+ break;
+ val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
+ len = p-buf;
+ if ((len < left) && *p && !isspace(*p))
+ break;
+ if (neg)
+ val = -val;
+ buffer += len;
+ left -= len;
+
+ if(neg)
+ continue;
+ if (min && val < *min++)
+ continue;
+ if (max && val > *max++)
+ continue;
+ *i = val;
+ } else {
+ p = buf;
+ if (!first)
+ *p++ = '\t';
+ sprintf(p, "%lu", convdiv * (*i) / convmul);
+ len = strlen(buf);
+ if (len > left)
+ len = left;
+ if(copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ left -= len;
+ buffer += len;
+ }
+ }
+
+ if (!write && !first && left) {
+ if(put_user('\n', (char *) buffer))
+ return -EFAULT;
+ left--, buffer++;
+ }
+ if (write) {
+ p = (char *) buffer;
+ while (left) {
+ char c;
+ if(get_user(c, p++))
+ return -EFAULT;
+ if (!isspace(c))
+ break;
+ left--;
+ }
+ }
+ if (write && first)
+ return -EINVAL;
+ *lenp -= left;
+ filp->f_pos += *lenp;
+ return 0;
+#undef TMPBUFLEN
+}
+
+int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, 1l, 1l);
+}
+
+int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
+ struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return do_proc_doulongvec_minmax(table, write, filp, buffer,
+ lenp, HZ, 1000l);
+}
+
+
+/* Like proc_dointvec, but converts seconds to jiffies */
+int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return do_proc_dointvec(table,write,filp,buffer,lenp,HZ,OP_SET);
+}
+
+#else /* CONFIG_PROC_FS */
+
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
+ struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+
+#endif /* CONFIG_PROC_FS */
+
+
+/*
+ * General sysctl support routines
+ */
+
+/* The generic string strategy routine: */
+int sysctl_string(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen, void **context)
+{
+ int l, len;
+
+ if (!table->data || !table->maxlen)
+ return -ENOTDIR;
+
+ if (oldval && oldlenp) {
+ if(get_user(len, oldlenp))
+ return -EFAULT;
+ if (len) {
+ l = strlen(table->data);
+ if (len > l) len = l;
+ if (len >= table->maxlen)
+ len = table->maxlen;
+ if(copy_to_user(oldval, table->data, len))
+ return -EFAULT;
+ if(put_user(0, ((char *) oldval) + len))
+ return -EFAULT;
+ if(put_user(len, oldlenp))
+ return -EFAULT;
+ }
+ }
+ if (newval && newlen) {
+ len = newlen;
+ if (len > table->maxlen)
+ len = table->maxlen;
+ if(copy_from_user(table->data, newval, len))
+ return -EFAULT;
+ if (len == table->maxlen)
+ len--;
+ ((char *) table->data)[len] = 0;
+ }
+ return 0;
+}
+
+/*
+ * This function makes sure that all of the integers in the vector
+ * are between the minimum and maximum values given in the arrays
+ * table->extra1 and table->extra2, respectively.
+ */
+int sysctl_intvec(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen, void **context)
+{
+ int i, length, *vec, *min, *max;
+
+ if (newval && newlen) {
+ if (newlen % sizeof(int) != 0)
+ return -EINVAL;
+
+ if (!table->extra1 && !table->extra2)
+ return 0;
+
+ if (newlen > table->maxlen)
+ newlen = table->maxlen;
+ length = newlen / sizeof(int);
+
+ vec = (int *) newval;
+ min = (int *) table->extra1;
+ max = (int *) table->extra2;
+
+ for (i = 0; i < length; i++) {
+ int value;
+ get_user(value, vec + i);
+ if (min && value < min[i])
+ return -EINVAL;
+ if (max && value > max[i])
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
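+/*
+ * A table entry that wants this checking supplies the bounds through
+ * extra1/extra2, exactly as the overflowuid entry in kern_table above:
+ *
+ *	{KERN_OVERFLOWUID, "overflowuid", &overflowuid, sizeof(int), 0644,
+ *	 NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL,
+ *	 &minolduid, &maxolduid},
+ */
+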
+/* Strategy function to convert jiffies to seconds */
+int sysctl_jiffies(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen, void **context)
+{
+ if (oldval) {
+ size_t olen;
+ if (oldlenp) {
+ if (get_user(olen, oldlenp))
+ return -EFAULT;
+ if (olen!=sizeof(int))
+ return -EINVAL;
+ }
+ if (put_user(*(int *)(table->data) / HZ, (int *)oldval) ||
+ (oldlenp && put_user(sizeof(int),oldlenp)))
+ return -EFAULT;
+ }
+ if (newval && newlen) {
+ int new;
+ if (newlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(new, (int *)newval))
+ return -EFAULT;
+ *(int *)(table->data) = new*HZ;
+ }
+ return 1;
+}
+
+
+#else /* CONFIG_SYSCTL */
+
+
+extern asmlinkage long sys_sysctl(struct __sysctl_args *args)
+{
+ return -ENOSYS;
+}
+
+int sysctl_string(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen, void **context)
+{
+ return -ENOSYS;
+}
+
+int sysctl_intvec(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen, void **context)
+{
+ return -ENOSYS;
+}
+
+int sysctl_jiffies(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen, void **context)
+{
+ return -ENOSYS;
+}
+
+int proc_dostring(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
+ struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ return -ENOSYS;
+}
+
+struct ctl_table_header * register_sysctl_table(ctl_table * table,
+ int insert_at_head)
+{
+ return 0;
+}
+
+void unregister_sysctl_table(struct ctl_table_header * table)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/time.c b/kernel/time.c
new file mode 100644
index 000000000000..ffad77ad6203
--- /dev/null
+++ b/kernel/time.c
@@ -0,0 +1,420 @@
+/*
+ * linux/kernel/time.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file contains the interface functions for the various
+ * time related system calls: time, stime, gettimeofday, settimeofday,
+ * adjtime
+ */
+/*
+ * Modification history kernel/time.c
+ *
+ * 1993-09-02 Philip Gladstone
+ * Created file with time related functions from sched.c and adjtimex()
+ * 1993-10-08 Torsten Duwe
+ * adjtime interface update and CMOS clock write code
+ * 1995-08-13 Torsten Duwe
+ * kernel PLL updated to 1994-12-13 specs (rfc-1589)
+ * 1999-01-16 Ulrich Windl
+ * Introduced error checking for many cases in adjtimex().
+ * Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
+ * (Even though the technical memorandum forbids it)
+ */
+
+#include <linux/mm.h>
+#include <linux/timex.h>
+#include <linux/smp_lock.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * The timezone where the local system is located. Used as a default by some
+ * programs that obtain this value using gettimeofday.
+ */
+struct timezone sys_tz;
+
+static void do_normal_gettime(struct timeval * tm)
+{
+ *tm=xtime;
+}
+
+void (*do_get_fast_time)(struct timeval *) = do_normal_gettime;
+
+/*
+ * Generic way to access 'xtime' (the current time of day).
+ * This can be changed if the platform provides a more accurate (and fast!)
+ * version.
+ */
+
+void get_fast_time(struct timeval * t)
+{
+ do_get_fast_time(t);
+}
+
+/* The xtime_lock not only serializes reads and writes of xtime, it now also
+ serializes all accesses to the global NTP variables. */
+extern rwlock_t xtime_lock;
+
+#if !defined(__alpha__) && !defined(__ia64__)
+
+/*
+ * sys_time() can be implemented in user-level using
+ * sys_gettimeofday(). Is this for backwards compatibility? If so,
+ * why not move it into the appropriate arch directory (for those
+ * architectures that need it).
+ *
+ * XXX This function is NOT 64-bit clean!
+ */
+asmlinkage long sys_time(int * tloc)
+{
+ int i;
+
+ /* SMP: This is fairly trivial. We grab CURRENT_TIME and
+ stuff it to user space. No side effects */
+ i = CURRENT_TIME;
+ if (tloc) {
+ if (put_user(i,tloc))
+ i = -EFAULT;
+ }
+ return i;
+}
+
+/*
+ * sys_stime() can be implemented in user-level using
+ * sys_settimeofday(). Is this for backwards compatibility? If so,
+ * why not move it into the appropriate arch directory (for those
+ * architectures that need it).
+ */
+
+asmlinkage long sys_stime(int * tptr)
+{
+ int value;
+
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+ if (get_user(value, tptr))
+ return -EFAULT;
+ write_lock_irq(&xtime_lock);
+ xtime.tv_sec = value;
+ xtime.tv_usec = 0;
+ time_adjust = 0; /* stop active adjtime() */
+ time_status |= STA_UNSYNC;
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_esterror = NTP_PHASE_LIMIT;
+ write_unlock_irq(&xtime_lock);
+ return 0;
+}
+
+#endif
+
+asmlinkage long sys_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ if (tv) {
+ struct timeval ktv;
+ do_gettimeofday(&ktv);
+ if (copy_to_user(tv, &ktv, sizeof(ktv)))
+ return -EFAULT;
+ }
+ if (tz) {
+ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Adjust the time obtained from the CMOS to be UTC time instead of
+ * local time.
+ *
+ * This is ugly, but preferable to the alternatives. Otherwise we
+ * would either need to write a program to do it in /etc/rc (and risk
+ * confusion if the program gets run more than once; it would also be
+ * hard to make the program warp the clock precisely n hours) or
+ * compile in the timezone information into the kernel. Bad, bad....
+ *
+ * - TYT, 1992-01-01
+ *
+ * The best thing to do is to keep the CMOS clock in universal time (UTC)
+ * as real UNIX machines always do it. This avoids all headaches about
+ * daylight saving times and warping kernel clocks.
+ */
+static inline void warp_clock(void)
+{
+ write_lock_irq(&xtime_lock);
+ xtime.tv_sec += sys_tz.tz_minuteswest * 60;
+ write_unlock_irq(&xtime_lock);
+}
+
+/*
+ * If for some reason the CMOS clock has not already been running
+ * in UTC, but in some local time: the first time we set the timezone,
+ * we will warp the clock so that it is ticking UTC time instead of
+ * local time. Presumably, if someone is setting the timezone then we
+ * are running in an environment where the programs understand about
+ * timezones. This should be done at boot time in the /etc/rc script,
+ * as soon as possible, so that the clock can be set right. Otherwise,
+ * various programs will get confused when the clock gets warped.
+ */
+
+int do_sys_settimeofday(struct timeval *tv, struct timezone *tz)
+{
+ static int firsttime = 1;
+
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (tz) {
+ /* SMP safe, global irq locking makes it work. */
+ sys_tz = *tz;
+ if (firsttime) {
+ firsttime = 0;
+ if (!tv)
+ warp_clock();
+ }
+ }
+ if (tv)
+ {
+ /* SMP safe, again the code in arch/foo/time.c should
+ * globally block out interrupts when it runs.
+ */
+ do_settimeofday(tv);
+ }
+ return 0;
+}
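+
+/*
+ * Editor's note: an illustrative user-space sketch, not part of the
+ * original source.  The boot-time warp described above is triggered by
+ * the first settimeofday() call that passes a timezone but a NULL
+ * timeval; the -60 (UTC+1) below is only an example value:
+ *
+ *	struct timezone tz = { -60, 0 };	(fields: tz_minuteswest, tz_dsttime)
+ *
+ *	if (settimeofday(NULL, &tz) < 0)
+ *		perror("settimeofday");
+ *
+ * After this call xtime has been shifted from local (CMOS) time to UTC.
+ */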
+
+asmlinkage long sys_settimeofday(struct timeval *tv, struct timezone *tz)
+{
+ struct timeval new_tv;
+ struct timezone new_tz;
+
+ if (tv) {
+ if (copy_from_user(&new_tv, tv, sizeof(*tv)))
+ return -EFAULT;
+ }
+ if (tz) {
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
+ return -EFAULT;
+ }
+
+ return do_sys_settimeofday(tv ? &new_tv : NULL, tz ? &new_tz : NULL);
+}
+
+long pps_offset; /* pps time offset (us) */
+long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */
+
+long pps_freq; /* frequency offset (scaled ppm) */
+long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
+
+long pps_valid = PPS_VALID; /* pps signal watchdog counter */
+
+int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
+
+long pps_jitcnt; /* jitter limit exceeded */
+long pps_calcnt; /* calibration intervals */
+long pps_errcnt; /* calibration errors */
+long pps_stbcnt; /* stability limit exceeded */
+
+/* hook for a loadable hardpps kernel module */
+void (*hardpps_ptr)(struct timeval *);
+
+/* adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int do_adjtimex(struct timex *txc)
+{
+ long ltemp, mtemp, save_adjust;
+ int result;
+
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ /* Now we validate the data before disabling interrupts */
+
+ if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
+ /* adjustment offset limited to +/- .512 seconds */
+ if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
+ return -EINVAL;
+
+ /* if the quartz is off by more than 10% something is VERY wrong ! */
+ if (txc->modes & ADJ_TICK)
+ if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ)
+ return -EINVAL;
+
+ write_lock_irq(&xtime_lock);
+ result = time_state; /* mostly `TIME_OK' */
+
+ /* Save for later - semantics of adjtime is to return old value */
+ save_adjust = time_adjust;
+
+#if 0 /* STA_CLOCKERR is never set yet */
+ time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
+#endif
+ /* If there are input parameters, then process them */
+ if (txc->modes)
+ {
+ if (txc->modes & ADJ_STATUS) /* only set allowed bits */
+ time_status = (txc->status & ~STA_RONLY) |
+ (time_status & STA_RONLY);
+
+ if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */
+ if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_freq = txc->freq - pps_freq;
+ }
+
+ if (txc->modes & ADJ_MAXERROR) {
+ if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_maxerror = txc->maxerror;
+ }
+
+ if (txc->modes & ADJ_ESTERROR) {
+ if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
+ result = -EINVAL;
+ goto leave;
+ }
+ time_esterror = txc->esterror;
+ }
+
+ if (txc->modes & ADJ_TIMECONST) { /* p. 24 */
+ if (txc->constant < 0) { /* NTP v4 uses values > 6 */
+ result = -EINVAL;
+ goto leave;
+ }
+ time_constant = txc->constant;
+ }
+
+ if (txc->modes & ADJ_OFFSET) { /* values checked earlier */
+ if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
+ /* adjtime() is independent from ntp_adjtime() */
+ time_adjust = txc->offset;
+ }
+ else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
+ ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
+ (STA_PPSTIME | STA_PPSSIGNAL) ?
+ pps_offset : txc->offset;
+
+ /*
+ * Scale the phase adjustment and
+ * clamp to the operating range.
+ */
+ if (ltemp > MAXPHASE)
+ time_offset = MAXPHASE << SHIFT_UPDATE;
+ else if (ltemp < -MAXPHASE)
+ time_offset = -(MAXPHASE << SHIFT_UPDATE);
+ else
+ time_offset = ltemp << SHIFT_UPDATE;
+
+ /*
+ * Select whether the frequency is to be controlled
+ * and in which mode (PLL or FLL). Clamp to the operating
+ * range. Ugly multiply/divide should be replaced someday.
+ */
+
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = xtime.tv_sec;
+ mtemp = xtime.tv_sec - time_reftime;
+ time_reftime = xtime.tv_sec;
+ if (time_status & STA_FLL) {
+ if (mtemp >= MINSEC) {
+ ltemp = (time_offset / mtemp) << (SHIFT_USEC -
+ SHIFT_UPDATE);
+ if (ltemp < 0)
+ time_freq -= -ltemp >> SHIFT_KH;
+ else
+ time_freq += ltemp >> SHIFT_KH;
+ } else /* calibration interval too short (p. 12) */
+ result = TIME_ERROR;
+ } else { /* PLL mode */
+ if (mtemp < MAXSEC) {
+ ltemp *= mtemp;
+ if (ltemp < 0)
+ time_freq -= -ltemp >> (time_constant +
+ time_constant +
+ SHIFT_KF - SHIFT_USEC);
+ else
+ time_freq += ltemp >> (time_constant +
+ time_constant +
+ SHIFT_KF - SHIFT_USEC);
+ } else /* calibration interval too long (p. 12) */
+ result = TIME_ERROR;
+ }
+ if (time_freq > time_tolerance)
+ time_freq = time_tolerance;
+ else if (time_freq < -time_tolerance)
+ time_freq = -time_tolerance;
+ } /* STA_PLL || STA_PPSTIME */
+ } /* txc->modes & ADJ_OFFSET */
+ if (txc->modes & ADJ_TICK) {
+ /* if the quartz is off by more than 10% something is
+ VERY wrong ! */
+ if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ) {
+ result = -EINVAL;
+ goto leave;
+ }
+ tick = txc->tick;
+ }
+ } /* txc->modes */
+leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
+ || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
+ && (time_status & STA_PPSSIGNAL) == 0)
+ /* p. 24, (b) */
+ || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+ == (STA_PPSTIME|STA_PPSJITTER))
+ /* p. 24, (c) */
+ || ((time_status & STA_PPSFREQ) != 0
+ && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
+ /* p. 24, (d) */
+ result = TIME_ERROR;
+
+ if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
+ txc->offset = save_adjust;
+ else {
+ if (time_offset < 0)
+ txc->offset = -(-time_offset >> SHIFT_UPDATE);
+ else
+ txc->offset = time_offset >> SHIFT_UPDATE;
+ }
+ txc->freq = time_freq + pps_freq;
+ txc->maxerror = time_maxerror;
+ txc->esterror = time_esterror;
+ txc->status = time_status;
+ txc->constant = time_constant;
+ txc->precision = time_precision;
+ txc->tolerance = time_tolerance;
+ txc->tick = tick;
+ txc->ppsfreq = pps_freq;
+ txc->jitter = pps_jitter >> PPS_AVG;
+ txc->shift = pps_shift;
+ txc->stabil = pps_stabil;
+ txc->jitcnt = pps_jitcnt;
+ txc->calcnt = pps_calcnt;
+ txc->errcnt = pps_errcnt;
+ txc->stbcnt = pps_stbcnt;
+ write_unlock_irq(&xtime_lock);
+ do_gettimeofday(&txc->time);
+ return(result);
+}
+
+asmlinkage long sys_adjtimex(struct timex *txc_p)
+{
+ struct timex txc; /* Local copy of parameter */
+ int ret;
+
+ /* Copy the user data space into the kernel copy
+ * structure. But bear in mind that the structures
+ * may change
+ */
+ if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
+ return -EFAULT;
+ ret = do_adjtimex(&txc);
+ return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
+}
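+
+/*
+ * Editor's note: an illustrative user-space sketch, not part of the
+ * original source.  Calling adjtimex() with modes == 0 performs no
+ * adjustment (and needs no privilege), it simply reads back the
+ * timekeeping state maintained above:
+ *
+ *	struct timex tx;
+ *	int state;
+ *
+ *	tx.modes = 0;
+ *	state = adjtimex(&tx);
+ *
+ * On return, state is the clock state (TIME_OK ... TIME_ERROR) and
+ * tx.offset, tx.freq, tx.status etc. hold the current kernel values.
+ */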
diff --git a/kernel/timer.c b/kernel/timer.c
new file mode 100644
index 000000000000..579b065f3f46
--- /dev/null
+++ b/kernel/timer.c
@@ -0,0 +1,837 @@
+/*
+ * linux/kernel/timer.c
+ *
+ * Kernel internal timers, kernel timekeeping, basic process system calls
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ *
+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ * Copyright (C) 1998 Andrea Arcangeli
+ * 1999-03-10 Improved NTP compatibility by Ulrich Windl
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/timex.h>
+#include <linux/delay.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Timekeeping variables
+ */
+
+long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */
+
+/* The current time */
+volatile struct timeval xtime __attribute__ ((aligned (16)));
+
+/* Don't completely fail for HZ > 500. */
+int tickadj = 500/HZ ? : 1; /* microsecs */
+
+DECLARE_TASK_QUEUE(tq_timer);
+DECLARE_TASK_QUEUE(tq_immediate);
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+int time_state = TIME_OK; /* clock synchronization status */
+int time_status = STA_UNSYNC; /* clock status bits */
+long time_offset; /* time adjustment (us) */
+long time_constant = 2; /* pll time constant */
+long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
+long time_precision = 1; /* clock precision (us) */
+long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
+long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
+long time_phase; /* phase offset (scaled us) */
+long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
+ /* frequency offset (scaled ppm)*/
+long time_adj; /* tick adjust (scaled 1 / HZ) */
+long time_reftime; /* time at last adjustment (s) */
+
+long time_adjust;
+long time_adjust_step;
+
+unsigned long event;
+
+extern int do_setitimer(int, struct itimerval *, struct itimerval *);
+
+unsigned long volatile jiffies;
+
+unsigned int * prof_buffer;
+unsigned long prof_len;
+unsigned long prof_shift;
+
+/*
+ * Event timer code
+ */
+#define TVN_BITS 6
+#define TVR_BITS 8
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+
+struct timer_vec {
+ int index;
+ struct list_head vec[TVN_SIZE];
+};
+
+struct timer_vec_root {
+ int index;
+ struct list_head vec[TVR_SIZE];
+};
+
+static struct timer_vec tv5;
+static struct timer_vec tv4;
+static struct timer_vec tv3;
+static struct timer_vec tv2;
+static struct timer_vec_root tv1;
+
+static struct timer_vec * const tvecs[] = {
+ (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
+};
+
+#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
+
+void init_timervecs (void)
+{
+ int i;
+
+ for (i = 0; i < TVN_SIZE; i++) {
+ INIT_LIST_HEAD(tv5.vec + i);
+ INIT_LIST_HEAD(tv4.vec + i);
+ INIT_LIST_HEAD(tv3.vec + i);
+ INIT_LIST_HEAD(tv2.vec + i);
+ }
+ for (i = 0; i < TVR_SIZE; i++)
+ INIT_LIST_HEAD(tv1.vec + i);
+}
+
+static unsigned long timer_jiffies;
+
+static inline void internal_add_timer(struct timer_list *timer)
+{
+ /*
+ * must be cli-ed when calling this
+ */
+ unsigned long expires = timer->expires;
+ unsigned long idx = expires - timer_jiffies;
+ struct list_head * vec;
+
+ if (idx < TVR_SIZE) {
+ int i = expires & TVR_MASK;
+ vec = tv1.vec + i;
+ } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+ int i = (expires >> TVR_BITS) & TVN_MASK;
+ vec = tv2.vec + i;
+ } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+ vec = tv3.vec + i;
+ } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
+ vec = tv4.vec + i;
+ } else if ((signed long) idx < 0) {
+ /* can happen if you add a timer with expires == jiffies,
+ * or you set a timer to go off in the past
+ */
+ vec = tv1.vec + tv1.index;
+ } else if (idx <= 0xffffffffUL) {
+ int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
+ vec = tv5.vec + i;
+ } else {
+ /* Can only get here on architectures with 64-bit jiffies */
+ INIT_LIST_HEAD(&timer->list);
+ return;
+ }
+ /*
+ * Timers are FIFO!
+ */
+ list_add(&timer->list, vec->prev);
+}
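+
+/*
+ * Editor's note: a worked example of the bucketing above, not part of
+ * the original source.  With TVR_BITS == 8 and TVN_BITS == 6, tv1 holds
+ * timers expiring 0..255 jiffies from now, tv2 up to 2^14 - 1, tv3 up
+ * to 2^20 - 1, tv4 up to 2^26 - 1, and tv5 the rest of the 32-bit
+ * range.  A timer 1000 jiffies in the future therefore lands in tv2 at
+ * slot (expires >> 8) & 63, and is cascaded down into tv1 as the wheel
+ * turns close enough to its expiry.
+ */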
+
+/* Initialize both explicitly - let's try to have them in the same cache line */
+spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
+
+#ifdef CONFIG_SMP
+volatile struct timer_list * volatile running_timer;
+#define timer_enter(t) do { running_timer = t; mb(); } while (0)
+#define timer_exit() do { running_timer = NULL; } while (0)
+#define timer_is_running(t) (running_timer == t)
+#define timer_synchronize(t) while (timer_is_running(t)) barrier()
+#else
+#define timer_enter(t) do { } while (0)
+#define timer_exit() do { } while (0)
+#endif
+
+void add_timer(struct timer_list *timer)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&timerlist_lock, flags);
+ if (timer_pending(timer))
+ goto bug;
+ internal_add_timer(timer);
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+ return;
+bug:
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+ printk("bug: kernel timer added twice at %p.\n",
+ __builtin_return_address(0));
+}
+
+static inline int detach_timer (struct timer_list *timer)
+{
+ if (!timer_pending(timer))
+ return 0;
+ list_del(&timer->list);
+ return 1;
+}
+
+int mod_timer(struct timer_list *timer, unsigned long expires)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&timerlist_lock, flags);
+ timer->expires = expires;
+ ret = detach_timer(timer);
+ internal_add_timer(timer);
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+ return ret;
+}
+
+int del_timer(struct timer_list * timer)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&timerlist_lock, flags);
+ ret = detach_timer(timer);
+ timer->list.next = timer->list.prev = NULL;
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+ return ret;
+}
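+
+/*
+ * Editor's note: an illustrative sketch of the usual way this API is
+ * driven, not part of the original source; my_timeout() and my_dev are
+ * hypothetical:
+ *
+ *	static struct timer_list my_timer;
+ *
+ *	init_timer(&my_timer);
+ *	my_timer.function = my_timeout;
+ *	my_timer.data = (unsigned long) my_dev;
+ *	my_timer.expires = jiffies + HZ;
+ *	add_timer(&my_timer);
+ *
+ * A pending timer can be pushed back with mod_timer() and cancelled
+ * with del_timer() (or del_timer_sync() on SMP, when the handler must
+ * not be running once the call returns).
+ */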
+
+#ifdef CONFIG_SMP
+void sync_timers(void)
+{
+ spin_unlock_wait(&global_bh_lock);
+}
+
+/*
+ * SMP-specific function to delete a periodic timer.
+ * The caller must ensure, by some means, that the timer cannot be
+ * restarted.  Upon return the timer is not queued and its handler is
+ * not running on any CPU.  The return value is the number of times the
+ * timer was deleted (for reference counting).
+ */
+
+int del_timer_sync(struct timer_list * timer)
+{
+ int ret = 0;
+
+ for (;;) {
+ unsigned long flags;
+ int running;
+
+ spin_lock_irqsave(&timerlist_lock, flags);
+ ret += detach_timer(timer);
+ timer->list.next = timer->list.prev = 0;
+ running = timer_is_running(timer);
+ spin_unlock_irqrestore(&timerlist_lock, flags);
+
+ if (!running)
+ break;
+
+ timer_synchronize(timer);
+ }
+
+ return ret;
+}
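+
+/*
+ * Editor's note, not part of the original source: the typical teardown
+ * pattern this enables (my_timer and my_dev as in the hypothetical
+ * sketch above) is
+ *
+ *	del_timer_sync(&my_timer);
+ *	kfree(my_dev);
+ *
+ * i.e. the data the handler touches is only freed after del_timer_sync()
+ * has guaranteed the handler is no longer running on any CPU.
+ */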
+#endif
+
+
+static inline void cascade_timers(struct timer_vec *tv)
+{
+ /* cascade all the timers from tv up one level */
+ struct list_head *head, *curr, *next;
+
+ head = tv->vec + tv->index;
+ curr = head->next;
+ /*
+ * We are removing _all_ timers from the list, so we don't have to
+ * detach them individually, just clear the list afterwards.
+ */
+ while (curr != head) {
+ struct timer_list *tmp;
+
+ tmp = list_entry(curr, struct timer_list, list);
+ next = curr->next;
+ list_del(curr); // not needed
+ internal_add_timer(tmp);
+ curr = next;
+ }
+ INIT_LIST_HEAD(head);
+ tv->index = (tv->index + 1) & TVN_MASK;
+}
+
+static inline void run_timer_list(void)
+{
+ spin_lock_irq(&timerlist_lock);
+ while ((long)(jiffies - timer_jiffies) >= 0) {
+ struct list_head *head, *curr;
+ if (!tv1.index) {
+ int n = 1;
+ do {
+ cascade_timers(tvecs[n]);
+ } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
+ }
+repeat:
+ head = tv1.vec + tv1.index;
+ curr = head->next;
+ if (curr != head) {
+ struct timer_list *timer;
+ void (*fn)(unsigned long);
+ unsigned long data;
+
+ timer = list_entry(curr, struct timer_list, list);
+ fn = timer->function;
+ data = timer->data;
+
+ detach_timer(timer);
+ timer->list.next = timer->list.prev = NULL;
+ timer_enter(timer);
+ spin_unlock_irq(&timerlist_lock);
+ fn(data);
+ spin_lock_irq(&timerlist_lock);
+ timer_exit();
+ goto repeat;
+ }
+ ++timer_jiffies;
+ tv1.index = (tv1.index + 1) & TVR_MASK;
+ }
+ spin_unlock_irq(&timerlist_lock);
+}
+
+spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;
+
+void tqueue_bh(void)
+{
+ run_task_queue(&tq_timer);
+}
+
+void immediate_bh(void)
+{
+ run_task_queue(&tq_immediate);
+}
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ */
+static void second_overflow(void)
+{
+ long ltemp;
+
+ /* Bump the maxerror field */
+ time_maxerror += time_tolerance >> SHIFT_USEC;
+ if ( time_maxerror > NTP_PHASE_LIMIT ) {
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_status |= STA_UNSYNC;
+ }
+
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The microtime() routine or
+ * external clock driver will ensure that reported time
+ * is always monotonic. The ugly divides should be
+ * replaced.
+ */
+ switch (time_state) {
+
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ case TIME_INS:
+ if (xtime.tv_sec % 86400 == 0) {
+ xtime.tv_sec--;
+ time_state = TIME_OOP;
+ printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
+ }
+ break;
+
+ case TIME_DEL:
+ if ((xtime.tv_sec + 1) % 86400 == 0) {
+ xtime.tv_sec++;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
+ }
+ break;
+
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+
+ /*
+ * Compute the phase adjustment for the next second. In
+ * PLL mode, the offset is reduced by a fixed factor
+ * times the time constant. In FLL mode the offset is
+ * used directly. In either mode, the maximum phase
+ * adjustment for each second is clamped so as to spread
+ * the adjustment over not more than the number of
+ * seconds between updates.
+ */
+ if (time_offset < 0) {
+ ltemp = -time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+ time_offset += ltemp;
+ time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+ } else {
+ ltemp = time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+ time_offset -= ltemp;
+ time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+ }
+
+ /*
+ * Compute the frequency estimate and additional phase
+ * adjustment due to frequency error for the next
+ * second. When the PPS signal is engaged, gnaw on the
+ * watchdog counter and update the frequency computed by
+ * the pll and the PPS signal.
+ */
+ pps_valid++;
+ if (pps_valid == PPS_VALID) { /* PPS signal lost */
+ pps_jitter = MAXTIME;
+ pps_stabil = MAXFREQ;
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ }
+ ltemp = time_freq + pps_freq;
+ if (ltemp < 0)
+ time_adj -= -ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+ else
+ time_adj += ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+#if HZ == 100
+ /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
+ * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
+ */
+ if (time_adj < 0)
+ time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
+ else
+ time_adj += (time_adj >> 2) + (time_adj >> 5);
+#endif
+}
+
+/* in the NTP reference this is called "hardclock()" */
+static void update_wall_time_one_tick(void)
+{
+ if ( (time_adjust_step = time_adjust) != 0 ) {
+ /* We are doing an adjtime thing.
+ *
+ * Prepare time_adjust_step to be within bounds.
+ * Note that a positive time_adjust means we want the clock
+ * to run faster.
+ *
+ * Limit the amount of the step to be in the range
+ * -tickadj .. +tickadj
+ */
+ if (time_adjust > tickadj)
+ time_adjust_step = tickadj;
+ else if (time_adjust < -tickadj)
+ time_adjust_step = -tickadj;
+
+ /* Reduce by this step the amount of time left */
+ time_adjust -= time_adjust_step;
+ }
+ xtime.tv_usec += tick + time_adjust_step;
+ /*
+ * Advance the phase, once it gets to one microsecond, then
+ * advance the tick more.
+ */
+ time_phase += time_adj;
+ if (time_phase <= -FINEUSEC) {
+ long ltemp = -time_phase >> SHIFT_SCALE;
+ time_phase += ltemp << SHIFT_SCALE;
+ xtime.tv_usec -= ltemp;
+ }
+ else if (time_phase >= FINEUSEC) {
+ long ltemp = time_phase >> SHIFT_SCALE;
+ time_phase -= ltemp << SHIFT_SCALE;
+ xtime.tv_usec += ltemp;
+ }
+}
+
+/*
+ * Using a loop looks inefficient, but "ticks" is
+ * usually just one (we shouldn't be losing ticks,
+ * we're doing it this way mainly for interrupt
+ * latency reasons, not because we think we'll
+ * have lots of lost timer ticks).
+ */
+static void update_wall_time(unsigned long ticks)
+{
+ do {
+ ticks--;
+ update_wall_time_one_tick();
+ } while (ticks);
+
+ if (xtime.tv_usec >= 1000000) {
+ xtime.tv_usec -= 1000000;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+}
+
+static inline void do_process_times(struct task_struct *p,
+ unsigned long user, unsigned long system)
+{
+ unsigned long psecs;
+
+ psecs = (p->times.tms_utime += user);
+ psecs += (p->times.tms_stime += system);
+ if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
+ /* Send SIGXCPU every second.. */
+ if (!(psecs % HZ))
+ send_sig(SIGXCPU, p, 1);
+ /* and SIGKILL when we go over max.. */
+ if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
+ send_sig(SIGKILL, p, 1);
+ }
+}
+
+static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
+{
+ unsigned long it_virt = p->it_virt_value;
+
+ if (it_virt) {
+ it_virt -= ticks;
+ if (!it_virt) {
+ it_virt = p->it_virt_incr;
+ send_sig(SIGVTALRM, p, 1);
+ }
+ p->it_virt_value = it_virt;
+ }
+}
+
+static inline void do_it_prof(struct task_struct *p)
+{
+ unsigned long it_prof = p->it_prof_value;
+
+ if (it_prof) {
+ if (--it_prof == 0) {
+ it_prof = p->it_prof_incr;
+ send_sig(SIGPROF, p, 1);
+ }
+ p->it_prof_value = it_prof;
+ }
+}
+
+void update_one_process(struct task_struct *p, unsigned long user,
+ unsigned long system, int cpu)
+{
+ p->per_cpu_utime[cpu] += user;
+ p->per_cpu_stime[cpu] += system;
+ do_process_times(p, user, system);
+ do_it_virt(p, user);
+ do_it_prof(p);
+}
+
+/*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process. user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+ struct task_struct *p = current;
+ int cpu = smp_processor_id(), system = user_tick ^ 1;
+
+ update_one_process(p, user_tick, system, cpu);
+ if (p->pid) {
+ if (--p->counter <= 0) {
+ p->counter = 0;
+ p->need_resched = 1;
+ }
+ if (p->nice > 0)
+ kstat.per_cpu_nice[cpu] += user_tick;
+ else
+ kstat.per_cpu_user[cpu] += user_tick;
+ kstat.per_cpu_system[cpu] += system;
+ } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+ kstat.per_cpu_system[cpu] += system;
+}
+
+/*
+ * Nr of active tasks - counted in fixed-point numbers
+ */
+static unsigned long count_active_tasks(void)
+{
+ struct task_struct *p;
+ unsigned long nr = 0;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p) {
+ if ((p->state == TASK_RUNNING ||
+ (p->state & TASK_UNINTERRUPTIBLE)))
+ nr += FIXED_1;
+ }
+ read_unlock(&tasklist_lock);
+ return nr;
+}
+
+/*
+ * Hmm.. Changed this, as the GNU make sources (load.c) seem to
+ * imply that avenrun[] is the standard name for this kind of thing.
+ * Nothing else seems to be standardized: the fractional size etc
+ * all seem to differ on different machines.
+ */
+unsigned long avenrun[3];
+
+static inline void calc_load(unsigned long ticks)
+{
+ unsigned long active_tasks; /* fixed-point */
+ static int count = LOAD_FREQ;
+
+ count -= ticks;
+ if (count < 0) {
+ count += LOAD_FREQ;
+ active_tasks = count_active_tasks();
+ CALC_LOAD(avenrun[0], EXP_1, active_tasks);
+ CALC_LOAD(avenrun[1], EXP_5, active_tasks);
+ CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ }
+}
+
+/* jiffies at the most recent update of wall time */
+unsigned long wall_jiffies;
+
+/*
+ * This spinlock protects us from SMP races while playing with xtime. -arca
+ */
+rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
+
+static inline void update_times(void)
+{
+ unsigned long ticks;
+
+ /*
+ * update_times() is run from the raw timer_bh handler so we
+ * just know that the irqs are locally enabled and so we don't
+ * need to save/restore the flags of the local CPU here. -arca
+ */
+ write_lock_irq(&xtime_lock);
+
+ ticks = jiffies - wall_jiffies;
+ if (ticks) {
+ wall_jiffies += ticks;
+ update_wall_time(ticks);
+ }
+ write_unlock_irq(&xtime_lock);
+ calc_load(ticks);
+}
+
+void timer_bh(void)
+{
+ update_times();
+ run_timer_list();
+}
+
+void do_timer(struct pt_regs *regs)
+{
+ (*(unsigned long *)&jiffies)++;
+#ifndef CONFIG_SMP
+ /* SMP process accounting uses the local APIC timer */
+
+ update_process_times(user_mode(regs));
+#endif
+ mark_bh(TIMER_BH);
+ if (TQ_ACTIVE(tq_timer))
+ mark_bh(TQUEUE_BH);
+}
+
+#if !defined(__alpha__) && !defined(__ia64__)
+
+/*
+ * For backwards compatibility? This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ */
+asmlinkage unsigned long sys_alarm(unsigned int seconds)
+{
+ struct itimerval it_new, it_old;
+ unsigned int oldalarm;
+
+ it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+ it_new.it_value.tv_sec = seconds;
+ it_new.it_value.tv_usec = 0;
+ do_setitimer(ITIMER_REAL, &it_new, &it_old);
+ oldalarm = it_old.it_value.tv_sec;
+ /* ehhh.. We can't return 0 if we have an alarm pending.. */
+ /* And we'd better return too much than too little anyway */
+ if (it_old.it_value.tv_usec)
+ oldalarm++;
+ return oldalarm;
+}
+
+#endif
+
+#ifndef __alpha__
+
+/*
+ * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
+ * should be moved into arch/i386 instead?
+ */
+
+asmlinkage long sys_getpid(void)
+{
+ /* This is SMP safe - current->tgid doesn't change */
+ return current->tgid;
+}
+
+/*
+ * This is not strictly SMP safe: p_opptr could change
+ * from under us. However, rather than getting any lock
+ * we can use an optimistic algorithm: get the parent
+ * pid, and go back and check that the parent is still
+ * the same. If it has changed (which is extremely unlikely
+ * indeed), we just try again..
+ *
+ * NOTE! This depends on the fact that even if we _do_
+ * get an old value of "parent", we can happily dereference
+ * the pointer: we just can't necessarily trust the result
+ * until we know that the parent pointer is valid.
+ *
+ * The "mb()" macro is a memory barrier - a synchronizing
+ * event. It also makes sure that gcc doesn't optimize
+ * away the necessary memory references.. The barrier doesn't
+ * have to have all that strong semantics: on x86 we don't
+ * really require a synchronizing instruction, for example.
+ * The barrier is more important for code generation than
+ * for any real memory ordering semantics (even if there is
+ * a small window for a race, using the old pointer is
+ * harmless for a while).
+ */
+asmlinkage long sys_getppid(void)
+{
+ int pid;
+ struct task_struct * me = current;
+ struct task_struct * parent;
+
+ parent = me->p_opptr;
+ for (;;) {
+ pid = parent->pid;
+#ifdef CONFIG_SMP
+{
+ struct task_struct *old = parent;
+ mb();
+ parent = me->p_opptr;
+ if (old != parent)
+ continue;
+}
+#endif
+ break;
+ }
+ return pid;
+}
+
+asmlinkage long sys_getuid(void)
+{
+ /* Only we change this so SMP safe */
+ return current->uid;
+}
+
+asmlinkage long sys_geteuid(void)
+{
+ /* Only we change this so SMP safe */
+ return current->euid;
+}
+
+asmlinkage long sys_getgid(void)
+{
+ /* Only we change this so SMP safe */
+ return current->gid;
+}
+
+asmlinkage long sys_getegid(void)
+{
+ /* Only we change this so SMP safe */
+ return current->egid;
+}
+
+#endif
+
+asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
+{
+ struct timespec t;
+ unsigned long expire;
+
+ if(copy_from_user(&t, rqtp, sizeof(struct timespec)))
+ return -EFAULT;
+
+ if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
+ return -EINVAL;
+
+
+ if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
+ current->policy != SCHED_OTHER)
+ {
+ /*
+ * Short delay requests up to 2 ms will be handled with
+ * high precision by a busy wait for all real-time processes.
+ *
+ * It's important on SMP not to do this while holding locks.
+ */
+ udelay((t.tv_nsec + 999) / 1000);
+ return 0;
+ }
+
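+	/*
+	 * Editor's note: the extra jiffy added for any non-zero request
+	 * below compensates for the partially elapsed current tick, so
+	 * that at least the requested time passes despite the jiffy
+	 * granularity.
+	 */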
+ expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
+
+ current->state = TASK_INTERRUPTIBLE;
+ expire = schedule_timeout(expire);
+
+ if (expire) {
+ if (rmtp) {
+ jiffies_to_timespec(expire, &t);
+ if (copy_to_user(rmtp, &t, sizeof(struct timespec)))
+ return -EFAULT;
+ }
+ return -EINTR;
+ }
+ return 0;
+}
+
diff --git a/kernel/uid16.c b/kernel/uid16.c
new file mode 100644
index 000000000000..f76e4fd706e5
--- /dev/null
+++ b/kernel/uid16.c
@@ -0,0 +1,163 @@
+/*
+ * Wrapper functions for 16-bit uid backwards compatibility. All nicely
+ * tied together in the faint hope we can take them out in five years' time.
+ */
+
+#include <linux/mm.h>
+#include <linux/utsname.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+
+#include <asm/uaccess.h>
+
+extern asmlinkage long sys_chown(const char *, uid_t,gid_t);
+extern asmlinkage long sys_lchown(const char *, uid_t,gid_t);
+extern asmlinkage long sys_fchown(unsigned int, uid_t,gid_t);
+extern asmlinkage long sys_setregid(gid_t, gid_t);
+extern asmlinkage long sys_setgid(gid_t);
+extern asmlinkage long sys_setreuid(uid_t, uid_t);
+extern asmlinkage long sys_setuid(uid_t);
+extern asmlinkage long sys_setresuid(uid_t, uid_t, uid_t);
+extern asmlinkage long sys_setresgid(gid_t, gid_t, gid_t);
+extern asmlinkage long sys_setfsuid(uid_t);
+extern asmlinkage long sys_setfsgid(gid_t);
+
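+/*
+ * Editor's note, not part of the original source: the low2high*() and
+ * high2low*() helpers used below come from <linux/highuid.h>.  Roughly,
+ * low2highuid() widens a 16-bit uid to the kernel's uid_t while keeping
+ * the special value -1 intact, and high2lowuid() narrows a uid back
+ * down, substituting the configured overflow uid for values that do not
+ * fit in 16 bits.
+ */
+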
+asmlinkage long sys_chown16(const char * filename, old_uid_t user, old_gid_t group)
+{
+ return sys_chown(filename, low2highuid(user), low2highgid(group));
+}
+
+asmlinkage long sys_lchown16(const char * filename, old_uid_t user, old_gid_t group)
+{
+ return sys_lchown(filename, low2highuid(user), low2highgid(group));
+}
+
+asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
+{
+ return sys_fchown(fd, low2highuid(user), low2highgid(group));
+}
+
+asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
+{
+ return sys_setregid(low2highgid(rgid), low2highgid(egid));
+}
+
+asmlinkage long sys_setgid16(old_gid_t gid)
+{
+ return sys_setgid((gid_t)gid);
+}
+
+asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
+{
+ return sys_setreuid(low2highuid(ruid), low2highuid(euid));
+}
+
+asmlinkage long sys_setuid16(old_uid_t uid)
+{
+ return sys_setuid((uid_t)uid);
+}
+
+asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
+{
+ return sys_setresuid(low2highuid(ruid), low2highuid(euid),
+ low2highuid(suid));
+}
+
+asmlinkage long sys_getresuid16(old_uid_t *ruid, old_uid_t *euid, old_uid_t *suid)
+{
+ int retval;
+
+ if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
+ !(retval = put_user(high2lowuid(current->euid), euid)))
+ retval = put_user(high2lowuid(current->suid), suid);
+
+ return retval;
+}
+
+asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
+{
+ return sys_setresgid(low2highgid(rgid), low2highgid(egid),
+ low2highgid(sgid));
+}
+
+asmlinkage long sys_getresgid16(old_gid_t *rgid, old_gid_t *egid, old_gid_t *sgid)
+{
+ int retval;
+
+ if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
+ !(retval = put_user(high2lowgid(current->egid), egid)))
+ retval = put_user(high2lowgid(current->sgid), sgid);
+
+ return retval;
+}
+
+asmlinkage long sys_setfsuid16(old_uid_t uid)
+{
+ return sys_setfsuid((uid_t)uid);
+}
+
+asmlinkage long sys_setfsgid16(old_gid_t gid)
+{
+ return sys_setfsgid((gid_t)gid);
+}
+
+asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t *grouplist)
+{
+ old_gid_t groups[NGROUPS];
+ int i,j;
+
+ if (gidsetsize < 0)
+ return -EINVAL;
+ i = current->ngroups;
+ if (gidsetsize) {
+ if (i > gidsetsize)
+ return -EINVAL;
+ for(j=0;j<i;j++)
+ groups[j] = current->groups[j];
+ if (copy_to_user(grouplist, groups, sizeof(old_gid_t)*i))
+ return -EFAULT;
+ }
+ return i;
+}
+
+asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t *grouplist)
+{
+ old_gid_t groups[NGROUPS];
+ int i;
+
+ if (!capable(CAP_SETGID))
+ return -EPERM;
+ if ((unsigned) gidsetsize > NGROUPS)
+ return -EINVAL;
+ if (copy_from_user(groups, grouplist, gidsetsize * sizeof(old_gid_t)))
+ return -EFAULT;
+ for (i = 0 ; i < gidsetsize ; i++)
+ current->groups[i] = (gid_t)groups[i];
+ current->ngroups = gidsetsize;
+ return 0;
+}
+
+asmlinkage long sys_getuid16(void)
+{
+ return high2lowuid(current->uid);
+}
+
+asmlinkage long sys_geteuid16(void)
+{
+ return high2lowuid(current->euid);
+}
+
+asmlinkage long sys_getgid16(void)
+{
+ return high2lowgid(current->gid);
+}
+
+asmlinkage long sys_getegid16(void)
+{
+ return high2lowgid(current->egid);
+}
diff --git a/kernel/user.c b/kernel/user.c
new file mode 100644
index 000000000000..be99b110e745
--- /dev/null
+++ b/kernel/user.c
@@ -0,0 +1,137 @@
+/*
+ * The "user cache".
+ *
+ * (C) Copyright 1991-2000 Linus Torvalds
+ *
+ * We have a per-user structure to keep track of how many
+ * processes, files etc the user has claimed, in order to be
+ * able to have per-user limits for system resources.
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * UID task count cache, to get fast user lookup in "alloc_uid"
+ * when changing user ID's (ie setuid() and friends).
+ */
+#define UIDHASH_BITS 8
+#define UIDHASH_SZ (1 << UIDHASH_BITS)
+#define UIDHASH_MASK (UIDHASH_SZ - 1)
+#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) ^ uid) & UIDHASH_MASK)
+#define uidhashentry(uid) (uidhash_table + __uidhashfn(uid))
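+
+/*
+ * Editor's note: a worked example of the hash above, not part of the
+ * original source.  For uid 1000: 1000 >> 8 == 3, 3 ^ 1000 == 1003 and
+ * 1003 & 255 == 235, so that user lands in bucket 235 of uidhash_table.
+ */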
+
+static kmem_cache_t *uid_cachep;
+static struct user_struct *uidhash_table[UIDHASH_SZ];
+static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
+
+struct user_struct root_user = {
+ __count: ATOMIC_INIT(1),
+ processes: ATOMIC_INIT(1),
+ files: ATOMIC_INIT(0)
+};
+
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
+static inline void uid_hash_insert(struct user_struct *up, struct user_struct **hashent)
+{
+ struct user_struct *next = *hashent;
+
+ up->next = next;
+ if (next)
+ next->pprev = &up->next;
+ up->pprev = hashent;
+ *hashent = up;
+}
+
+static inline void uid_hash_remove(struct user_struct *up)
+{
+ struct user_struct *next = up->next;
+ struct user_struct **pprev = up->pprev;
+
+ if (next)
+ next->pprev = pprev;
+ *pprev = next;
+}
+
+static inline struct user_struct *uid_hash_find(uid_t uid, struct user_struct **hashent)
+{
+ struct user_struct *next;
+
+ next = *hashent;
+ for (;;) {
+ struct user_struct *up = next;
+ if (next) {
+ next = up->next;
+ if (up->uid != uid)
+ continue;
+ atomic_inc(&up->__count);
+ }
+ return up;
+ }
+}
+
+void free_uid(struct user_struct *up)
+{
+ if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+ uid_hash_remove(up);
+ kmem_cache_free(uid_cachep, up);
+ spin_unlock(&uidhash_lock);
+ }
+}
+
+struct user_struct * alloc_uid(uid_t uid)
+{
+ struct user_struct **hashent = uidhashentry(uid);
+ struct user_struct *up;
+
+ spin_lock(&uidhash_lock);
+ up = uid_hash_find(uid, hashent);
+ spin_unlock(&uidhash_lock);
+
+ if (!up) {
+ struct user_struct *new;
+
+ new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
+ if (!new)
+ return NULL;
+ new->uid = uid;
+ atomic_set(&new->__count, 1);
+ atomic_set(&new->processes, 0);
+ atomic_set(&new->files, 0);
+
+ /*
+ * Before adding this, check whether another task raced with us
+ * and already added the same user.
+ */
+ spin_lock(&uidhash_lock);
+ up = uid_hash_find(uid, hashent);
+ if (up) {
+ kmem_cache_free(uid_cachep, new);
+ } else {
+ uid_hash_insert(new, hashent);
+ up = new;
+ }
+ spin_unlock(&uidhash_lock);
+
+ }
+ return up;
+}
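+
+/*
+ * Editor's note: an illustrative sketch of how a caller that switches a
+ * task's uid might use this cache, not part of the original source;
+ * new_ruid is hypothetical and error handling is omitted:
+ *
+ *	struct user_struct *new_user = alloc_uid(new_ruid);
+ *
+ *	if (new_user) {
+ *		atomic_inc(&new_user->processes);
+ *		atomic_dec(&current->user->processes);
+ *		free_uid(current->user);
+ *		current->user = new_user;
+ *	}
+ *
+ * alloc_uid() returns with a reference held; free_uid() drops one
+ * reference and tears the structure down when the count reaches zero.
+ */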
+
+
+static int __init uid_cache_init(void)
+{
+ uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
+ 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if(!uid_cachep)
+ panic("Cannot create uid taskcount SLAB cache\n");
+
+ /* Insert the root user immediately - init already runs with this */
+ uid_hash_insert(&root_user, uidhashentry(0));
+ return 0;
+}
+
+module_init(uid_cache_init);