From f1577452f18e893e59bba53b3bda4fab17cc66c6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 22 Aug 2004 22:26:41 -0700 Subject: [PATCH] fix reading string module parameters in sysfs Reading the contents of a module_param_string through sysfs currently oopses because the param_get_charp() function cannot operate on a kparam_string struct. This introduces the required param_get_string. Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 5f38ee74a637..711a76e6bc2f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -339,6 +339,12 @@ int param_set_copystring(const char *val, struct kernel_param *kp) return 0; } +int param_get_string(char *buffer, struct kernel_param *kp) +{ + struct kparam_string *kps = kp->arg; + return strlcpy(buffer, kps->string, kps->maxlen); +} + EXPORT_SYMBOL(param_set_short); EXPORT_SYMBOL(param_get_short); EXPORT_SYMBOL(param_set_ushort); @@ -360,3 +366,4 @@ EXPORT_SYMBOL(param_get_invbool); EXPORT_SYMBOL(param_array_set); EXPORT_SYMBOL(param_array_get); EXPORT_SYMBOL(param_set_copystring); +EXPORT_SYMBOL(param_get_string); -- cgit v1.2.3 From 7f2b65bda8e72e07ca272c7b3078b4df972a46ea Mon Sep 17 00:00:00 2001 From: Akiyama Nobuyuki Date: Sun, 22 Aug 2004 22:26:54 -0700 Subject: [PATCH] NMI trigger switch support for debugging (updated) I made a patch that supports debugging with the help of the NMI trigger switch. When the kernel hangs severely, keyboard operation (e.g. Ctrl-Alt-Del) doesn't work properly. This patch enables debugging information to be displayed on the console in that case. I think this feature is necessary as standard functionality. Please feel free to use this patch and let me know if you have any comments. Background: When trouble occurs in the kernel, we usually begin to investigate with the following information: - panic >> panic message. - oops >> CPU registers and stack trace. - hang >> **NONE** no standard method established. How it works: Most IA32 servers have an NMI switch that raises an NMI interrupt. The NMI interrupt can interrupt the CPU even when the kernel is in a serious state, for example a deadlock with interrupts disabled. When the NMI switch is pressed after this feature is activated, CPU registers and stack trace are displayed on the console and then a panic occurs. This feature is activated or deactivated with sysctl. On the IA32 architecture, only the following are defined as reasons for an NMI interrupt: - memory parity error - I/O check error The reason code of the NMI switch is not defined, so this patch assumes that all undefined NMI interrupts are raised by the NMI switch. However, oprofile and the NMI watchdog also use undefined NMI interrupts, so this feature cannot be used at the same time as either of them: it hands the NMI interrupt over to oprofile and the NMI watchdog, and when they have been activated this feature does not work even if it is enabled. Supported architecture: IA32 Setup: Set the system control parameter as follows: # sysctl -w kernel.unknown_nmi_panic=1 kernel.unknown_nmi_panic = 1 If the NMI switch is then pressed, CPU registers and stack trace will be displayed on the console and then a panic occurs.
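The same flag is also visible through procfs; assuming the standard sysctl-to-/proc mapping for KERN_* entries, this is equivalent:

    # echo 1 > /proc/sys/kernel/unknown_nmi_panic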
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 17 ++++++++++ arch/i386/kernel/nmi.c | 68 ++++++++++++++++++++++++++++---------- arch/i386/kernel/traps.c | 21 ++++++++++++ include/linux/sysctl.h | 1 + kernel/sysctl.c | 16 +++++++++ 5 files changed, 106 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 9d626596d5b4..186f38c13af1 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1109,6 +1109,23 @@ modprobe The location where the modprobe binary is located. The kernel uses this program to load modules on demand. +unknown_nmi_panic +----------------- + +The value in this file affects behavior of handling NMI. When the value is +non-zero, unknown NMI is trapped and then panic occurs. At that time, kernel +debugging information is displayed on console. + +NMI switch that most IA32 servers have fires unknown NMI up, for example. +If a system hangs up, try pressing the NMI switch. + +[NOTE] + This function and oprofile share a NMI callback. Therefore this function + cannot be enabled when oprofile is activated. + And NMI watchdog will be disabled when the value in this file is set to + non-zero. + + 2.4 /proc/sys/vm - The virtual memory subsystem ----------------------------------------------- diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 16d91e3e138f..abcd993cbcd9 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -25,13 +25,17 @@ #include #include #include +#include #include #include #include #include +#include "mach_traps.h" + unsigned int nmi_watchdog = NMI_NONE; +extern int unknown_nmi_panic; static unsigned int nmi_hz = HZ; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; @@ -426,8 +430,6 @@ void setup_apic_nmi_watchdog (void) nmi_active = 1; } -static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED; - /* * the best way to detect whether a CPU has a 'hard lockup' problem * is to check it's local APIC timer IRQ counts. If they are not @@ -459,6 +461,8 @@ void touch_nmi_watchdog (void) alert_counter[i] = 0; } +extern void die_nmi(struct pt_regs *, const char *msg); + void nmi_watchdog_tick (struct pt_regs * regs) { @@ -477,21 +481,8 @@ void nmi_watchdog_tick (struct pt_regs * regs) * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) { - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out. 
- */ - bust_spinlocks(1); - printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); - show_registers(regs); - printk("console shuts up ...\n"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - do_exit(SIGSEGV); - } + if (alert_counter[cpu] == 5*nmi_hz) + die_nmi(regs, "NMI Watchdog detected LOCKUP"); } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; @@ -518,6 +509,49 @@ void nmi_watchdog_tick (struct pt_regs * regs) } } +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + if (!(reason & 0xc0)) { + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(regs, buf); + } + return 0; +} + +/* + * proc handler for /proc/sys/kernel/unknown_nmi_panic + */ +int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + old_state = unknown_nmi_panic; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!unknown_nmi_panic) + return 0; + + if (unknown_nmi_panic) { + if (reserve_lapic_nmi() < 0) { + unknown_nmi_panic = 0; + return -EBUSY; + } else { + set_nmi_callback(unknown_nmi_panic_callback); + } + } else { + release_lapic_nmi(); + unset_nmi_callback(); + } + return 0; +} + +#endif + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(reserve_lapic_nmi); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index a3daea0137a1..f6454084e9cf 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -496,6 +496,27 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) printk("Do you have a strange power saving mode enabled?\n"); } +static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED; + +void die_nmi (struct pt_regs *regs, const char *msg) +{ + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out. 
+ */ + bust_spinlocks(1); + printk(msg); + printk(" on CPU%d, eip %08lx, registers:\n", + smp_processor_id(), regs->eip); + show_registers(regs); + printk("console shuts up ...\n"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + do_exit(SIGSEGV); +} + static void default_do_nmi(struct pt_regs * regs) { unsigned char reason = get_nmi_reason(); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 70ac59e6a41f..2ae5058a051d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -133,6 +133,7 @@ enum KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */ KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ + KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3905df6f026f..2c5c58279f81 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,6 +65,12 @@ extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; +#if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__) +int unknown_nmi_panic; +extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +#endif + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; @@ -620,6 +626,16 @@ static ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, +#if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_unknown_nmi_panic, + }, +#endif { .ctl_name = 0 } }; -- cgit v1.2.3 From e5f29853e9797a740708e0d38cecda5be168ac8a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Sun, 22 Aug 2004 22:46:30 -0700 Subject: [PATCH] Enable all events for initramfs Currently most driver events are not sent out when using initramfs as driver_init() (which triggers the events) is called before init_workqueues. This patch rearranges the init calls so that the hotplug event queue is enabled prior to calling driver_init(), hence we're getting all hotplug events again. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 1 + init/main.c | 6 +++++- kernel/kmod.c | 4 +--- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index a484f52445cb..588f4c6ebe29 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -35,6 +35,7 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; } #define try_then_request_module(x, mod...) 
((x) ?: (request_module(mod), (x))) extern int call_usermodehelper(char *path, char *argv[], char *envp[], int wait); +extern void usermodehelper_init(void); #ifdef CONFIG_HOTPLUG extern char hotplug_path []; diff --git a/init/main.c b/init/main.c index e62a5694183d..46fe75c96308 100644 --- a/init/main.c +++ b/init/main.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -605,6 +606,10 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { + /* drivers will send hotplug events */ + init_workqueues(); + usermodehelper_init(); + driver_init(); #ifdef CONFIG_SYSCTL @@ -614,7 +619,6 @@ static void __init do_basic_setup(void) /* Networking initialization needs a process context */ sock_init(); - init_workqueues(); do_initcalls(); } diff --git a/kernel/kmod.c b/kernel/kmod.c index 579269c38a3b..5e7c44a0cbaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -272,10 +272,8 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) } EXPORT_SYMBOL(call_usermodehelper); -static __init int usermodehelper_init(void) +void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); - return 0; } -core_initcall(usermodehelper_init); -- cgit v1.2.3 From 6cfa4c9fd86139036d1b03578235c607a1b94d0a Mon Sep 17 00:00:00 2001 From: Christian Bornträger Date: Sun, 22 Aug 2004 22:46:53 -0700 Subject: [PATCH] remove sync() from panic Various people have reported deadlocks, and it has always seemed a bit risky to try to sync the filesystems at this stage anyway. "I have seen panic failing two times lately on an SMP system. The box panic'ed but was running happily on the other cpus. The culprit of this failure is the fact that these panics have been caused by a block device or a filesystem (e.g. using errors=panic). In these cases the likelihood of a failure/hang of sys_sync() is high. This is exactly what happened in both cases I have seen. Meanwhile the other cpus are happily continuing destroying data, as the kernel has a severe problem but it's not aware of that, since smp_send_stop happens after sys_sync." Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 3c1581eb65bd..b3abe97f88a6 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -59,13 +59,7 @@ NORET_TYPE void panic(const char * fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_EMERG "Kernel panic: %s\n",buf); - if (in_interrupt()) - printk(KERN_EMERG "In interrupt handler - not syncing\n"); - else if (!current->pid) - printk(KERN_EMERG "In idle task - not syncing\n"); - else - sys_sync(); + printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); bust_spinlocks(0); #ifdef CONFIG_SMP -- cgit v1.2.3 From 346ed9c13aab9be890a8bd52d06fd6761876bb85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Sun, 22 Aug 2004 22:47:04 -0700 Subject: [PATCH] Move cache_reap out of timer context I'm submitting two patches associated with moving cache_reap functionality out of timer context. Note that these patches do not make any further optimizations to cache_reap at this time. The first patch adds a function similar to schedule_delayed_work to allow work to be scheduled on another cpu. The second patch uses schedule_delayed_work_on to schedule cache_reap to run from keventd.
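As a usage sketch (my_work and my_func are illustrative names, not part of the patch), a caller pins periodic work to a CPU like this, mirroring what the slab change below does:

    static struct work_struct my_work;

    static void my_func(void *unused)
    {
            /* runs from keventd, on the CPU the work was queued on */
    }

    INIT_WORK(&my_work, my_func, NULL);
    schedule_delayed_work_on(cpu, &my_work, HZ);    /* run on `cpu' in ~1 second */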
Signed-off-by: Dimitri Sivanich Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 20 +++++++++++++ mm/slab.c | 75 ++++++++++++++++------------------------------- 3 files changed, 47 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 50633a827900..d37b664363b1 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -63,6 +63,8 @@ extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq)); extern int FASTCALL(schedule_work(struct work_struct *work)); extern int FASTCALL(schedule_delayed_work(struct work_struct *work, unsigned long delay)); + +extern int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay); extern void flush_scheduled_work(void); extern int current_is_keventd(void); extern int keventd_up(void); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987fbc2986d8..3f559661ee19 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -398,6 +398,26 @@ int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay return queue_delayed_work(keventd_wq, work, delay); } +int schedule_delayed_work_on(int cpu, + struct work_struct *work, unsigned long delay) +{ + int ret = 0; + struct timer_list *timer = &work->timer; + + if (!test_and_set_bit(0, &work->pending)) { + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + /* This stores keventd_wq for the moment, for the timer_fn */ + work->wq_data = keventd_wq; + timer->expires = jiffies + delay; + timer->data = (unsigned long)work; + timer->function = delayed_work_timer_fn; + add_timer_on(timer, cpu); + ret = 1; + } + return ret; +} + void flush_scheduled_work(void) { flush_workqueue(keventd_wq); diff --git a/mm/slab.c b/mm/slab.c index 34d9e5b5ebfa..74653f89ea62 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -519,11 +519,11 @@ enum { FULL } g_cpucache_up; -static DEFINE_PER_CPU(struct timer_list, reap_timers); +static DEFINE_PER_CPU(struct work_struct, reap_work); -static void reap_timer_fnc(unsigned long data); static void free_block(kmem_cache_t* cachep, void** objpp, int len); static void enable_cpucache (kmem_cache_t *cachep); +static void cache_reap (void *unused); static inline void ** ac_entry(struct array_cache *ac) { @@ -573,35 +573,26 @@ static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) } /* - * Start the reap timer running on the target CPU. We run at around 1 to 2Hz. - * Add the CPU number into the expiry time to minimize the possibility of the - * CPUs getting into lockstep and contending for the global cache chain lock. + * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz + * via the workqueue/eventd. + * Add the CPU number into the expiration time to minimize the possibility of + * the CPUs getting into lockstep and contending for the global cache chain + * lock. 
*/ static void __devinit start_cpu_timer(int cpu) { - struct timer_list *rt = &per_cpu(reap_timers, cpu); + struct work_struct *reap_work = &per_cpu(reap_work, cpu); - if (rt->function == NULL) { - init_timer(rt); - rt->expires = jiffies + HZ + 3*cpu; - rt->data = cpu; - rt->function = reap_timer_fnc; - add_timer_on(rt, cpu); - } -} - -#ifdef CONFIG_HOTPLUG_CPU -static void stop_cpu_timer(int cpu) -{ - struct timer_list *rt = &per_cpu(reap_timers, cpu); - - if (rt->function) { - del_timer_sync(rt); - WARN_ON(timer_pending(rt)); - rt->function = NULL; + /* + * When this gets called from do_initcalls via cpucache_init(), + * init_workqueues() has already run, so keventd will be setup + * at that time. + */ + if (keventd_up() && reap_work->func == NULL) { + INIT_WORK(reap_work, cache_reap, NULL); + schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); } } -#endif static struct array_cache *alloc_arraycache(int cpu, int entries, int batchcount) { @@ -654,7 +645,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: - stop_cpu_timer(cpu); /* fall thru */ case CPU_UP_CANCELED: down(&cache_chain_sem); @@ -2674,24 +2664,23 @@ static void drain_array_locked(kmem_cache_t *cachep, /** * cache_reap - Reclaim memory from caches. * - * Called from a timer, every few seconds + * Called from workqueue/eventd every few seconds. * Purpose: * - clear the per-cpu caches for this CPU. * - return freeable pages to the main free memory pool. * * If we cannot acquire the cache chain semaphore then just give up - we'll - * try again next timer interrupt. + * try again on the next iteration. */ -static void cache_reap (void) +static void cache_reap(void *unused) { struct list_head *walk; -#if DEBUG - BUG_ON(!in_interrupt()); - BUG_ON(in_irq()); -#endif - if (down_trylock(&cache_chain_sem)) + if (down_trylock(&cache_chain_sem)) { + /* Give up. Setup the next iteration. */ + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); return; + } list_for_each(walk, &cache_chain) { kmem_cache_t *searchp; @@ -2755,22 +2744,8 @@ next: } check_irq_on(); up(&cache_chain_sem); -} - -/* - * This is a timer handler. There is one per CPU. It is called periodially - * to shrink this CPU's caches. Otherwise there could be memory tied up - * for long periods (or for ever) due to load changes. - */ -static void reap_timer_fnc(unsigned long cpu) -{ - struct timer_list *rt = &__get_cpu_var(reap_timers); - - /* CPU hotplug can drag us off cpu: don't run on wrong CPU */ - if (!cpu_is_offline(cpu)) { - cache_reap(); - mod_timer(rt, jiffies + REAPTIMEOUT_CPUC + cpu); - } + /* Setup the next iteration */ + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); } #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From b9cbc585eda36c84edbf05c0e83e3bf950ff45fa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 22 Aug 2004 22:47:16 -0700 Subject: [PATCH] gettimeofday nanoseconds patch This issue was discussed on lkml and linux-ia64. The patch introduces "getnstimeofday" and removes all the code scaling gettimeofday to nanoseconds. It makes it possible for the posix-timer functions to return higher accuracy.
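For example (a minimal sketch of the new interface; the posix-timers conversion below follows the same pattern), a caller that used to scale a struct timeval by hand now reads nanoseconds directly:

    struct timespec ts;

    /* old: do_gettimeofday(&tv); ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC; */
    getnstimeofday(&ts);    /* fills ts.tv_sec and ts.tv_nsec */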
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/time.h | 1 + kernel/posix-timers.c | 17 ++--------------- kernel/time.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 3 +-- 4 files changed, 42 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index d24a690cbd04..de41e12bbbff 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -348,6 +348,7 @@ extern long do_utimes(char __user * filename, struct timeval * times); struct itimerval; extern int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue); extern int do_getitimer(int which, struct itimerval *value); +extern void getnstimeofday (struct timespec *tv); static inline void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 42c24868837c..f2d93f698e4f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1168,15 +1168,10 @@ void exit_itimers(struct signal_struct *sig) */ static int do_posix_gettime(struct k_clock *clock, struct timespec *tp) { - struct timeval tv; - if (clock->clock_get) return clock->clock_get(tp); - do_gettimeofday(&tv); - tp->tv_sec = tv.tv_sec; - tp->tv_nsec = tv.tv_usec * NSEC_PER_USEC; - + getnstimeofday(tp); return 0; } @@ -1192,24 +1187,16 @@ static u64 do_posix_clock_monotonic_gettime_parts( struct timespec *tp, struct timespec *mo) { u64 jiff; - struct timeval tpv; unsigned int seq; do { seq = read_seqbegin(&xtime_lock); - do_gettimeofday(&tpv); + getnstimeofday(tp); *mo = wall_to_monotonic; jiff = jiffies_64; } while(read_seqretry(&xtime_lock, seq)); - /* - * Love to get this before it is converted to usec. - * It would save a div AND a mpy. - */ - tp->tv_sec = tpv.tv_sec; - tp->tv_nsec = tpv.tv_usec * NSEC_PER_USEC; - return jiff; } diff --git a/kernel/time.c b/kernel/time.c index 68c22f2bf452..b0bdef2b7f20 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -22,6 +22,9 @@ * "A Kernel Model for Precision Timekeeping" by Dave Mills * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy */ #include @@ -421,6 +424,41 @@ struct timespec current_kernel_time(void) EXPORT_SYMBOL(current_kernel_time); +#ifdef CONFIG_TIME_INTERPOLATION +void getnstimeofday (struct timespec *tv) +{ + unsigned long seq,sec,nsec; + + do { + seq = read_seqbegin(&xtime_lock); + sec = xtime.tv_sec; + nsec = xtime.tv_nsec+time_interpolator_get_offset(); + } while (unlikely(read_seqretry(&xtime_lock, seq))); + + while (unlikely(nsec >= NSEC_PER_SEC)) { + nsec -= NSEC_PER_SEC; + ++sec; + } + tv->tv_sec = sec; + tv->tv_nsec = nsec; +} +#else +/* + * Simulate gettimeofday using do_gettimeofday which only allows a timeval + * and therefore only yields usec accuracy + */ +void getnstimeofday(struct timespec *tv) +{ + struct timeval x; + + do_gettimeofday(&x); + tv->tv_sec = x.tv_sec; + tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; +} +#endif + +EXPORT_SYMBOL(getnstimeofday); + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/timer.c b/kernel/timer.c index 4850abbeacdc..79db45d06bac 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1241,8 +1241,7 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) * too. 
*/ - do_gettimeofday((struct timeval *)&tp); - tp.tv_nsec *= NSEC_PER_USEC; + getnstimeofday(&tp); tp.tv_sec += wall_to_monotonic.tv_sec; tp.tv_nsec += wall_to_monotonic.tv_nsec; if (tp.tv_nsec - NSEC_PER_SEC >= 0) { -- cgit v1.2.3 From 683b229286b429244f35726b3c18caec429233bd Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 22 Aug 2004 22:56:21 -0700 Subject: [PATCH] vprintk support Add a vprintk call. This lets us pass varargs straight through to the console code without first using vsnprintf on an intermediate buffer. Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + kernel/printk.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c4c862629d84..a1cf3568a64e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -97,6 +97,7 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int session_of_pgrp(int pgrp); +asmlinkage int vprintk(const char *fmt, va_list args); asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))); diff --git a/kernel/printk.c b/kernel/printk.c index 8b28dd2b4a98..2162a42c09d2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -508,6 +508,17 @@ static void zap_locks(void) asmlinkage int printk(const char *fmt, ...) { va_list args; + int r; + + va_start(args, fmt); + r = vprintk(fmt, args); + va_end(args); + + return r; +} + +asmlinkage int vprintk(const char *fmt, va_list args) +{ unsigned long flags; int printed_len; char *p; @@ -521,9 +532,7 @@ asmlinkage int printk(const char *fmt, ...) spin_lock_irqsave(&logbuf_lock, flags); /* Emit the output into the temporary buffer */ - va_start(args, fmt); printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); - va_end(args); /* * Copy the output into log_buf. If the caller didn't provide @@ -575,6 +584,7 @@ out: return printed_len; } EXPORT_SYMBOL(printk); +EXPORT_SYMBOL(vprintk); /** * acquire_console_sem - lock the console system for exclusive use. -- cgit v1.2.3 From e66c6753333b9c2b1cf5daf0b73814b5fdb09c41 Mon Sep 17 00:00:00 2001 From: Rajesh Venkatasubramanian Date: Sun, 22 Aug 2004 22:56:55 -0700 Subject: [PATCH] prio_tree: kill vma_prio_tree_init() vma_prio_tree_insert() relies on the fact that the vma was vma_prio_tree_init()'ed. The content of vma->shared should be considered undefined until this vma is inserted into i_mmap/i_mmap_nonlinear. It's better to do proper initialization in vma_prio_tree_add/insert.
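Condensed from the hunks below, the caller-side effect is:

    /* before: every caller had to pre-initialize vma->shared */
    vma_prio_tree_init(vma);
    vma_prio_tree_insert(vma, root);

    /* after: insert initializes the fields it relies on */
    vma_prio_tree_insert(vma, root);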
Signed-off-by: Oleg Nesterov Signed-off-by: Rajesh Venkatasubramanian Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++-------- kernel/fork.c | 1 - mm/fremap.c | 4 +--- mm/mmap.c | 10 ++-------- mm/prio_tree.c | 5 +++++ 5 files changed, 15 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c584ccededa..0e30d4079c8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -598,14 +598,6 @@ extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -static inline void vma_prio_tree_init(struct vm_area_struct *vma) -{ - vma->shared.vm_set.list.next = NULL; - vma->shared.vm_set.list.prev = NULL; - vma->shared.vm_set.parent = NULL; - vma->shared.vm_set.head = NULL; -} - /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); @@ -614,6 +606,13 @@ struct vm_area_struct *vma_prio_tree_next( struct vm_area_struct *, struct prio_tree_root *, struct prio_tree_iter *, pgoff_t begin, pgoff_t end); +static inline void vma_nonlinear_insert(struct vm_area_struct *vma, + struct list_head *list) +{ + vma->shared.vm_set.parent = NULL; + list_add_tail(&vma->shared.vm_set.list, list); +} + /* mmap.c */ extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); diff --git a/kernel/fork.c b/kernel/fork.c index 8e28c1fec202..233dd7f190d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -324,7 +324,6 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) tmp->vm_mm = mm; tmp->vm_next = NULL; anon_vma_link(tmp); - vma_prio_tree_init(tmp); file = tmp->vm_file; if (file) { struct inode *inode = file->f_dentry->d_inode; diff --git a/mm/fremap.c b/mm/fremap.c index dc64dd9dbe0b..fcd615827159 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -220,9 +220,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; vma_prio_tree_remove(vma, &mapping->i_mmap); - vma_prio_tree_init(vma); - list_add_tail(&vma->shared.vm_set.list, - &mapping->i_mmap_nonlinear); + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); spin_unlock(&mapping->i_mmap_lock); } diff --git a/mm/mmap.c b/mm/mmap.c index 68ea8ab1f72f..3f7495c75228 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -279,8 +279,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma) flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_add_tail(&vma->shared.vm_set.list, - &mapping->i_mmap_nonlinear); + vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else vma_prio_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); @@ -449,11 +448,8 @@ again: remove_next = 1 + (end > next->vm_end); } if (root) { - if (adjust_next) { - vma_prio_tree_init(next); + if (adjust_next) vma_prio_tree_insert(next, root); - } - vma_prio_tree_init(vma); vma_prio_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } @@ -1489,7 +1485,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, /* most fields are the same, copy all, and then fixup */ *new = *vma; - vma_prio_tree_init(new); if (new_below) new->vm_end = addr; @@ -1802,7 +1797,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = 
kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (new_vma) { *new_vma = *vma; - vma_prio_tree_init(new_vma); pol = mpol_copy(vma_policy(vma)); if (IS_ERR(pol)) { kmem_cache_free(vm_area_cachep, new_vma); diff --git a/mm/prio_tree.c b/mm/prio_tree.c index 6cd41a831e2d..7608735b39b8 100644 --- a/mm/prio_tree.c +++ b/mm/prio_tree.c @@ -538,6 +538,9 @@ void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); + vma->shared.vm_set.head = NULL; + vma->shared.vm_set.parent = NULL; + if (!old->shared.vm_set.parent) list_add(&vma->shared.vm_set.list, &old->shared.vm_set.list); @@ -557,6 +560,8 @@ void vma_prio_tree_insert(struct vm_area_struct *vma, struct prio_tree_node *ptr; struct vm_area_struct *old; + vma->shared.vm_set.head = NULL; + ptr = prio_tree_insert(root, &vma->shared.prio_tree_node); if (ptr != &vma->shared.prio_tree_node) { old = prio_tree_entry(ptr, struct vm_area_struct, -- cgit v1.2.3 From bcce63134d465edc461f81ef23627cde4227e05a Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Sun, 22 Aug 2004 22:57:19 -0700 Subject: [PATCH] RCU - cpu-offline-cleanup There is a series of patches in my tree and these 3 are the first ones that should probably be merged down the road. Descriptions are on top of the patches. Please include them in -mm. A lot of RCU code will be cleaned up later in order to support call_rcu_bh(), the separate RCU interface that considers softirq handler completion a quiescent state. This patch: Minor cleanup of the hotplug code to remove #ifdef in cpu event notifier handler. If CONFIG_HOTPLUG_CPU is not defined, CPU_DEAD case will be optimized off. Signed-off-by: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index b331fe3f64e9..97cb4eaa1019 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -243,6 +243,12 @@ static void rcu_offline_cpu(int cpu) tasklet_kill_immediate(&RCU_tasklet(cpu), cpu); } +#else + +static void rcu_offline_cpu(int cpu) +{ +} + #endif void rcu_restart_cpu(int cpu) @@ -326,11 +332,9 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE: rcu_online_cpu(cpu); break; -#ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: rcu_offline_cpu(cpu); break; -#endif default: break; } -- cgit v1.2.3 From f0f4d6e41008746f51db2c795469e1707e516672 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Sun, 22 Aug 2004 22:57:30 -0700 Subject: [PATCH] RCU - cpu offline fix This fixes the RCU cpu offline code which was broken by singly-linked RCU changes. Nathan pointed out the problems and submitted a patch for this. This is an optimal fix - no need to iterate through the list of callbacks, just use the tail pointers and attach the list from the dead cpu. 
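The core of the fix, condensed from rcu_move_batch() below: with a tail pointer aimed at the batch's final ->next slot, the whole batch from the dead cpu is spliced onto this cpu's list in O(1) instead of being walked node by node:

    *RCU_nxttail(cpu) = list;       /* append the dead cpu's whole batch */
    if (list)
            RCU_nxttail(cpu) = tail; /* new end of the combined list */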
Signed-off-by: Nathan Lynch Signed-off-by: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 2 ++ kernel/rcupdate.c | 19 +++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 10c4b8f24f08..226ce765c031 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -98,6 +98,7 @@ struct rcu_data { struct rcu_head *nxtlist; struct rcu_head **nxttail; struct rcu_head *curlist; + struct rcu_head **curtail; }; DECLARE_PER_CPU(struct rcu_data, rcu_data); @@ -111,6 +112,7 @@ extern struct rcu_ctrlblk rcu_ctrlblk; #define RCU_nxtlist(cpu) (per_cpu(rcu_data, (cpu)).nxtlist) #define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist) #define RCU_nxttail(cpu) (per_cpu(rcu_data, (cpu)).nxttail) +#define RCU_curtail(cpu) (per_cpu(rcu_data, (cpu)).curtail) static inline int rcu_pending(int cpu) { diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 97cb4eaa1019..5a8d9856610b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -210,19 +210,15 @@ static void rcu_check_quiescent_state(void) * locking requirements, the list it's pulling from has to belong to a cpu * which is dead and hence not processing interrupts. */ -static void rcu_move_batch(struct rcu_head *list) +static void rcu_move_batch(struct rcu_head *list, struct rcu_head **tail) { int cpu; local_irq_disable(); cpu = smp_processor_id(); - - while (list != NULL) { - *RCU_nxttail(cpu) = list; - RCU_nxttail(cpu) = &list->next; - list = list->next; - } + *RCU_nxttail(cpu) = list; + if (list) + RCU_nxttail(cpu) = tail; local_irq_enable(); } @@ -237,8 +233,8 @@ static void rcu_offline_cpu(int cpu) cpu_quiet(cpu); spin_unlock_bh(&rcu_state.mutex); - rcu_move_batch(RCU_curlist(cpu)); - rcu_move_batch(RCU_nxtlist(cpu)); + rcu_move_batch(RCU_curlist(cpu), RCU_curtail(cpu)); + rcu_move_batch(RCU_nxtlist(cpu), RCU_nxttail(cpu)); tasklet_kill_immediate(&RCU_tasklet(cpu), cpu); } @@ -271,6 +267,7 @@ static void rcu_process_callbacks(unsigned long unused) !rcu_batch_before(rcu_ctrlblk.completed, RCU_batch(cpu))) { rcu_list = RCU_curlist(cpu); RCU_curlist(cpu) = NULL; + RCU_curtail(cpu) = &RCU_curlist(cpu); } local_irq_disable(); @@ -278,6 +275,7 @@ static void rcu_process_callbacks(unsigned long unused) int next_pending, seq; RCU_curlist(cpu) = RCU_nxtlist(cpu); + RCU_curtail(cpu) = RCU_nxttail(cpu); RCU_nxtlist(cpu) = NULL; RCU_nxttail(cpu) = &RCU_nxtlist(cpu); local_irq_enable(); @@ -319,6 +317,7 @@ static void __devinit rcu_online_cpu(int cpu) { memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data)); tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL); + RCU_curtail(cpu) = &RCU_curlist(cpu); RCU_nxttail(cpu) = &RCU_nxtlist(cpu); RCU_quiescbatch(cpu) = rcu_ctrlblk.completed; RCU_qs_pending(cpu) = 0; -- cgit v1.2.3 From daf86b08a178f950c0e0ec073c25cc392dbbc789 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Sun, 22 Aug 2004 22:57:42 -0700 Subject: [PATCH] RCU: low latency rcu This patch makes RCU callbacks friendly to the scheduler. It helps low latency by limiting the number of callbacks invoked per tasklet handler. Since we cannot schedule during a single softirq handler, this significantly reduces the size of the non-preemptible section, especially under heavy RCU updates. The limiting is done through a kernel parameter, rcupdate.maxbatch, which is the maximum number of RCU callbacks to invoke during a single tasklet handler.
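Since rcupdate is built into the kernel, the parameter is given on the boot command line rather than at modprobe time; for example (the value 20 is arbitrary; the default is 10):

    rcupdate.maxbatch=20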
Signed-off-by: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 7 +++++++ kernel/rcupdate.c | 27 +++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 226ce765c031..b689ab6108bd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -99,6 +99,8 @@ struct rcu_data { struct rcu_head **nxttail; struct rcu_head *curlist; struct rcu_head **curtail; + struct rcu_head *donelist; + struct rcu_head **donetail; }; DECLARE_PER_CPU(struct rcu_data, rcu_data); @@ -113,6 +115,8 @@ extern struct rcu_ctrlblk rcu_ctrlblk; #define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist) #define RCU_nxttail(cpu) (per_cpu(rcu_data, (cpu)).nxttail) #define RCU_curtail(cpu) (per_cpu(rcu_data, (cpu)).curtail) +#define RCU_donelist(cpu) (per_cpu(rcu_data, (cpu)).donelist) +#define RCU_donetail(cpu) (per_cpu(rcu_data, (cpu)).donetail) static inline int rcu_pending(int cpu) { @@ -127,6 +131,9 @@ static inline int rcu_pending(int cpu) if (!RCU_curlist(cpu) && RCU_nxtlist(cpu)) return 1; + if (RCU_donelist(cpu)) + return 1; + /* The rcu core waits for a quiescent state from the cpu */ if (RCU_quiescbatch(cpu) != rcu_ctrlblk.cur || RCU_qs_pending(cpu)) return 1; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 5a8d9856610b..c944504fc8d0 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,7 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; #define RCU_tasklet(cpu) (per_cpu(rcu_tasklet, cpu)) +static int maxbatch = 10; /** * call_rcu - Queue an RCU update request. @@ -93,15 +95,23 @@ void fastcall call_rcu(struct rcu_head *head, * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. 
*/ -static void rcu_do_batch(struct rcu_head *list) +static void rcu_do_batch(int cpu) { - struct rcu_head *next; + struct rcu_head *next, *list; + int count = 0; + list = RCU_donelist(cpu); while (list) { - next = list->next; + next = RCU_donelist(cpu) = list->next; list->func(list); list = next; + if (++count >= maxbatch) + break; } + if (!RCU_donelist(cpu)) + RCU_donetail(cpu) = &RCU_donelist(cpu); + else + tasklet_schedule(&RCU_tasklet(cpu)); } /* @@ -261,11 +271,11 @@ void rcu_restart_cpu(int cpu) static void rcu_process_callbacks(unsigned long unused) { int cpu = smp_processor_id(); - struct rcu_head *rcu_list = NULL; if (RCU_curlist(cpu) && !rcu_batch_before(rcu_ctrlblk.completed, RCU_batch(cpu))) { - rcu_list = RCU_curlist(cpu); + *RCU_donetail(cpu) = RCU_curlist(cpu); + RCU_donetail(cpu) = RCU_curtail(cpu); RCU_curlist(cpu) = NULL; RCU_curtail(cpu) = &RCU_curlist(cpu); } @@ -300,8 +310,8 @@ static void rcu_process_callbacks(unsigned long unused) local_irq_enable(); } rcu_check_quiescent_state(); - if (rcu_list) - rcu_do_batch(rcu_list); + if (RCU_donelist(cpu)) + rcu_do_batch(cpu); } void rcu_check_callbacks(int cpu, int user) @@ -319,6 +329,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL); RCU_curtail(cpu) = &RCU_curlist(cpu); RCU_nxttail(cpu) = &RCU_nxtlist(cpu); + RCU_donetail(cpu) = &RCU_donelist(cpu); RCU_quiescbatch(cpu) = rcu_ctrlblk.completed; RCU_qs_pending(cpu) = 0; } @@ -388,6 +399,6 @@ void synchronize_kernel(void) wait_for_completion(&rcu.completion); } - +module_param(maxbatch, int, 0); EXPORT_SYMBOL(call_rcu); EXPORT_SYMBOL(synchronize_kernel); -- cgit v1.2.3 From e86e2311ae3c844a1efe4f6d569dfb548f7f58c7 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Sun, 22 Aug 2004 22:57:53 -0700 Subject: [PATCH] rcu: clean up code Avoids per_cpu calculations and also prepares for call_rcu_bh(). At OLS, Rusty had suggested getting rid of many per_cpu() calculations in RCU code and making the code simpler. I had already done that for the rcu-softirq patch earlier, so I am splitting that into two patches. This first patch cleans up the macros and uses pointers to the rcu per-cpu data directly to manipulate the callback queues. This is useful for the call-rcu-bh patch (to follow) which introduces a new RCU mechanism - call_rcu_bh(). Both generic and softirq RCU can then use the same code; they just work on different global and per-cpu data.
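The shape of the cleanup, condensed from the call_rcu() hunk below: take one pointer to this cpu's rcu_data and dereference it, instead of doing a per_cpu() lookup for every field:

    /* before */
    *RCU_nxttail(cpu) = head;
    RCU_nxttail(cpu) = &head->next;

    /* after */
    struct rcu_data *rdp = &__get_cpu_var(rcu_data);
    *rdp->nxttail = head;
    rdp->nxttail = &head->next;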
Signed-off-by: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 40 +++++---- kernel/rcupdate.c | 229 +++++++++++++++++++++++++---------------------- kernel/sched.c | 2 +- 3 files changed, 144 insertions(+), 127 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b689ab6108bd..45ca384109e1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -101,47 +101,51 @@ struct rcu_data { struct rcu_head **curtail; struct rcu_head *donelist; struct rcu_head **donetail; + int cpu; }; DECLARE_PER_CPU(struct rcu_data, rcu_data); extern struct rcu_ctrlblk rcu_ctrlblk; -#define RCU_quiescbatch(cpu) (per_cpu(rcu_data, (cpu)).quiescbatch) -#define RCU_qsctr(cpu) (per_cpu(rcu_data, (cpu)).qsctr) -#define RCU_last_qsctr(cpu) (per_cpu(rcu_data, (cpu)).last_qsctr) -#define RCU_qs_pending(cpu) (per_cpu(rcu_data, (cpu)).qs_pending) -#define RCU_batch(cpu) (per_cpu(rcu_data, (cpu)).batch) -#define RCU_nxtlist(cpu) (per_cpu(rcu_data, (cpu)).nxtlist) -#define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist) -#define RCU_nxttail(cpu) (per_cpu(rcu_data, (cpu)).nxttail) -#define RCU_curtail(cpu) (per_cpu(rcu_data, (cpu)).curtail) -#define RCU_donelist(cpu) (per_cpu(rcu_data, (cpu)).donelist) -#define RCU_donetail(cpu) (per_cpu(rcu_data, (cpu)).donetail) - -static inline int rcu_pending(int cpu) +/* + * Increment the quiscent state counter. + */ +static inline void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->qsctr++; +} + +static inline int __rcu_pending(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) { /* This cpu has pending rcu entries and the grace period * for them has completed. */ - if (RCU_curlist(cpu) && - !rcu_batch_before(rcu_ctrlblk.completed,RCU_batch(cpu))) + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) return 1; /* This cpu has no pending entries, but there are new entries */ - if (!RCU_curlist(cpu) && RCU_nxtlist(cpu)) + if (!rdp->curlist && rdp->nxtlist) return 1; - if (RCU_donelist(cpu)) + /* This cpu has finished callbacks to invoke */ + if (rdp->donelist) return 1; /* The rcu core waits for a quiescent state from the cpu */ - if (RCU_quiescbatch(cpu) != rcu_ctrlblk.cur || RCU_qs_pending(cpu)) + if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) return 1; /* nothing to do */ return 0; } +static inline int rcu_pending(int cpu) +{ + return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)); +} + #define rcu_read_lock() preempt_disable() #define rcu_read_unlock() preempt_enable() diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c944504fc8d0..d950c5c94158 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -17,9 +17,10 @@ * * Copyright (C) IBM Corporation, 2001 * - * Author: Dipankar Sarma + * Authors: Dipankar Sarma + * Manfred Spraul * - * Based on the original work by Paul McKenney + * Based on the original work by Paul McKenney * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
* Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf @@ -51,19 +52,20 @@ struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }; /* Bookkeeping of the progress of the grace period */ -struct { - spinlock_t mutex; /* Guard this struct and writes to rcu_ctrlblk */ - cpumask_t rcu_cpu_mask; /* CPUs that need to switch in order */ +struct rcu_state { + spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ + cpumask_t cpumask; /* CPUs that need to switch in order */ /* for current batch to proceed. */ -} rcu_state ____cacheline_maxaligned_in_smp = - {.mutex = SPIN_LOCK_UNLOCKED, .rcu_cpu_mask = CPU_MASK_NONE }; +}; + +struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = + {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -#define RCU_tasklet(cpu) (per_cpu(rcu_tasklet, cpu)) static int maxbatch = 10; /** @@ -79,15 +81,15 @@ static int maxbatch = 10; void fastcall call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - int cpu; unsigned long flags; + struct rcu_data *rdp; head->func = func; head->next = NULL; local_irq_save(flags); - cpu = smp_processor_id(); - *RCU_nxttail(cpu) = head; - RCU_nxttail(cpu) = &head->next; + rdp = &__get_cpu_var(rcu_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; local_irq_restore(flags); } @@ -95,23 +97,23 @@ void fastcall call_rcu(struct rcu_head *head, * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. */ -static void rcu_do_batch(int cpu) +static void rcu_do_batch(struct rcu_data *rdp) { struct rcu_head *next, *list; int count = 0; - list = RCU_donelist(cpu); + list = rdp->donelist; while (list) { - next = RCU_donelist(cpu) = list->next; + next = rdp->donelist = list->next; list->func(list); list = next; if (++count >= maxbatch) break; } - if (!RCU_donelist(cpu)) - RCU_donetail(cpu) = &RCU_donelist(cpu); + if (!rdp->donelist) + rdp->donetail = &rdp->donelist; else - tasklet_schedule(&RCU_tasklet(cpu)); + tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); } /* @@ -119,15 +121,15 @@ static void rcu_do_batch(int cpu) * The grace period handling consists out of two steps: * - A new grace period is started. * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcu_ctrlblk.cur with - * RCU_quiescbatch(cpu). All cpus are recorded in the - * rcu_state.rcu_cpu_mask bitmap. + * all cpus, they must pick this up by comparing rcp->cur with + * rdp->quiescbatch. All cpus are recorded in the + * rcu_state.cpumask bitmap. * - All cpus must go through a quiescent state. * Since the start of the grace period is not broadcasted, at least two * calls to rcu_check_quiescent_state are required: * The first call just notices that a new grace period is running. The * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_state.rcu_cpu_mask. If + * of the grace period. If so, it updates rcu_state.cpumask. If * the bitmap is empty, then the grace period is completed. * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). 
@@ -135,22 +137,22 @@ static void rcu_do_batch(int cpu) /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_state.mutex. + * Caller must hold rcu_state.lock. */ -static void rcu_start_batch(int next_pending) +static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, + int next_pending) { if (next_pending) - rcu_ctrlblk.next_pending = 1; + rcp->next_pending = 1; - if (rcu_ctrlblk.next_pending && - rcu_ctrlblk.completed == rcu_ctrlblk.cur) { + if (rcp->next_pending && + rcp->completed == rcp->cur) { /* Can't change, since spin lock held. */ - cpus_andnot(rcu_state.rcu_cpu_mask, cpu_online_map, - nohz_cpu_mask); - write_seqcount_begin(&rcu_ctrlblk.lock); - rcu_ctrlblk.next_pending = 0; - rcu_ctrlblk.cur++; - write_seqcount_end(&rcu_ctrlblk.lock); + cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); + write_seqcount_begin(&rcp->lock); + rcp->next_pending = 0; + rcp->cur++; + write_seqcount_end(&rcp->lock); } } @@ -159,13 +161,13 @@ static void rcu_start_batch(int next_pending) * Clear it from the cpu mask and complete the grace period if it was the last * cpu. Start another grace period if someone has further entries pending */ -static void cpu_quiet(int cpu) +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) { - cpu_clear(cpu, rcu_state.rcu_cpu_mask); - if (cpus_empty(rcu_state.rcu_cpu_mask)) { + cpu_clear(cpu, rsp->cpumask); + if (cpus_empty(rsp->cpumask)) { /* batch completed ! */ - rcu_ctrlblk.completed = rcu_ctrlblk.cur; - rcu_start_batch(0); + rcp->completed = rcp->cur; + rcu_start_batch(rcp, rsp, 0); } } @@ -174,15 +176,14 @@ static void cpu_quiet(int cpu) * switch). If so and if it already hasn't done so in this RCU * quiescent cycle, then indicate that it has done so. */ -static void rcu_check_quiescent_state(void) +static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, + struct rcu_state *rsp, struct rcu_data *rdp) { - int cpu = smp_processor_id(); - - if (RCU_quiescbatch(cpu) != rcu_ctrlblk.cur) { + if (rdp->quiescbatch != rcp->cur) { /* new grace period: record qsctr value. */ - RCU_qs_pending(cpu) = 1; - RCU_last_qsctr(cpu) = RCU_qsctr(cpu); - RCU_quiescbatch(cpu) = rcu_ctrlblk.cur; + rdp->qs_pending = 1; + rdp->last_qsctr = rdp->qsctr; + rdp->quiescbatch = rcp->cur; return; } @@ -190,7 +191,7 @@ static void rcu_check_quiescent_state(void) * qs_pending is checked instead of the actual bitmap to avoid * cacheline trashing. */ - if (!RCU_qs_pending(cpu)) + if (!rdp->qs_pending) return; /* @@ -198,19 +199,19 @@ static void rcu_check_quiescent_state(void) * we may miss one quiescent state of that CPU. That is * tolerable. So no need to disable interrupts. */ - if (RCU_qsctr(cpu) == RCU_last_qsctr(cpu)) + if (rdp->qsctr == rdp->last_qsctr) return; - RCU_qs_pending(cpu) = 0; + rdp->qs_pending = 0; - spin_lock(&rcu_state.mutex); + spin_lock(&rsp->lock); /* - * RCU_quiescbatch/batch.cur and the cpu bitmap can come out of sync + * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. 
*/ - if (likely(RCU_quiescbatch(cpu) == rcu_ctrlblk.cur)) - cpu_quiet(cpu); + if (likely(rdp->quiescbatch == rcp->cur)) + cpu_quiet(rdp->cpu, rcp, rsp); - spin_unlock(&rcu_state.mutex); + spin_unlock(&rsp->lock); } @@ -220,33 +221,39 @@ static void rcu_check_quiescent_state(void) * locking requirements, the list it's pulling from has to belong to a cpu * which is dead and hence not processing interrupts. */ -static void rcu_move_batch(struct rcu_head *list, struct rcu_head **tail) +static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, + struct rcu_head **tail) { - int cpu; - local_irq_disable(); - cpu = smp_processor_id(); - *RCU_nxttail(cpu) = list; + *this_rdp->nxttail = list; if (list) - RCU_nxttail(cpu) = tail; + this_rdp->nxttail = tail; local_irq_enable(); } -static void rcu_offline_cpu(int cpu) +static void __rcu_offline_cpu(struct rcu_data *this_rdp, + struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) { /* if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcu_state.mutex); - if (rcu_ctrlblk.cur != rcu_ctrlblk.completed) - cpu_quiet(cpu); - spin_unlock_bh(&rcu_state.mutex); + spin_lock_bh(&rsp->lock); + if (rcp->cur != rcp->completed) + cpu_quiet(rdp->cpu, rcp, rsp); + spin_unlock_bh(&rsp->lock); + rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); - rcu_move_batch(RCU_curlist(cpu), RCU_curtail(cpu)); - rcu_move_batch(RCU_nxtlist(cpu), RCU_nxttail(cpu)); +} +static void rcu_offline_cpu(int cpu) +{ + struct rcu_data *this_rdp = &get_cpu_var(rcu_data); - tasklet_kill_immediate(&RCU_tasklet(cpu), cpu); + __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, + &per_cpu(rcu_data, cpu)); + put_cpu_var(rcu_data); + tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); } #else @@ -257,81 +264,87 @@ static void rcu_offline_cpu(int cpu) #endif -void rcu_restart_cpu(int cpu) -{ - spin_lock_bh(&rcu_state.mutex); - RCU_quiescbatch(cpu) = rcu_ctrlblk.completed; - RCU_qs_pending(cpu) = 0; - spin_unlock_bh(&rcu_state.mutex); -} - /* * This does the RCU processing work from tasklet context. 
*/ -static void rcu_process_callbacks(unsigned long unused) +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, + struct rcu_state *rsp, struct rcu_data *rdp) { - int cpu = smp_processor_id(); - - if (RCU_curlist(cpu) && - !rcu_batch_before(rcu_ctrlblk.completed, RCU_batch(cpu))) { - *RCU_donetail(cpu) = RCU_curlist(cpu); - RCU_donetail(cpu) = RCU_curtail(cpu); - RCU_curlist(cpu) = NULL; - RCU_curtail(cpu) = &RCU_curlist(cpu); + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { + *rdp->donetail = rdp->curlist; + rdp->donetail = rdp->curtail; + rdp->curlist = NULL; + rdp->curtail = &rdp->curlist; } local_irq_disable(); - if (RCU_nxtlist(cpu) && !RCU_curlist(cpu)) { + if (rdp->nxtlist && !rdp->curlist) { int next_pending, seq; - RCU_curlist(cpu) = RCU_nxtlist(cpu); - RCU_curtail(cpu) = RCU_nxttail(cpu); - RCU_nxtlist(cpu) = NULL; - RCU_nxttail(cpu) = &RCU_nxtlist(cpu); + rdp->curlist = rdp->nxtlist; + rdp->curtail = rdp->nxttail; + rdp->nxtlist = NULL; + rdp->nxttail = &rdp->nxtlist; local_irq_enable(); /* * start the next batch of callbacks */ do { - seq = read_seqcount_begin(&rcu_ctrlblk.lock); + seq = read_seqcount_begin(&rcp->lock); /* determine batch number */ - RCU_batch(cpu) = rcu_ctrlblk.cur + 1; - next_pending = rcu_ctrlblk.next_pending; - } while (read_seqcount_retry(&rcu_ctrlblk.lock, seq)); + rdp->batch = rcp->cur + 1; + next_pending = rcp->next_pending; + } while (read_seqcount_retry(&rcp->lock, seq)); if (!next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcu_state.mutex); - rcu_start_batch(1); - spin_unlock(&rcu_state.mutex); + spin_lock(&rsp->lock); + rcu_start_batch(rcp, rsp, 1); + spin_unlock(&rsp->lock); } } else { local_irq_enable(); } - rcu_check_quiescent_state(); - if (RCU_donelist(cpu)) - rcu_do_batch(cpu); + rcu_check_quiescent_state(rcp, rsp, rdp); + if (rdp->donelist) + rcu_do_batch(rdp); +} + +static void rcu_process_callbacks(unsigned long unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, + &__get_cpu_var(rcu_data)); } void rcu_check_callbacks(int cpu, int user) { + struct rcu_data *rdp = &__get_cpu_var(rcu_data); if (user || (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) - RCU_qsctr(cpu)++; - tasklet_schedule(&RCU_tasklet(cpu)); + rdp->qsctr++; + tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); +} + +static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + memset(rdp, 0, sizeof(*rdp)); + rdp->curtail = &rdp->curlist; + rdp->nxttail = &rdp->nxtlist; + rdp->donetail = &rdp->donelist; + rdp->quiescbatch = rcp->completed; + rdp->qs_pending = 0; + rdp->cpu = cpu; } static void __devinit rcu_online_cpu(int cpu) { - memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data)); - tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL); - RCU_curtail(cpu) = &RCU_curlist(cpu); - RCU_nxttail(cpu) = &RCU_nxtlist(cpu); - RCU_donetail(cpu) = &RCU_donelist(cpu); - RCU_quiescbatch(cpu) = rcu_ctrlblk.completed; - RCU_qs_pending(cpu) = 0; + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + + rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); + tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } static int __devinit rcu_cpu_notify(struct notifier_block *self, diff --git a/kernel/sched.c b/kernel/sched.c index 3e8897919924..2d4d157bc145 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2287,7 +2287,7 @@ need_resched: switch_tasks: prefetch(next); clear_tsk_need_resched(prev); - RCU_qsctr(task_cpu(prev))++; + 
rcu_qsctr_inc(task_cpu(prev)); prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) { -- cgit v1.2.3 From f4d4d3f33efca5261c6313ead7d4b6d3a6add711 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Sun, 22 Aug 2004 22:58:16 -0700 Subject: [PATCH] rcu: introduce call_rcu_bh() Introduces call_rcu_bh() to be used when critical sections are mostly in softirq context. This patch introduces a new api - call_rcu_bh(). This is to be used for RCU callbacks for whom the critical sections are mostly in softirq context. These callbacks consider completion of a softirq handler to be a quiescent state. So, in order to make reader critical sections safe in process context, rcu_read_lock_bh() and rcu_read_unlock_bh() must be used. Use of softirq handler completion as a quiescent state speeds up RCU grace periods and prevents too many callbacks getting queued up in softirq-heavy workloads like network stack. Signed-off-by: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 14 ++++++++++++- kernel/rcupdate.c | 53 +++++++++++++++++++++++++++++++++++++++++++----- kernel/softirq.c | 7 ++++++- 3 files changed, 67 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8c8157be11f7..f003f8ff9789 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -105,7 +105,9 @@ struct rcu_data { }; DECLARE_PER_CPU(struct rcu_data, rcu_data); +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); extern struct rcu_ctrlblk rcu_ctrlblk; +extern struct rcu_ctrlblk rcu_bh_ctrlblk; /* * Increment the quiscent state counter. @@ -115,6 +117,11 @@ static inline void rcu_qsctr_inc(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); rdp->qsctr++; } +static inline void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->qsctr++; +} static inline int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -143,11 +150,14 @@ static inline int __rcu_pending(struct rcu_ctrlblk *rcp, static inline int rcu_pending(int cpu) { - return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)); + return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || + __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); } #define rcu_read_lock() preempt_disable() #define rcu_read_unlock() preempt_enable() +#define rcu_read_lock_bh() local_bh_disable() +#define rcu_read_unlock_bh() local_bh_enable() extern void rcu_init(void); extern void rcu_check_callbacks(int cpu, int user); @@ -156,6 +166,8 @@ extern void rcu_restart_cpu(int cpu); /* Exported interfaces */ extern void FASTCALL(call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head))); +extern void FASTCALL(call_rcu_bh(struct rcu_head *head, + void (*func)(struct rcu_head *head))); extern void synchronize_kernel(void); #endif /* __KERNEL__ */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index d950c5c94158..3d25d0b18571 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -50,6 +50,8 @@ /* Definition for rcupdate control block. 
*/ struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }; +struct rcu_ctrlblk rcu_bh_ctrlblk = + { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }; /* Bookkeeping of the progress of the grace period */ struct rcu_state { @@ -60,9 +62,11 @@ struct rcu_state { struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; - +struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = + {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; @@ -93,6 +97,34 @@ void fastcall call_rcu(struct rcu_head *head, local_irq_restore(flags); } +/** + * call_rcu_bh - Queue an RCU update request for which softirq handler + * completion is a quiescent state. + * @head: structure to be used for queueing the RCU updates. + * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked as soon as all CPUs have performed + * a context switch or been seen in the idle loop or in a user process + * or has exited a softirq handler that it may have been executing. + * The read-side of critical section that use call_rcu_bh() for updation must + * be protected by rcu_read_lock_bh()/rcu_read_unlock_bh() if it is + * in process context. + */ +void fastcall call_rcu_bh(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = &__get_cpu_var(rcu_bh_data); + *rdp->nxttail = head; + rdp->nxttail = &head->next; + local_irq_restore(flags); +} + /* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. 
@@ -249,10 +281,14 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, static void rcu_offline_cpu(int cpu) { struct rcu_data *this_rdp = &get_cpu_var(rcu_data); + struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, &per_cpu(rcu_data, cpu)); + __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state, + &per_cpu(rcu_bh_data, cpu)); put_cpu_var(rcu_data); + put_cpu_var(rcu_bh_data); tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); } @@ -315,16 +351,20 @@ static void rcu_process_callbacks(unsigned long unused) { __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state, + &__get_cpu_var(rcu_bh_data)); } void rcu_check_callbacks(int cpu, int user) { - struct rcu_data *rdp = &__get_cpu_var(rcu_data); if (user || (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) - rdp->qsctr++; - tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + } else if (!in_softirq()) + rcu_bh_qsctr_inc(cpu); + tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); } static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, @@ -342,8 +382,10 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, static void __devinit rcu_online_cpu(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); + rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } @@ -414,4 +456,5 @@ void synchronize_kernel(void) module_param(maxbatch, int, 0); EXPORT_SYMBOL(call_rcu); +EXPORT_SYMBOL(call_rcu_bh); EXPORT_SYMBOL(synchronize_kernel); diff --git a/kernel/softirq.c b/kernel/softirq.c index c336ae21b5d7..4a3da9be9f26 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -75,10 +76,12 @@ asmlinkage void __do_softirq(void) struct softirq_action *h; __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; + int cpu; pending = local_softirq_pending(); local_bh_disable(); + cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ local_softirq_pending() = 0; @@ -88,8 +91,10 @@ restart: h = softirq_vec; do { - if (pending & 1) + if (pending & 1) { h->action(h); + rcu_bh_qsctr_inc(cpu); + } h++; pending >>= 1; } while (pending); -- cgit v1.2.3 From 9711268caede0cbd322244d70145a6e914fac52e Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Sun, 22 Aug 2004 22:58:39 -0700 Subject: [PATCH] rcu: document RCU api Patch from Paul for additional documentation of api. Updated based on feedback, and to apply to 2.6.8-rc3. I will be adding more detailed documentation to the Documentation directory in a separate patch. 
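To make the documented usage concrete, here is a minimal reader/updater sketch. It is illustration only, not part of the patch: struct foo, global_foo and the function names are invented for the example, and updaters are assumed to serialize among themselves (e.g. with a spinlock).

struct foo {
	int a;
	struct rcu_head rcu;
};
static struct foo *global_foo;

/* Reader: may run concurrently with an update; must not block. */
int foo_get_a(void)
{
	struct foo *fp;
	int val;

	rcu_read_lock();
	fp = global_foo;
	smp_read_barrier_depends();	/* data-dependency barrier (DEC Alpha) */
	val = fp->a;
	rcu_read_unlock();
	return val;
}

/* Invoked once a grace period has elapsed; no reader can still see old_fp. */
static void foo_reclaim(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu));
}

/* Updater: publish a new version, defer freeing the old one. */
int foo_set_a(int new_a)
{
	struct foo *new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
	struct foo *old_fp;

	if (!new_fp)
		return -ENOMEM;
	new_fp->a = new_a;
	smp_wmb();			/* order initialization before publication */
	old_fp = global_foo;
	global_foo = new_fp;
	call_rcu(&old_fp->rcu, foo_reclaim);
	return 0;
}

The same pattern works with call_rcu_bh(), except that a process-context reader must use rcu_read_lock_bh()/rcu_read_unlock_bh() instead.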
Signed-off-by: Paul McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcupdate.c | 39 ++++++++++++++++++------------ 2 files changed, 87 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f003f8ff9789..32a8893a3cdd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -154,9 +154,72 @@ static inline int rcu_pending(int cpu) __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); } +/** + * rcu_read_lock - mark the beginning of an RCU read-side critical section. + * + * When synchronize_kernel() is invoked on one CPU while other CPUs + * are within RCU read-side critical sections, then the + * synchronize_kernel() is guaranteed to block until after all the other + * CPUs exit their critical sections. Similarly, if call_rcu() is invoked + * on one CPU while other CPUs are within RCU read-side critical + * sections, invocation of the corresponding RCU callback is deferred + * until after all the other CPUs exit their critical sections. + * + * Note, however, that RCU callbacks are permitted to run concurrently + * with RCU read-side critical sections. One way that this can happen + * is via the following sequence of events: (1) CPU 0 enters an RCU + * read-side critical section, (2) CPU 1 invokes call_rcu() to register + * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, + * (4) CPU 2 enters an RCU read-side critical section, (5) the RCU + * callback is invoked. This is legal, because the RCU read-side critical + * section that was running concurrently with the call_rcu() (and which + * therefore might be referencing something that the corresponding RCU + * callback would free up) has completed before the corresponding + * RCU callback is invoked. + * + * RCU read-side critical sections may be nested. Any deferred actions + * will be deferred until the outermost RCU read-side critical section + * completes. + * + * It is illegal to block while in an RCU read-side critical section. + */ #define rcu_read_lock() preempt_disable() + +/** + * rcu_read_unlock - marks the end of an RCU read-side critical section. + * + * See rcu_read_lock() for more information. + */ #define rcu_read_unlock() preempt_enable() + +/* + * So where is rcu_write_lock()? It does not exist, as there is no + * way for writers to lock out RCU readers. This is a feature, not + * a bug -- this property is what provides RCU's performance benefits. + * Of course, writers must coordinate with each other. The normal + * spinlock primitives work well for this, but any other technique may be + * used as well. RCU does not care how the writers keep out of each + * others' way, as long as they do so. + */ + +/** + * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section + * + * This is the equivalent of rcu_read_lock(), but to be used when updates + * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks + * consider completion of a softirq handler to be a quiescent state, + * a process in an RCU read-side critical section must be protected by + * disabling softirqs. Read-side critical sections in interrupt context + * can use just rcu_read_lock(). + * + */ #define rcu_read_lock_bh() local_bh_disable() + +/* + * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section + * + * See rcu_read_lock_bh() for more information.
+ */ #define rcu_read_unlock_bh() local_bh_enable() extern void rcu_init(void); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 3d25d0b18571..1b16bfc7d1ee 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -73,14 +73,15 @@ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10; /** - * call_rcu - Queue an RCU update request. + * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. * @func: actual update function to be invoked after the grace period * - * The update function will be invoked as soon as all CPUs have performed - * a context switch or been seen in the idle loop or in a user process. - * The read-side of critical section that use call_rcu() for updation must - * be protected by rcu_read_lock()/rcu_read_unlock(). + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. */ void fastcall call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) @@ -98,17 +99,20 @@ void fastcall call_rcu(struct rcu_head *head, } /** - * call_rcu_bh - Queue an RCU update request for which softirq handler - * completion is a quiescent state. + * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. * @func: actual update function to be invoked after the grace period * - * The update function will be invoked as soon as all CPUs have performed - * a context switch or been seen in the idle loop or in a user process - * or has exited a softirq handler that it may have been executing. - * The read-side of critical section that use call_rcu_bh() for updation must - * be protected by rcu_read_lock_bh()/rcu_read_unlock_bh() if it is - * in process context. + * The update function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by rcu_read_lock() and + * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh() + * and rcu_read_unlock_bh(), if in process context. These may be nested. */ void fastcall call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) @@ -439,8 +443,13 @@ static void wakeme_after_rcu(struct rcu_head *head) } /** - * synchronize-kernel - wait until all the CPUs have gone - * through a "quiescent" state. It may sleep. + * synchronize_kernel - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested.
*/ void synchronize_kernel(void) { -- cgit v1.2.3 From d4f9d02b9151b9ff87a950ed42220de4f740d27b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sun, 22 Aug 2004 22:59:16 -0700 Subject: [PATCH] token based thrashing control The following experimental patch implements token based thrashing protection, using the algorithm described in: http://www.cs.wm.edu/~sjiang/token.htm When there are pageins going on, a task can grab a token that protects the task from pageout (except by itself) until it is no longer doing heavy pageins, or until the maximum hold time of the token is over. If the maximum hold time is exceeded, the task isn't eligible to hold the token again for a while, since it wasn't doing it much good anyway. I have run a very unscientific benchmark on my system to test the effectiveness of the patch, timing how long a 230MB two-process qsbench run takes, with and without the token thrashing protection present. normal 2.6.8-rc6: 6m45s 2.6.8-rc6 + token: 4m24s This is a quick hack, implemented without having talked to the inventor of the algorithm. He's copied on the mail and I suspect we'll be able to do better than my quick implementation ... Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 +++ include/linux/swap.h | 22 +++++++++++- kernel/fork.c | 2 ++ mm/Makefile | 2 +- mm/filemap.c | 1 + mm/memory.c | 1 + mm/rmap.c | 3 ++ mm/thrash.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 mm/thrash.c (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 65256f313eb6..a01f849da7a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -220,6 +220,10 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; + /* Token based thrashing protection.
*/ + unsigned long swap_token_time; + char recent_pagein; + /* coredumping support */ int core_waiters; struct completion *core_startup_done, core_done; diff --git a/include/linux/swap.h b/include/linux/swap.h index b081066b5f11..371e8260c577 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -204,7 +204,6 @@ extern void free_pages_and_swap_cache(struct page **, int); extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, unsigned long addr); - /* linux/mm/swapfile.c */ extern long total_swap_pages; extern unsigned int nr_swapfiles; @@ -229,6 +228,22 @@ extern spinlock_t swaplock; #define swap_device_lock(p) spin_lock(&p->sdev_lock) #define swap_device_unlock(p) spin_unlock(&p->sdev_lock) +/* linux/mm/thrash.c */ +extern struct mm_struct * swap_token_mm; +extern void grab_swap_token(void); +extern void __put_swap_token(struct mm_struct *); + +static inline int has_swap_token(struct mm_struct *mm) +{ + return (mm == swap_token_mm); +} + +static inline void put_swap_token(struct mm_struct *mm) +{ + if (has_swap_token(mm)) + __put_swap_token(mm); +} + #else /* CONFIG_SWAP */ #define total_swap_pages 0 @@ -266,6 +281,11 @@ static inline swp_entry_t get_swap_page(void) return entry; } +/* linux/mm/thrash.c */ +#define put_swap_token(x) do { } while(0) +#define grab_swap_token() do { } while(0) +#define has_swap_token(x) 0 + #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 233dd7f190d6..601abf6bbbb8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -461,6 +462,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); exit_aio(mm); exit_mmap(mm); + put_swap_token(mm); mmdrop(mm); } } diff --git a/mm/Makefile b/mm/Makefile index d22feb38a1f9..366e50de11bd 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,6 +12,6 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o slab.o swap.o truncate.o vmscan.o \ $(mmu-y) -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/filemap.c b/mm/filemap.c index 044147636410..a29efae1832c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1195,6 +1195,7 @@ no_cached_page: * effect. */ error = page_cache_read(file, pgoff); + grab_swap_token(); /* * The page we want has now been added to the page cache. diff --git a/mm/memory.c b/mm/memory.c index 57f869fc54ca..8dfcd810f78e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1351,6 +1351,7 @@ static int do_swap_page(struct mm_struct * mm, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; inc_page_state(pgmajfault); + grab_swap_token(); } mark_page_accessed(page); diff --git a/mm/rmap.c b/mm/rmap.c index d4208d6d8eff..1cb3353daa16 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -230,6 +230,9 @@ static int page_referenced_one(struct page *page, if (ptep_clear_flush_young(vma, address, pte)) referenced++; + if (mm != current->mm && has_swap_token(mm)) + referenced++; + (*mapcount)--; out_unmap: diff --git a/mm/thrash.c b/mm/thrash.c new file mode 100644 index 000000000000..7183937b24e5 --- /dev/null +++ b/mm/thrash.c @@ -0,0 +1,98 @@ +/* + * mm/thrash.c + * + * Copyright (C) 2004, Red Hat, Inc. 
+ * Copyright (C) 2004, Rik van Riel + * Released under the GPL, see the file COPYING for details. + * + * Simple token based thrashing protection, using the algorithm + * described in: http://www.cs.wm.edu/~sjiang/token.pdf + */ +#include +#include +#include +#include + +static spinlock_t swap_token_lock = SPIN_LOCK_UNLOCKED; +static unsigned long swap_token_timeout; +unsigned long swap_token_check; +struct mm_struct * swap_token_mm = &init_mm; + +#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) +#define SWAP_TOKEN_TIMEOUT (HZ * 300) + +/* + * Take the token away if the process had no page faults + * in the last interval, or if it has held the token for + * too long. + */ +#define SWAP_TOKEN_ENOUGH_RSS 1 +#define SWAP_TOKEN_TIMED_OUT 2 +static int should_release_swap_token(struct mm_struct *mm) +{ + int ret = 0; + if (!mm->recent_pagein) + ret = SWAP_TOKEN_ENOUGH_RSS; + else if (time_after(jiffies, swap_token_timeout)) + ret = SWAP_TOKEN_TIMED_OUT; + mm->recent_pagein = 0; + return ret; +} + +/* + * Try to grab the swapout protection token. We only try to + * grab it once every SWAP_TOKEN_CHECK_INTERVAL, both to prevent + * SMP lock contention and to check that the process that held + * the token before is no longer thrashing. + */ +void grab_swap_token(void) +{ + struct mm_struct *mm; + int reason; + + /* We have the token. Let others know we still need it. */ + if (has_swap_token(current->mm)) { + current->mm->recent_pagein = 1; + return; + } + + if (time_after(jiffies, swap_token_check)) { + + /* Can't get swapout protection if we exceed our RSS limit. */ + // if (current->mm->rss > current->mm->rlimit_rss) + // return; + + /* ... or if we recently held the token. */ + if (time_before(jiffies, current->mm->swap_token_time)) + return; + + if (!spin_trylock(&swap_token_lock)) + return; + + swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; + + mm = swap_token_mm; + if ((reason = should_release_swap_token(mm))) { + unsigned long eligible = jiffies; + if (reason == SWAP_TOKEN_TIMED_OUT) { + eligible += SWAP_TOKEN_TIMEOUT; + } + mm->swap_token_time = eligible; + swap_token_timeout = jiffies + SWAP_TOKEN_TIMEOUT; + swap_token_mm = current->mm; + } + spin_unlock(&swap_token_lock); + } + return; +} + +/* Called on process exit. */ +void __put_swap_token(struct mm_struct *mm) +{ + spin_lock(&swap_token_lock); + if (likely(mm == swap_token_mm)) { + swap_token_mm = &init_mm; + swap_token_check = jiffies; + } + spin_unlock(&swap_token_lock); +} -- cgit v1.2.3 From 16698c49bbb42567c0bbc528d3820d18885e4642 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sun, 22 Aug 2004 23:06:46 -0700 Subject: [PATCH] rlimit-based mlocks for unprivileged users Here is the last agreed-on patch that lets normal users mlock pages up to their rlimit. This patch addresses all the issues brought up by Chris and Andrea. From: Chris Wright Couple more nits. The default lockable amount is one page now (in the first patch it was 0). Why don't we keep it as 0, with the CAP_IPC_LOCK overrides in place? That way nothing is changed from the user's perspective, and the rest of the policy can be done by userspace as it should. This patch breaks in one scenario: when ulimit == 0 and a process that has CAP_IPC_LOCK does SHM_LOCK, the subsequent unlock or destroy will corrupt the locked_shm count. It's also inconsistent in handling the user_can_mlock/CAP_IPC_LOCK interaction between shm_lock and shm_hugetlb: SHM_HUGETLB can now only be done by the shm_group or CAP_IPC_LOCK, not by any can_do_mlock() user. A double check of can_do_mlock() isn't needed in the SHM_LOCK path.
Interface names user_can_mlock and user_subtract_mlock could be better. Incremental update below. Ran some simple sanity tests on this plus my patch below and didn't find any problems. * Make the default RLIMIT_MEMLOCK limit 0. * Move the CAP_IPC_LOCK check into user_can_mlock to be consistent and fix the bug with ulimit == 0 && CAP_IPC_LOCK with SHM_LOCK. * Allow a can_do_mlock() user to try SHM_HUGETLB setup. * Remove the unnecessary extra can_do_mlock() test in shmem_lock(). * Rename user_can_mlock to user_shm_lock and user_subtract_mlock to user_shm_unlock. * Use user instead of current->user to fit in 80 cols on SHM_LOCK. Signed-off-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 12 +++++++++--- include/asm-alpha/resource.h | 2 +- include/asm-arm/resource.h | 2 +- include/asm-arm26/resource.h | 2 +- include/asm-cris/resource.h | 2 +- include/asm-h8300/resource.h | 2 +- include/asm-i386/resource.h | 2 +- include/asm-ia64/resource.h | 2 +- include/asm-m68k/resource.h | 2 +- include/asm-parisc/resource.h | 2 +- include/asm-ppc/resource.h | 2 +- include/asm-ppc64/resource.h | 2 +- include/asm-s390/resource.h | 2 +- include/asm-sh/resource.h | 2 +- include/asm-sparc/resource.h | 2 +- include/asm-sparc64/resource.h | 2 +- include/asm-v850/resource.h | 2 +- include/asm-x86_64/resource.h | 2 +- include/linux/mm.h | 13 ++++++++++++- include/linux/sched.h | 1 + include/linux/shm.h | 1 + ipc/shm.c | 36 ++++++++++++++++++++++-------------- kernel/user.c | 4 +++- mm/mlock.c | 41 +++++++++++++++++++++++++++++++++++++---- mm/mmap.c | 14 +++++++++----- mm/mremap.c | 6 ++++-- mm/shmem.c | 15 ++++++++++++--- 27 files changed, 127 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 90c07ffb7800..d5aa417d8956 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -720,12 +720,13 @@ static unsigned long hugetlbfs_counter(void) static int can_do_hugetlb_shm(void) { return likely(capable(CAP_IPC_LOCK) || - in_group_p(sysctl_hugetlb_shm_group)); + in_group_p(sysctl_hugetlb_shm_group) || + can_do_mlock()); } struct file *hugetlb_zero_setup(size_t size) { - int error; + int error = -ENOMEM; struct file *file; struct inode *inode; struct dentry *dentry, *root; @@ -738,6 +739,9 @@ struct file *hugetlb_zero_setup(size_t size) if (!is_hugepage_mem_enough(size)) return ERR_PTR(-ENOMEM); + if (!user_shm_lock(size, current->user)) + return ERR_PTR(-ENOMEM); + root = hugetlbfs_vfsmount->mnt_root; snprintf(buf, 16, "%lu", hugetlbfs_counter()); quick_string.name = buf; @@ -745,7 +749,7 @@ struct file *hugetlb_zero_setup(size_t size) quick_string.hash = 0; dentry = d_alloc(root, &quick_string); if (!dentry) - return ERR_PTR(-ENOMEM); + goto out_shm_unlock; error = -ENFILE; file = get_empty_filp(); @@ -772,6 +776,8 @@ out_file: put_filp(file); out_dentry: dput(dentry); +out_shm_unlock: + user_shm_unlock(size, current->user); return ERR_PTR(error); } diff --git a/include/asm-alpha/resource.h b/include/asm-alpha/resource.h index b94759c61521..2b0f4bcf2644 100644 --- a/include/asm-alpha/resource.h +++ b/include/asm-alpha/resource.h @@ -41,7 +41,7 @@ {INR_OPEN, INR_OPEN}, /* RLIMIT_NOFILE */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_AS */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_NPROC */ \ - {LONG_MAX, LONG_MAX}, /* RLIMIT_MEMLOCK */ \ + {0, 0 }, /* RLIMIT_MEMLOCK */ \ {LONG_MAX, LONG_MAX}, /* RLIMIT_LOCKS */ \ {MAX_SIGPENDING, MAX_SIGPENDING}, /* RLIMIT_SIGPENDING */ \ {MQ_BYTES_MAX, MQ_BYTES_MAX}, /* RLIMIT_MSGQUEUE */ \ diff --git
a/include/asm-arm/resource.h b/include/asm-arm/resource.h index 748c660edb15..323167464b97 100644 --- a/include/asm-arm/resource.h +++ b/include/asm-arm/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING}, \ diff --git a/include/asm-arm26/resource.h b/include/asm-arm26/resource.h index 748c660edb15..28a05990277d 100644 --- a/include/asm-arm26/resource.h +++ b/include/asm-arm26/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING}, \ diff --git a/include/asm-cris/resource.h b/include/asm-cris/resource.h index e33ada08d9b8..606a4c9a9579 100644 --- a/include/asm-cris/resource.h +++ b/include/asm-cris/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-h8300/resource.h b/include/asm-h8300/resource.h index a87720b14a90..65cf2c6962f2 100644 --- a/include/asm-h8300/resource.h +++ b/include/asm-h8300/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-i386/resource.h b/include/asm-i386/resource.h index 3e391b2e941f..47bdff24d040 100644 --- a/include/asm-i386/resource.h +++ b/include/asm-i386/resource.h @@ -40,7 +40,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-ia64/resource.h b/include/asm-ia64/resource.h index 76345b5c14c8..c0a403a8a42e 100644 --- a/include/asm-ia64/resource.h +++ b/include/asm-ia64/resource.h @@ -46,7 +46,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-m68k/resource.h b/include/asm-m68k/resource.h index 8362001c286c..51ef4bbb8e6a 100644 --- a/include/asm-m68k/resource.h +++ b/include/asm-m68k/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-parisc/resource.h b/include/asm-parisc/resource.h index 59a446534c5b..ac9de533eb62 100644 --- a/include/asm-parisc/resource.h +++ b/include/asm-parisc/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-ppc/resource.h b/include/asm-ppc/resource.h index 
3d29914559fc..a8392167ae6e 100644 --- a/include/asm-ppc/resource.h +++ b/include/asm-ppc/resource.h @@ -36,7 +36,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-ppc64/resource.h b/include/asm-ppc64/resource.h index c54e9d69d829..d23ea5ba1b9b 100644 --- a/include/asm-ppc64/resource.h +++ b/include/asm-ppc64/resource.h @@ -45,7 +45,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-s390/resource.h b/include/asm-s390/resource.h index 5f0f2ba958dd..837ed3ab1275 100644 --- a/include/asm-s390/resource.h +++ b/include/asm-s390/resource.h @@ -47,7 +47,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-sh/resource.h b/include/asm-sh/resource.h index 73e517a3e80f..690f83a92b21 100644 --- a/include/asm-sh/resource.h +++ b/include/asm-sh/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-sparc/resource.h b/include/asm-sparc/resource.h index 58e90f72ca70..098bfa7145f8 100644 --- a/include/asm-sparc/resource.h +++ b/include/asm-sparc/resource.h @@ -44,7 +44,7 @@ { 0, RLIM_INFINITY}, \ {RLIM_INFINITY, RLIM_INFINITY}, \ {INR_OPEN, INR_OPEN}, {0, 0}, \ - {RLIM_INFINITY, RLIM_INFINITY}, \ + {0, 0}, \ {RLIM_INFINITY, RLIM_INFINITY}, \ {RLIM_INFINITY, RLIM_INFINITY}, \ {MAX_SIGPENDING, MAX_SIGPENDING}, \ diff --git a/include/asm-sparc64/resource.h b/include/asm-sparc64/resource.h index 4a77dd620621..60afa3362b7f 100644 --- a/include/asm-sparc64/resource.h +++ b/include/asm-sparc64/resource.h @@ -43,7 +43,7 @@ { 0, RLIM_INFINITY}, \ {RLIM_INFINITY, RLIM_INFINITY}, \ {INR_OPEN, INR_OPEN}, {0, 0}, \ - {RLIM_INFINITY, RLIM_INFINITY}, \ + {0, 0 }, \ {RLIM_INFINITY, RLIM_INFINITY}, \ {RLIM_INFINITY, RLIM_INFINITY}, \ {MAX_SIGPENDING, MAX_SIGPENDING}, \ diff --git a/include/asm-v850/resource.h b/include/asm-v850/resource.h index 9f4ca4ae638f..0b757f33dd92 100644 --- a/include/asm-v850/resource.h +++ b/include/asm-v850/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/asm-x86_64/resource.h b/include/asm-x86_64/resource.h index 9628f77179fd..4ed168acafb8 100644 --- a/include/asm-x86_64/resource.h +++ b/include/asm-x86_64/resource.h @@ -39,7 +39,7 @@ { RLIM_INFINITY, RLIM_INFINITY }, \ { 0, 0 }, \ { INR_OPEN, INR_OPEN }, \ - { RLIM_INFINITY, RLIM_INFINITY }, \ + { 0, 0 }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { RLIM_INFINITY, RLIM_INFINITY }, \ { MAX_SIGPENDING, MAX_SIGPENDING }, \ diff --git a/include/linux/mm.h b/include/linux/mm.h index fc8716245294..b7859da6d333 100644 --- a/include/linux/mm.h +++ 
b/include/linux/mm.h @@ -496,9 +496,20 @@ int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr); struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); -void shmem_lock(struct file * file, int lock); +int shmem_lock(struct file *file, int lock, struct user_struct *user); int shmem_zero_setup(struct vm_area_struct *); +static inline int can_do_mlock(void) +{ + if (capable(CAP_IPC_LOCK)) + return 1; + if (current->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) + return 1; + return 0; +} +extern int user_shm_lock(size_t, struct user_struct *); +extern void user_shm_unlock(size_t, struct user_struct *); + /* * Parameter block passed down to zap_pte_range in exceptional cases. */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a01f849da7a5..24066551b966 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -316,6 +316,7 @@ struct user_struct { atomic_t sigpending; /* How many pending signals does this user have? */ /* protected by mq_lock */ unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ + unsigned long locked_shm; /* How many pages of mlocked shm ? */ /* Hash table maintenance information */ struct list_head uidhash_list; diff --git a/include/linux/shm.h b/include/linux/shm.h index 9a00f5ff6c58..1907355c0eb1 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -84,6 +84,7 @@ struct shmid_kernel /* private to the kernel */ time_t shm_ctim; pid_t shm_cprid; pid_t shm_lprid; + struct user_struct *mlock_user; }; /* shm_mode upper byte flags */ diff --git a/ipc/shm.c b/ipc/shm.c index de76c1961367..55dc1ba4229e 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -114,7 +114,10 @@ static void shm_destroy (struct shmid_kernel *shp) shm_rmid (shp->id); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) - shmem_lock(shp->shm_file, 0); + shmem_lock(shp->shm_file, 0, shp->mlock_user); + else + user_shm_unlock(shp->shm_file->f_dentry->d_inode->i_size, + shp->mlock_user); fput (shp->shm_file); security_shm_free(shp); ipc_rcu_putref(shp); @@ -190,6 +193,7 @@ static int newseg (key_t key, int shmflg, size_t size) shp->shm_perm.key = key; shp->shm_flags = (shmflg & S_IRWXUGO); + shp->mlock_user = NULL; shp->shm_perm.security = NULL; error = security_shm_alloc(shp); @@ -198,9 +202,11 @@ static int newseg (key_t key, int shmflg, size_t size) return error; } - if (shmflg & SHM_HUGETLB) + if (shmflg & SHM_HUGETLB) { + /* hugetlb_zero_setup takes care of mlock user accounting */ file = hugetlb_zero_setup(size); - else { + shp->mlock_user = current->user; + } else { sprintf (name, "SYSV%08x", key); file = shmem_file_setup(name, size, VM_ACCOUNT); } @@ -504,14 +510,11 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) case SHM_LOCK: case SHM_UNLOCK: { -/* Allow superuser to lock segment in memory */ -/* Should the pages be faulted in here or leave it to user? 
*/ -/* need to determine interaction with current->swappable */ - if (!capable(CAP_IPC_LOCK)) { + /* Allow superuser to lock segment in memory */ + if (!can_do_mlock() && cmd == SHM_LOCK) { err = -EPERM; goto out; } - shp = shm_lock(shmid); if(shp==NULL) { err = -EINVAL; @@ -526,13 +529,18 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) goto out_unlock; if(cmd==SHM_LOCK) { - if (!is_file_hugepages(shp->shm_file)) - shmem_lock(shp->shm_file, 1); - shp->shm_flags |= SHM_LOCKED; - } else { - if (!is_file_hugepages(shp->shm_file)) - shmem_lock(shp->shm_file, 0); + struct user_struct * user = current->user; + if (!is_file_hugepages(shp->shm_file)) { + err = shmem_lock(shp->shm_file, 1, user); + if (!err) { + shp->shm_flags |= SHM_LOCKED; + shp->mlock_user = user; + } + } + } else if (!is_file_hugepages(shp->shm_file)) { + shmem_lock(shp->shm_file, 0, shp->mlock_user); shp->shm_flags &= ~SHM_LOCKED; + shp->mlock_user = NULL; } shm_unlock(shp); goto out; diff --git a/kernel/user.c b/kernel/user.c index 9f9859ef88ea..523175afeecd 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -32,7 +32,8 @@ struct user_struct root_user = { .processes = ATOMIC_INIT(1), .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), - .mq_bytes = 0 + .mq_bytes = 0, + .locked_shm = 0, }; /* @@ -113,6 +114,7 @@ struct user_struct * alloc_uid(uid_t uid) atomic_set(&new->sigpending, 0); new->mq_bytes = 0; + new->locked_shm = 0; /* * Before adding this, check whether we raced diff --git a/mm/mlock.c b/mm/mlock.c index a9e37161dcef..b428752c6187 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -60,7 +60,7 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * next; int error; - if (on && !capable(CAP_IPC_LOCK)) + if (on && !can_do_mlock()) return -EPERM; len = PAGE_ALIGN(len); end = start + len; @@ -118,7 +118,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) lock_limit >>= PAGE_SHIFT; /* check against resource limits */ - if (locked <= lock_limit) + if ( (locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); up_write(¤t->mm->mmap_sem); return error; @@ -142,7 +142,7 @@ static int do_mlockall(int flags) unsigned int def_flags; struct vm_area_struct * vma; - if (!capable(CAP_IPC_LOCK)) + if (!can_do_mlock()) return -EPERM; def_flags = 0; @@ -177,7 +177,7 @@ asmlinkage long sys_mlockall(int flags) lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; - if (current->mm->total_vm <= lock_limit) + if ((current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); out: up_write(¤t->mm->mmap_sem); @@ -193,3 +193,36 @@ asmlinkage long sys_munlockall(void) up_write(¤t->mm->mmap_sem); return ret; } + +/* + * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB + * shm segments) get accounted against the user_struct instead. 
+ */ +static spinlock_t shmlock_user_lock = SPIN_LOCK_UNLOCKED; + +int user_shm_lock(size_t size, struct user_struct *user) +{ + unsigned long lock_limit, locked; + int allowed = 0; + + spin_lock(&shmlock_user_lock); + locked = size >> PAGE_SHIFT; + lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + goto out; + get_uid(user); + user->locked_shm += locked; + allowed = 1; +out: + spin_unlock(&shmlock_user_lock); + return allowed; +} + +void user_shm_unlock(size_t size, struct user_struct *user) +{ + spin_lock(&shmlock_user_lock); + user->locked_shm -= (size >> PAGE_SHIFT); + spin_unlock(&shmlock_user_lock); + free_uid(user); +} diff --git a/mm/mmap.c b/mm/mmap.c index 3f7495c75228..04dc9e284918 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -796,15 +796,17 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) { - if (!capable(CAP_IPC_LOCK)) + if (!can_do_mlock()) return -EPERM; vm_flags |= VM_LOCKED; } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += len; - if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; } @@ -1625,9 +1627,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len) * mlock MCL_FUTURE? */ if (mm->def_flags & VM_LOCKED) { - unsigned long locked = mm->locked_vm << PAGE_SHIFT; + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += len; - if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; } diff --git a/mm/mremap.c b/mm/mremap.c index 984b8ddbd218..6be63314688f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -324,10 +324,12 @@ unsigned long do_mremap(unsigned long addr, goto out; } if (vma->vm_flags & VM_LOCKED) { - unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + unsigned long locked, lock_limit; + locked = current->mm->locked_vm << PAGE_SHIFT; + lock_limit = current->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += new_len - old_len; ret = -EAGAIN; - if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) goto out; } ret = -ENOMEM; diff --git a/mm/shmem.c b/mm/shmem.c index 8ca20e04b60c..c3b4cc5d59ff 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1151,17 +1151,26 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) } #endif -void shmem_lock(struct file *file, int lock) +int shmem_lock(struct file *file, int lock, struct user_struct *user) { struct inode *inode = file->f_dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); + int retval = -ENOMEM; spin_lock(&info->lock); - if (lock) + if (lock && !(info->flags & VM_LOCKED)) { + if (!user_shm_lock(inode->i_size, user)) + goto out_nomem; info->flags |= VM_LOCKED; - else + } + if (!lock && (info->flags & VM_LOCKED) && user) { + user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; + } + retval = 0; +out_nomem: spin_unlock(&info->lock); + return retval; } static int shmem_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3
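As a closing illustration of the rlimit-based mlock behaviour, here is a small userspace check. It is not part of the series: with RLIMIT_MEMLOCK now defaulting to 0, an unprivileged mlock() is refused until the limit is raised (via setrlimit() or login defaults), while CAP_IPC_LOCK continues to bypass the limit.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	struct rlimit rl;
	char *buf;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
		perror("getrlimit");
		return 1;
	}
	printf("RLIMIT_MEMLOCK: cur=%lu max=%lu\n",
	       (unsigned long)rl.rlim_cur, (unsigned long)rl.rlim_max);

	buf = malloc(page);
	if (buf == NULL)
		return 1;

	/* Fails for an unprivileged process while the memlock rlimit
	 * is 0; succeeds once the limit covers at least one page. */
	if (mlock(buf, page) != 0)
		perror("mlock");
	else {
		printf("locked one page\n");
		munlock(buf, page);
	}
	free(buf);
	return 0;
}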