[PATCH] remove the BKL by turning it into a semaphore

This is the current remove-BKL patch. I test-booted it on x86 and x64, trying every conceivable combination of SMP, PREEMPT and PREEMPT_BKL. All other architectures should compile as well. (most of the testing was done with the zaphod patch undone but it applies cleanly on vanilla -mm3 as well and should work fine.) this is the debugging-enabled variant of the patch which has two main debugging features: - debug potentially illegal smp_processor_id() use. Has caught a number of real bugs - e.g. look at the printk.c fix in the patch. - make it possible to enable/disable the BKL via a .config. If this goes upstream we dont want this of course, but for now it gives people a chance to find out whether any particular problem was caused by this patch. This patch has one important fix over the previous BKL patch: on PREEMPT kernels if we preempted BKL-using code then the code still auto-dropped the BKL by mistake. This caused a number of breakages for testers, which breakages went away once this bug was fixed. Also the debugging mechanism has been improved alot relative to the previous BKL patch. Would be nice to test-drive this in -mm. There will likely be some more smp_processor_id() false positives but they are 1) harmless 2) easy to fix up. We could as well find more real smp_processor_id() related breakages as well. The most noteworthy fact is that no BKL-using code was found yet that relied on smp_processor_id(), which is promising from a compatibility POV. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Ingo Molnar <mingo@elte.hu> 2005-01-07 21:59:57 -0800
committer: Linus Torvalds <torvalds@evo.osdl.org> 2005-01-07 21:59:57 -0800
commit: fb8f6499abc6a847109d9602b797aa6afd2d5a3d (patch)
tree: 9b23f9dde8826bb5df266ce9be81c1d51c6e804a
parent: 8a1a48b7cd80de98d4d07ee1e78311a88c738335 (diff)
26 files changed, 321 insertions, 58 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 2f498bfc1dfc..cae9d38b5c9f 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -517,6 +517,17 @@ config PREEMPT
 	  Say Y here if you are building a kernel for a desktop, embedded
 	  or real-time system.  Say N if you are unsure.
 
+config PREEMPT_BKL
+	bool "Preempt The Big Kernel Lock"
+	depends on PREEMPT || SMP
+	default y
+	help
+	  This option reduces the latency of the kernel by making the
+	  big kernel lock preemptible.
+
+	  Say Y here if you are building a kernel for a desktop system.
+	  Say N if you are unsure.
+
 config X86_UP_APIC
 	bool "Local APIC support on uniprocessors" if !SMP
 	depends on !(X86_VISWS || X86_VOYAGER)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 7b79a9b6a04f..b5db26cbde87 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -145,7 +145,7 @@ static void poll_idle (void)
  */
 void cpu_idle (void)
 {
-	int cpu = smp_processor_id();
+	int cpu = _smp_processor_id();
 
 	/* endless idle loop with no priority at all */
 	while (1) {
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index b3e28427de97..71553a8dbe54 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -306,7 +306,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 	};
 	static int die_counter;
 
-	if (die.lock_owner != smp_processor_id()) {
+	if (die.lock_owner != _smp_processor_id()) {
 		console_verbose();
 		spin_lock_irq(&die.lock);
 		die.lock_owner = smp_processor_id();
diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c
index 97b97af9c5e9..080639f262b1 100644
--- a/arch/i386/lib/delay.c
+++ b/arch/i386/lib/delay.c
@@ -34,7 +34,7 @@ inline void __const_udelay(unsigned long xloops)
 	xloops *= 4;
 	__asm__("mull %0"
 		:"=d" (xloops), "=&a" (d0)
-		:"1" (xloops),"0" (current_cpu_data.loops_per_jiffy * (HZ/4)));
+		:"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4)));
         __delay(++xloops);
 }
 
diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c
index 5504546494c8..50b36037d86b 100644
--- a/arch/sh/lib/delay.c
+++ b/arch/sh/lib/delay.c
@@ -24,7 +24,7 @@ inline void __const_udelay(unsigned long xloops)
 	__asm__("dmulu.l	%0, %2\n\t"
 		"sts	mach, %0"
 		: "=r" (xloops)
-		: "0" (xloops), "r" (current_cpu_data.loops_per_jiffy)
+		: "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy)
 		: "macl", "mach");
 	__delay(xloops * HZ);
 }
diff --git a/arch/sparc64/lib/delay.c b/arch/sparc64/lib/delay.c
index f55f528a7f6b..f6b4c784d53e 100644
--- a/arch/sparc64/lib/delay.c
+++ b/arch/sparc64/lib/delay.c
@@ -31,7 +31,7 @@ void __const_udelay(unsigned long n)
 {
 	n *= 4;
 
-	n *= (cpu_data(smp_processor_id()).udelay_val * (HZ/4));
+	n *= (cpu_data(_smp_processor_id()).udelay_val * (HZ/4));
 	n >>= 32;
 
 	__delay(n + 1);
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index c0972864ac7d..47339a83c995 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -249,6 +249,17 @@ config PREEMPT
 	  Say Y here if you are feeling brave and building a kernel for a
 	  desktop, embedded or real-time system.  Say N if you are unsure.
 
+config PREEMPT_BKL
+	bool "Preempt The Big Kernel Lock"
+	depends on PREEMPT || SMP
+	default y
+	help
+	  This option reduces the latency of the kernel by making the
+	  big kernel lock preemptible.
+
+	  Say Y here if you are building a kernel for a desktop system.
+	  Say N if you are unsure.
+
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
 	depends on SMP
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index 9fcdb2d2d4cd..05689cc54901 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -34,7 +34,7 @@ void __delay(unsigned long loops)
 
 inline void __const_udelay(unsigned long xloops)
 {
-        __delay(((xloops * current_cpu_data.loops_per_jiffy) >> 32) * HZ);
+	__delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
 }
 
 void __udelay(unsigned long usecs)
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index ba7eccc97f19..a1e2a1de8237 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -50,7 +50,7 @@ extern u8 x86_cpu_to_apicid[];
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define smp_processor_id() (current_thread_info()->cpu)
+#define __smp_processor_id() (current_thread_info()->cpu)
 
 extern cpumask_t cpu_callout_map;
 #define cpu_possible_map cpu_callout_map
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index a0229ff0baa2..dd354e282a92 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -66,7 +66,7 @@ static inline int num_booting_cpus(void)
 	return cpus_weight(cpu_callout_map);
 }
 
-#define smp_processor_id() read_pda(cpunumber)
+#define __smp_processor_id() read_pda(cpunumber)
 
 extern __inline int hard_smp_processor_id(void)
 {
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 833216955f4a..ba0fcb34c8cd 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -61,12 +61,16 @@
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
 
-#ifdef CONFIG_PREEMPT
+#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
 # define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked())
+#else
+# define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
+#endif
+
+#ifdef CONFIG_PREEMPT
 # define preemptible()	(preempt_count() == 0 && !irqs_disabled())
 # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
 #else
-# define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
 # define preemptible()	0
 # define IRQ_EXIT_OFFSET HARDIRQ_OFFSET
 #endif
@@ -77,10 +81,10 @@ extern void synchronize_irq(unsigned int irq);
 # define synchronize_irq(irq)	barrier()
 #endif
 
-#define nmi_enter()		(preempt_count() += HARDIRQ_OFFSET)
-#define nmi_exit()		(preempt_count() -= HARDIRQ_OFFSET)
+#define nmi_enter()		irq_enter()
+#define nmi_exit()		sub_preempt_count(HARDIRQ_OFFSET)
 
-#define irq_enter()		(preempt_count() += HARDIRQ_OFFSET)
+#define irq_enter()		add_preempt_count(HARDIRQ_OFFSET)
 extern void irq_exit(void);
 
 #endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 6ef18a885b96..991831cff1da 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -70,9 +70,9 @@ extern void enable_irq(unsigned int irq);
 
 /* SoftIRQ primitives.  */
 #define local_bh_disable() \
-		do { preempt_count() += SOFTIRQ_OFFSET; barrier(); } while (0)
+		do { add_preempt_count(SOFTIRQ_OFFSET); barrier(); } while (0)
 #define __local_bh_enable() \
-		do { barrier(); preempt_count() -= SOFTIRQ_OFFSET; } while (0)
+		do { barrier(); sub_preempt_count(SOFTIRQ_OFFSET); } while (0)
 
 extern void local_bh_enable(void);
 
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index a7ad90136d64..dd98c54a23b4 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -9,17 +9,18 @@
 #include <linux/config.h>
 #include <linux/linkage.h>
 
-#define preempt_count()	(current_thread_info()->preempt_count)
+#ifdef CONFIG_DEBUG_PREEMPT
+  extern void fastcall add_preempt_count(int val);
+  extern void fastcall sub_preempt_count(int val);
+#else
+# define add_preempt_count(val)	do { preempt_count() += (val); } while (0)
+# define sub_preempt_count(val)	do { preempt_count() -= (val); } while (0)
+#endif
 
-#define inc_preempt_count() \
-do { \
-	preempt_count()++; \
-} while (0)
+#define inc_preempt_count() add_preempt_count(1)
+#define dec_preempt_count() sub_preempt_count(1)
 
-#define dec_preempt_count() \
-do { \
-	preempt_count()--; \
-} while (0)
+#define preempt_count()	(current_thread_info()->preempt_count)
 
 #ifdef CONFIG_PREEMPT
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index a4ca8abdbedb..c438ec9880e9 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -97,8 +97,10 @@ void smp_prepare_boot_cpu(void);
 /*
  *	These macros fold the SMP functionality into a single CPU system
  */
- 
-#define smp_processor_id()			0
+
+#if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT)
+# define smp_processor_id()			0
+#endif
 #define hard_smp_processor_id()			0
 #define smp_threads_ready			1
 #define smp_call_function(func,info,retry,wait)	({ 0; })
@@ -109,6 +111,33 @@ static inline void smp_send_reschedule(int cpu) { }
 
 #endif /* !SMP */
 
+/*
+ * DEBUG_PREEMPT support: check whether smp_processor_id() is being
+ * used in a preemption-safe way.
+ *
+ * An architecture has to enable this debugging code explicitly.
+ * It can do so by renaming the smp_processor_id() macro to
+ * __smp_processor_id().  This should only be done after some minimal
+ * testing, because usually there are a number of false positives
+ * that an architecture will trigger.
+ *
+ * To fix a false positive (i.e. smp_processor_id() use that the
+ * debugging code reports but which use for some reason is legal),
+ * change the smp_processor_id() reference to _smp_processor_id(),
+ * which is the nondebug variant.  NOTE: don't use this to hack around
+ * real bugs.
+ */
+#ifdef __smp_processor_id
+# if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+   extern unsigned int smp_processor_id(void);
+# else
+#  define smp_processor_id() __smp_processor_id()
+# endif
+# define _smp_processor_id() __smp_processor_id()
+#else
+# define _smp_processor_id() smp_processor_id()
+#endif
+
 #define get_cpu()		({ preempt_disable(); smp_processor_id(); })
 #define put_cpu()		preempt_enable()
 #define put_cpu_no_resched()	preempt_enable_no_resched()
diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h
index 8a142e0311a2..b63ce7014093 100644
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -9,15 +9,15 @@
 
 #define kernel_locked()		(current->lock_depth >= 0)
 
-extern int __lockfunc get_kernel_lock(void);
-extern void __lockfunc put_kernel_lock(void);
+extern int __lockfunc __reacquire_kernel_lock(void);
+extern void __lockfunc __release_kernel_lock(void);
 
 /*
  * Release/re-acquire global kernel lock for the scheduler
  */
 #define release_kernel_lock(tsk) do { 		\
 	if (unlikely((tsk)->lock_depth >= 0))	\
-		put_kernel_lock();		\
+		__release_kernel_lock();	\
 } while (0)
 
 /*
@@ -26,16 +26,16 @@ extern void __lockfunc put_kernel_lock(void);
  * reacquire_kernel_lock() so that the compiler can see
  * it at compile-time.
  */
-#ifdef CONFIG_SMP
-#define return_value_on_smp return
+#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_BKL)
+# define return_value_on_smp return
 #else
-#define return_value_on_smp
+# define return_value_on_smp
 #endif
 
 static inline int reacquire_kernel_lock(struct task_struct *task)
 {
 	if (unlikely(task->lock_depth >= 0))
-		return_value_on_smp get_kernel_lock();
+		return_value_on_smp __reacquire_kernel_lock();
 	return 0;
 }
 
diff --git a/include/net/route.h b/include/net/route.h
index 5e0100185d95..6228a91777dc 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -105,7 +105,7 @@ struct rt_cache_stat
 
 extern struct rt_cache_stat *rt_cache_stat;
 #define RT_CACHE_STAT_INC(field)					  \
-		(per_cpu_ptr(rt_cache_stat, smp_processor_id())->field++)
+		(per_cpu_ptr(rt_cache_stat, _smp_processor_id())->field++)
 
 extern struct ip_rt_acct *ip_rt_acct;
 
diff --git a/include/net/snmp.h b/include/net/snmp.h
index b7068876e0eb..a15ab256276e 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -128,18 +128,18 @@ struct linux_mib {
 #define SNMP_STAT_USRPTR(name)	(name[1])
 
 #define SNMP_INC_STATS_BH(mib, field) 	\
-	(per_cpu_ptr(mib[0], smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field]++)
 #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset)	\
-	(per_cpu_ptr(mib[0], smp_processor_id())->mibs[field + (offset)]++)
+	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field + (offset)]++)
 #define SNMP_INC_STATS_USER(mib, field) \
-	(per_cpu_ptr(mib[1], smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field]++)
 #define SNMP_INC_STATS(mib, field) 	\
-	(per_cpu_ptr(mib[!in_softirq()], smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]++)
 #define SNMP_DEC_STATS(mib, field) 	\
-	(per_cpu_ptr(mib[!in_softirq()], smp_processor_id())->mibs[field]--)
+	(per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]--)
 #define SNMP_ADD_STATS_BH(mib, field, addend) 	\
-	(per_cpu_ptr(mib[0], smp_processor_id())->mibs[field] += addend)
+	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field] += addend)
 #define SNMP_ADD_STATS_USER(mib, field, addend) 	\
-	(per_cpu_ptr(mib[1], smp_processor_id())->mibs[field] += addend)
+	(per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field] += addend)
 
 #endif
diff --git a/init/main.c b/init/main.c
index af4f0a75c8a5..6b273a7e274d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -444,6 +444,10 @@ asmlinkage void __init start_kernel(void)
 	 * time - but meanwhile we still have a functioning scheduler.
 	 */
 	sched_init();
+	/*
+	 * Disable preemption - early bootup scheduling is extremely
+	 * fragile until we cpu_idle() for the first time.
+	 */
 	preempt_disable();
 	build_all_zonelists();
 	page_alloc_init();
diff --git a/kernel/module.c b/kernel/module.c
index 887ed2510105..5a0f2024a9e4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -379,7 +379,7 @@ static void module_unload_init(struct module *mod)
 	for (i = 0; i < NR_CPUS; i++)
 		local_set(&mod->ref[i].count, 0);
 	/* Hold reference count during initialization. */
-	local_set(&mod->ref[smp_processor_id()].count, 1);
+	local_set(&mod->ref[_smp_processor_id()].count, 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
 }
diff --git a/kernel/printk.c b/kernel/printk.c
index d59c715677cb..eece774b4d19 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -645,8 +645,9 @@ void release_console_sem(void)
 		_con_start = con_start;
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
-		spin_unlock_irqrestore(&logbuf_lock, flags);
+		spin_unlock(&logbuf_lock);
 		call_console_drivers(_con_start, _log_end);
+		local_irq_restore(flags);
 	}
 	console_locked = 0;
 	console_may_schedule = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index a06ed79d562e..4f88a2c04e9e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2506,6 +2506,38 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 }
 #endif
 
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+
+void fastcall add_preempt_count(int val)
+{
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(((int)preempt_count() < 0));
+	preempt_count() += val;
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void fastcall sub_preempt_count(int val)
+{
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(val > preempt_count());
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
+	preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+
+#endif
+
 /*
  * schedule() is the main scheduler function.
  */
@@ -2688,7 +2720,10 @@ EXPORT_SYMBOL(schedule);
 asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();
-
+#ifdef CONFIG_PREEMPT_BKL
+	struct task_struct *task = current;
+	int saved_lock_depth;
+#endif
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task.  Just return..
@@ -2697,9 +2732,21 @@ asmlinkage void __sched preempt_schedule(void)
 		return;
 
 need_resched:
-	ti->preempt_count = PREEMPT_ACTIVE;
+	add_preempt_count(PREEMPT_ACTIVE);
+	/*
+	 * We keep the big kernel semaphore locked, but we
+	 * clear ->lock_depth so that schedule() doesnt
+	 * auto-release the semaphore:
+	 */
+#ifdef CONFIG_PREEMPT_BKL
+	saved_lock_depth = task->lock_depth;
+	task->lock_depth = -1;
+#endif
 	schedule();
-	ti->preempt_count = 0;
+#ifdef CONFIG_PREEMPT_BKL
+	task->lock_depth = saved_lock_depth;
+#endif
+	sub_preempt_count(PREEMPT_ACTIVE);
 
 	/* we could miss a preemption opportunity between schedule and now */
 	barrier();
@@ -3436,9 +3483,9 @@ asmlinkage long sys_sched_yield(void)
 static inline void __cond_resched(void)
 {
 	do {
-		preempt_count() += PREEMPT_ACTIVE;
+		add_preempt_count(PREEMPT_ACTIVE);
 		schedule();
-		preempt_count() -= PREEMPT_ACTIVE;
+		sub_preempt_count(PREEMPT_ACTIVE);
 	} while (need_resched());
 }
 
@@ -3522,7 +3569,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-	struct runqueue *rq = this_rq();
+	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
 
 	atomic_inc(&rq->nr_iowait);
 	schedule();
@@ -3533,7 +3580,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-	struct runqueue *rq = this_rq();
+	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
 	long ret;
 
 	atomic_inc(&rq->nr_iowait);
@@ -3741,7 +3788,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-#ifdef CONFIG_PREEMPT
+#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
 	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
 #else
 	idle->thread_info->preempt_count = 0;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8d6e9055e2b9..9e2d204da364 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -142,7 +142,7 @@ void local_bh_enable(void)
 	 * Keep preemption disabled until we are done with
 	 * softirq processing:
  	 */
-	preempt_count() -= SOFTIRQ_OFFSET - 1;
+ 	sub_preempt_count(SOFTIRQ_OFFSET - 1);
 
 	if (unlikely(!in_interrupt() && local_softirq_pending()))
 		do_softirq();
@@ -163,7 +163,7 @@ EXPORT_SYMBOL(local_bh_enable);
  */
 void irq_exit(void)
 {
-	preempt_count() -= IRQ_EXIT_OFFSET;
+	sub_preempt_count(IRQ_EXIT_OFFSET);
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();
 	preempt_enable_no_resched();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2ceea25f67f6..e31b1cb8e503 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -95,7 +95,7 @@ static int stop_machine(void)
 	stopmachine_state = STOPMACHINE_WAIT;
 
 	for_each_online_cpu(i) {
-		if (i == smp_processor_id())
+		if (i == _smp_processor_id())
 			continue;
 		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
 		if (ret < 0)
@@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
 
 	/* If they don't care which CPU fn runs on, bind to any online one. */
 	if (cpu == NR_CPUS)
-		cpu = smp_processor_id();
+		cpu = _smp_processor_id();
 
 	p = kthread_create(do_stop, &smdata, "kstopmachine");
 	if (!IS_ERR(p)) {
diff --git a/kernel/timer.c b/kernel/timer.c
index d0177a7f573a..ee1aa2ec7fe4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -465,7 +465,14 @@ repeat:
 			smp_wmb();
 			timer->base = NULL;
 			spin_unlock_irq(&base->lock);
-			fn(data);
+			{
+				u32 preempt_count = preempt_count();
+				fn(data);
+				if (preempt_count != preempt_count()) {
+					printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
+					BUG();
+				}
+			}
 			spin_lock_irq(&base->lock);
 			goto repeat;
 		}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d9da4ec792b4..850bb10ec91e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -48,6 +48,16 @@ config DEBUG_SLAB
 	  allocation as well as poisoning memory on free to catch use of freed
 	  memory. This can make kmalloc/kfree-intensive workloads much slower.
 
+config DEBUG_PREEMPT
+	bool "Debug preemptible kernel"
+	depends on PREEMPT && X86
+	default y
+	help
+	  If you say Y here then the kernel will use a debug variant of the
+	  commonly used smp_processor_id() function and will print warnings
+	  if kernel code uses it in a preemption-unsafe way. Also, the kernel
+	  will detect preemption count underflows.
+
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
 	depends on DEBUG_KERNEL && (ALPHA || ARM || X86 || IA64 || M32R || MIPS || PARISC || PPC32 || (SUPERH && !SUPERH64) || SPARC32 || SPARC64 || USERMODE || X86_64)
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 48dc05a13963..8819c12a74ee 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -7,6 +7,141 @@
  */
 #include <linux/smp_lock.h>
 #include <linux/module.h>
+#include <linux/kallsyms.h>
+
+#if defined(CONFIG_PREEMPT) && defined(__smp_processor_id) && \
+		defined(CONFIG_DEBUG_PREEMPT)
+
+/*
+ * Debugging check.
+ */
+unsigned int smp_processor_id(void)
+{
+	unsigned long preempt_count = preempt_count();
+	int this_cpu = __smp_processor_id();
+	cpumask_t this_mask;
+
+	if (likely(preempt_count))
+		goto out;
+
+	if (irqs_disabled())
+		goto out;
+
+	/*
+	 * Kernel threads bound to a single CPU can safely use
+	 * smp_processor_id():
+	 */
+	this_mask = cpumask_of_cpu(this_cpu);
+
+	if (cpus_equal(current->cpus_allowed, this_mask))
+		goto out;
+
+	/*
+	 * It is valid to assume CPU-locality during early bootup:
+	 */
+	if (system_state != SYSTEM_RUNNING)
+		goto out;
+
+	/*
+	 * Avoid recursion:
+	 */
+	preempt_disable();
+
+	if (!printk_ratelimit())
+		goto out_enable;
+
+	printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
+	print_symbol("caller is %s\n", (long)__builtin_return_address(0));
+	dump_stack();
+
+out_enable:
+	preempt_enable_no_resched();
+out:
+	return this_cpu;
+}
+
+EXPORT_SYMBOL(smp_processor_id);
+
+#endif /* PREEMPT && __smp_processor_id && DEBUG_PREEMPT */
+
+#ifdef CONFIG_PREEMPT_BKL
+/*
+ * The 'big kernel semaphore'
+ *
+ * This mutex is taken and released recursively by lock_kernel()
+ * and unlock_kernel().  It is transparently dropped and reaquired
+ * over schedule().  It is used to protect legacy code that hasn't
+ * been migrated to a proper locking design yet.
+ *
+ * Note: code locked by this semaphore will only be serialized against
+ * other code using the same locking facility. The code guarantees that
+ * the task remains on the same CPU.
+ *
+ * Don't use in new code.
+ */
+DECLARE_MUTEX(kernel_sem);
+
+/*
+ * Re-acquire the kernel semaphore.
+ *
+ * This function is called with preemption off.
+ *
+ * We are executing in schedule() so the code must be extremely careful
+ * about recursion, both due to the down() and due to the enabling of
+ * preemption. schedule() will re-check the preemption flag after
+ * reacquiring the semaphore.
+ */
+int __lockfunc __reacquire_kernel_lock(void)
+{
+	struct task_struct *task = current;
+	int saved_lock_depth = task->lock_depth;
+
+	BUG_ON(saved_lock_depth < 0);
+
+	task->lock_depth = -1;
+	preempt_enable_no_resched();
+
+	down(&kernel_sem);
+
+	preempt_disable();
+	task->lock_depth = saved_lock_depth;
+
+	return 0;
+}
+
+void __lockfunc __release_kernel_lock(void)
+{
+	up(&kernel_sem);
+}
+
+/*
+ * Getting the big kernel semaphore.
+ */
+void __lockfunc lock_kernel(void)
+{
+	struct task_struct *task = current;
+	int depth = task->lock_depth + 1;
+
+	if (likely(!depth))
+		/*
+		 * No recursion worries - we set up lock_depth _after_
+		 */
+		down(&kernel_sem);
+
+	task->lock_depth = depth;
+}
+
+void __lockfunc unlock_kernel(void)
+{
+	struct task_struct *task = current;
+
+	BUG_ON(task->lock_depth < 0);
+
+	if (likely(--task->lock_depth < 0))
+		up(&kernel_sem);
+}
+
+#else
 
 /*
  * The 'big kernel lock'
@@ -34,7 +169,7 @@ static spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
  * (This works on UP too - _raw_spin_trylock will never
  * return false in that case)
  */
-int __lockfunc get_kernel_lock(void)
+int __lockfunc __reacquire_kernel_lock(void)
 {
 	while (!_raw_spin_trylock(&kernel_flag)) {
 		if (test_thread_flag(TIF_NEED_RESCHED))
@@ -45,7 +180,7 @@ int __lockfunc get_kernel_lock(void)
 	return 0;
 }
 
-void __lockfunc put_kernel_lock(void)
+void __lockfunc __release_kernel_lock(void)
 {
 	_raw_spin_unlock(&kernel_flag);
 	preempt_enable_no_resched();
@@ -122,5 +257,8 @@ void __lockfunc unlock_kernel(void)
 		__unlock_kernel();
 }
 
+#endif
+
 EXPORT_SYMBOL(lock_kernel);
 EXPORT_SYMBOL(unlock_kernel);
+
author	Ingo Molnar <mingo@elte.hu>	2005-01-07 21:59:57 -0800
committer	Linus Torvalds <torvalds@evo.osdl.org>	2005-01-07 21:59:57 -0800
commit	fb8f6499abc6a847109d9602b797aa6afd2d5a3d (patch)
tree	9b23f9dde8826bb5df266ce9be81c1d51c6e804a
parent	8a1a48b7cd80de98d4d07ee1e78311a88c738335 (diff)