author     Ingo Molnar <mingo@elte.hu>              2005-01-07 21:49:02 -0800
committer  Linus Torvalds <torvalds@evo.osdl.org>   2005-01-07 21:49:02 -0800
commit     38e387ee01e5a57cd3ed84062930997b87fa3896 (patch)
tree       c3cbc19de0beeceb82408b03a27784e9e44ee701 /include/linux
parent     18f27594d0c5cd2da683252afc8d0933bd64a365 (diff)
[PATCH] improve preemption on SMP
SMP locking latencies are one of the last architectural problems that cause millisecond-category scheduling delays. CONFIG_PREEMPT tries to solve some of the SMP issues, but there are still lots of problems remaining: spinlocks nested at multiple levels, spinning with irqs turned off, and non-nested spinning with preemption turned off permanently.

The nesting problem goes like this: if a piece of kernel code (e.g. the MM or ext3's journalling code) does the following:

	spin_lock(&spinlock_1);
	...
	spin_lock(&spinlock_2);
	...

then even with CONFIG_PREEMPT enabled, current kernels may spin on spinlock_2 indefinitely. A number of critical sections break their long paths by using cond_resched_lock(), but this does not break the path on SMP, because need_resched() *of the other CPU* is not set, so cond_resched_lock() doesn't notice that a reschedule is due.

To solve this problem I've introduced a new spinlock field, lock->break_lock, which signals towards the holding CPU that a spinlock-break is requested by another CPU. This field is only set if a CPU is spinning in a spinlock function [at any locking depth], so the default overhead is zero. I've extended cond_resched_lock() to check for this flag - in this case we can also save a reschedule. I've added the lock_need_resched(lock) and need_lockbreak(lock) methods to check for the need to break out of a critical section.

Another latency problem was that the stock kernel, even with CONFIG_PREEMPT enabled, didn't have any spin-nicely preemption logic for the following, commonly used SMP locking primitives: read_lock(), spin_lock_irqsave(), spin_lock_irq(), spin_lock_bh(), read_lock_irqsave(), read_lock_irq(), read_lock_bh(), write_lock_irqsave(), write_lock_irq(), write_lock_bh(). Only spin_lock() and write_lock() [the two simplest cases] were covered. In addition to the preemption latency problems, the _irq() variants in the above list didn't do any IRQ-enabling while spinning - possibly resulting in excessive irqs-off sections of code!

preempt-smp.patch fixes all these latency problems by spinning irq-nicely (if possible) and by requesting lock-breaks if needed. Two architecture-level changes were necessary for this: the addition of the break_lock field to spinlock_t and rwlock_t, and the addition of the _raw_read_trylock() function.

Testing done by Mark H Johnson and myself indicates SMP latencies comparable to the UP kernel - while they were basically indefinitely high without this patch.

I successfully test-compiled and test-booted this patch on top of BK-curr using the following .config combinations: SMP && PREEMPT, !SMP && PREEMPT, SMP && !PREEMPT and !SMP && !PREEMPT on x86, !SMP && !PREEMPT and SMP && PREEMPT on x64. I also test-booted x86 with the generic_raw_read_trylock() function to check that it works fine. Essentially the same patch has been in testing as part of the voluntary-preempt patches for some time already.

NOTE to architecture maintainers: generic_raw_read_trylock() is a crude version that should be replaced with the proper arch-optimized version ASAP.

From: Hugh Dickins <hugh@veritas.com>

The i386 and x86_64 _raw_read_trylock() implementations in preempt-smp.patch are too successful - they can report success when they should fail: atomic_read() returns a signed integer.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
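[Editor's note] For illustration only, not part of the patch: a minimal sketch of how a long critical section might use the lock-break interface described above. The lock, the table and the function are hypothetical; only the lock_need_resched()/cond_resched_lock() calls come from the changelog.

	#include <linux/spinlock.h>
	#include <linux/sched.h>

	static spinlock_t my_table_lock = SPIN_LOCK_UNLOCKED;	/* hypothetical lock */

	static void clear_big_table(unsigned long *table, unsigned long nr)
	{
		unsigned long i;

		spin_lock(&my_table_lock);
		for (i = 0; i < nr; i++) {
			table[i] = 0;
			/*
			 * lock_need_resched() is need_lockbreak() || need_resched():
			 * true when another CPU is spinning on my_table_lock
			 * (lock->break_lock) or a reschedule is due on this CPU.
			 * cond_resched_lock() then drops the lock briefly so the
			 * spinner or a higher-priority task can make progress.
			 */
			if (lock_need_resched(&my_table_lock))
				cond_resched_lock(&my_table_lock);
		}
		spin_unlock(&my_table_lock);
	}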
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/sched.h    | 18
-rw-r--r--  include/linux/spinlock.h | 23
2 files changed, 18 insertions(+), 23 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26ee93b76485..425ee5e7c4b1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1072,23 +1072,7 @@ static inline void cond_resched(void)
__cond_resched();
}
-/*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-static inline void cond_resched_lock(spinlock_t * lock)
-{
- if (need_resched()) {
- _raw_spin_unlock(lock);
- preempt_enable_no_resched();
- __cond_resched();
- spin_lock(lock);
- }
-}
+extern int cond_resched_lock(spinlock_t * lock);
/* Reevaluate whether the task has signals pending delivery.
This is required every time the blocked sigset_t changes.
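[Editor's note] The out-of-line replacement for the deleted inline cond_resched_lock() is defined outside include/linux (presumably kernel/sched.c), so it does not appear in this diffstat. A sketch, assuming it keeps the old inline logic and adds the break_lock check described in the changelog:

	/* sketch - not part of the diff shown above */
	int cond_resched_lock(spinlock_t *lock)
	{
		int ret = 0;

		/* another CPU is spinning on this lock: let it in briefly */
		if (need_lockbreak(lock)) {
			spin_unlock(lock);
			cpu_relax();
			ret = 1;
			spin_lock(lock);
		}
		/* a reschedule is due on this CPU: drop the lock and schedule */
		if (need_resched()) {
			_raw_spin_unlock(lock);
			preempt_enable_no_resched();
			__cond_resched();
			ret = 1;
			spin_lock(lock);
		}
		return ret;
	}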
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 027c06f522bd..d6a455ed2dcb 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -41,6 +41,7 @@
#include <asm/spinlock.h>
int __lockfunc _spin_trylock(spinlock_t *lock);
+int __lockfunc _read_trylock(rwlock_t *lock);
int __lockfunc _write_trylock(rwlock_t *lock);
void __lockfunc _spin_lock(spinlock_t *lock) __acquires(spinlock_t);
@@ -73,6 +74,7 @@ void __lockfunc _write_unlock_irq(rwlock_t *lock) __releases(rwlock_t);
void __lockfunc _write_unlock_bh(rwlock_t *lock) __releases(rwlock_t);
int __lockfunc _spin_trylock_bh(spinlock_t *lock);
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock);
int in_lock_functions(unsigned long addr);
#else
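[Editor's note] generic_raw_read_trylock() is only declared here; its definition lives outside include/linux (presumably kernel/spinlock.c). A sketch of what the "crude" generic fallback flagged in the changelog's note to architecture maintainers could look like - an assumption, not part of the shown diff:

	/*
	 * Sketch - crude fallback: always "succeeds" by taking the read lock,
	 * so it can spin if a writer holds the lock. This is why the changelog
	 * asks for proper arch-optimized _raw_read_trylock() implementations.
	 */
	int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
	{
		_raw_read_lock(lock);
		return 1;
	}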
@@ -219,11 +221,15 @@ typedef struct {
#define _raw_read_unlock(lock) do { (void)(lock); } while(0)
#define _raw_write_lock(lock) do { (void)(lock); } while(0)
#define _raw_write_unlock(lock) do { (void)(lock); } while(0)
+#define _raw_read_trylock(lock) ({ (void)(lock); (1); })
#define _raw_write_trylock(lock) ({ (void)(lock); (1); })
#define _spin_trylock(lock) ({preempt_disable(); _raw_spin_trylock(lock) ? \
1 : ({preempt_enable(); 0;});})
+#define _read_trylock(lock) ({preempt_disable();_raw_read_trylock(lock) ? \
+ 1 : ({preempt_enable(); 0;});})
+
#define _write_trylock(lock) ({preempt_disable(); _raw_write_trylock(lock) ? \
1 : ({preempt_enable(); 0;});})
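[Editor's note] The new _read_trylock() wrapper above follows the existing trylock pattern: disable preemption, attempt the raw trylock, and re-enable preemption on failure. A hypothetical caller-side sketch of the read_trylock() API it backs (the lock, the variable and the function are made up for illustration):

	#include <linux/spinlock.h>
	#include <linux/errno.h>

	static rwlock_t my_rwlock = RW_LOCK_UNLOCKED;	/* hypothetical */
	static int my_shared_value;

	static int peek_value(int *out)
	{
		if (!read_trylock(&my_rwlock))
			return -EBUSY;		/* contended - caller can retry later */
		*out = my_shared_value;
		read_unlock(&my_rwlock);
		return 0;
	}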
@@ -425,16 +431,12 @@ do { \
* methods are defined as nops in the case they are not required.
*/
#define spin_trylock(lock) __cond_lock(_spin_trylock(lock))
+#define read_trylock(lock) __cond_lock(_read_trylock(lock))
#define write_trylock(lock) __cond_lock(_write_trylock(lock))
-/* Where's read_trylock? */
-
#define spin_lock(lock) _spin_lock(lock)
#define write_lock(lock) _write_lock(lock)
#define read_lock(lock) _read_lock(lock)
-#define spin_unlock(lock) _spin_unlock(lock)
-#define write_unlock(lock) _write_unlock(lock)
-#define read_unlock(lock) _read_unlock(lock)
#ifdef CONFIG_SMP
#define spin_lock_irqsave(lock, flags) flags = _spin_lock_irqsave(lock)
@@ -454,6 +456,11 @@ do { \
#define write_lock_irq(lock) _write_lock_irq(lock)
#define write_lock_bh(lock) _write_lock_bh(lock)
+
+#define spin_unlock(lock) _spin_unlock(lock)
+#define write_unlock(lock) _write_unlock(lock)
+#define read_unlock(lock) _read_unlock(lock)
+
#define spin_unlock_irqrestore(lock, flags) _spin_unlock_irqrestore(lock, flags)
#define spin_unlock_irq(lock) _spin_unlock_irq(lock)
#define spin_unlock_bh(lock) _spin_unlock_bh(lock)
@@ -490,6 +497,7 @@ extern void _metered_read_lock (rwlock_t *lock);
extern void _metered_read_unlock (rwlock_t *lock);
extern void _metered_write_lock (rwlock_t *lock);
extern void _metered_write_unlock (rwlock_t *lock);
+extern int _metered_read_trylock (rwlock_t *lock);
extern int _metered_write_trylock(rwlock_t *lock);
#endif
@@ -519,8 +527,11 @@ static inline void bit_spin_lock(int bitnum, unsigned long *addr)
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
while (test_and_set_bit(bitnum, addr)) {
- while (test_bit(bitnum, addr))
+ while (test_bit(bitnum, addr)) {
+ preempt_enable();
cpu_relax();
+ preempt_disable();
+ }
}
#endif
__acquire(bitlock);
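[Editor's note] The bit_spin_lock() change above applies the same "spin nicely" idea: while waiting for the bit, preemption is re-enabled around cpu_relax() so the current bit holder (or a higher-priority task) can run, and re-disabled before retrying test_and_set_bit(). For context, a hypothetical caller (the flag word and bit number are made up):

	#include <linux/spinlock.h>

	static unsigned long my_flags;	/* hypothetical word holding the bit lock */

	static void touch_object(void)
	{
		bit_spin_lock(0, &my_flags);	/* bit 0 used as a tiny lock */
		/* ... short critical section ... */
		bit_spin_unlock(0, &my_flags);
	}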