| author | Ingo Molnar <mingo@elte.hu> | 2005-01-07 21:49:02 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@evo.osdl.org> | 2005-01-07 21:49:02 -0800 |
| commit | 38e387ee01e5a57cd3ed84062930997b87fa3896 | |
| tree | c3cbc19de0beeceb82408b03a27784e9e44ee701 /include/linux | |
| parent | 18f27594d0c5cd2da683252afc8d0933bd64a365 | |
[PATCH] improve preemption on SMP
SMP locking latencies are one of the last architectural problems that cause
millisecond-category scheduling delays. CONFIG_PREEMPT tries to solve some of
the SMP issues, but plenty of problems remain: spinlocks nested at multiple
levels, spinning with irqs turned off, and non-nested spinning with
preemption turned off permanently.
The nesting problem goes like this: if a piece of kernel code (e.g. the MM
or ext3's journalling code) does the following:
    spin_lock(&spinlock_1);
    ...
    spin_lock(&spinlock_2);
    ...
then even with CONFIG_PREEMPT enabled, current kernels may spin on
spinlock_2 indefinitely. A number of critical sections break their long
paths by using cond_resched_lock(), but this does not break the path on
SMP, because need_resched() *of the other CPU* is not set, so
cond_resched_lock() doesn't notice that a reschedule is due.
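To make the failure mode concrete, here is a hypothetical long critical
section of the kind described above (struct bucket, process_bucket() and
scan_table() are illustrative names, not from the patch): cond_resched_lock()
is the only latency-break point in the loop, and before this patch it only
fired when need_resched() was set on the local CPU, so another CPU spinning
on the same lock could wait for the entire scan.

    #include <linux/spinlock.h>
    #include <linux/sched.h>

    struct bucket { int data; };                 /* hypothetical payload */

    static void process_bucket(struct bucket *b) /* hypothetical per-item work */
    {
        b->data++;
    }

    static void scan_table(struct bucket *table, int nr, spinlock_t *lock)
    {
        int i;

        spin_lock(lock);
        for (i = 0; i < nr; i++) {
            process_bucket(&table[i]);  /* work done while holding the lock */

            /* The only latency-break point: may drop 'lock', schedule,
             * and re-acquire it. */
            cond_resched_lock(lock);
        }
        spin_unlock(lock);
    }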
To solve this problem I've introduced a new spinlock field,
lock->break_lock, which signals towards the holding CPU that a
spinlock-break is requested by another CPU. This field is only set if a
CPU is spinning in a spinlock function [at any locking depth], so the
default overhead is zero. I've extended cond_resched_lock() to check for
this flag - in this case we can also save a reschedule. I've added the
lock_need_resched(lock) and need_lockbreak(lock) methods to check for the
need to break out of a critical section.
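A minimal sketch of what the SMP-aware cond_resched_lock() can now do,
assuming the break_lock/need_lockbreak() interface described above (a
simplification, not necessarily the exact out-of-line code the patch adds):
it breaks the critical section either when a local reschedule is pending or
when another CPU has flagged the lock.

    #include <linux/spinlock.h>
    #include <linux/sched.h>

    /* Sketch only - simplified from the behaviour described above. */
    int cond_resched_lock(spinlock_t *lock)
    {
        /* Another CPU is spinning on this lock: briefly release it. */
        if (need_lockbreak(lock)) {
            spin_unlock(lock);
            cpu_relax();
            spin_lock(lock);
        }
        /* Local reschedule pending: drop the lock and schedule. */
        if (need_resched()) {
            _raw_spin_unlock(lock);
            preempt_enable_no_resched();
            __cond_resched();
            spin_lock(lock);
            return 1;
        }
        return 0;
    }

lock_need_resched(lock) presumably just combines the two tests
(need_lockbreak(lock) || need_resched()) so callers can poll cheaply.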
Another latency problem was that the stock kernel, even with CONFIG_PREEMPT
enabled, didn't have any spin-nicely preemption logic for the following,
commonly used SMP locking primitives: read_lock(), spin_lock_irqsave(),
spin_lock_irq(), spin_lock_bh(), read_lock_irqsave(), read_lock_irq(),
read_lock_bh(), write_lock_irqsave(), write_lock_irq(), write_lock_bh().
Only spin_lock() and write_lock() [the two simplest cases] were covered.
In addition to the preemption latency problems, the _irq() variants in the
above list didn't do any IRQ-enabling while spinning - possibly resulting in
excessive irqs-off sections of code!
preempt-smp.patch fixes all these latency problems by spinning irq-nicely
(if possible) and by requesting lock-breaks if needed. Two
architecture-level changes were necessary for this: the addition of the
break_lock field to spinlock_t and rwlock_t, and the addition of the
_raw_read_trylock() function.
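As an illustration of the "spin irq-nicely" idea for one of the previously
uncovered primitives, here is a simplified sketch of a contended
spin_lock_irq() slow path (a sketch of the approach, not the patch's exact
generic locking code): the fast path keeps irqs disabled, and only the
contended spin re-enables irqs and preemption while asking the lock holder
to break out via break_lock.

    #include <linux/spinlock.h>

    /* Sketch only - simplified slow path for spin_lock_irq() on SMP. */
    void __lockfunc _spin_lock_irq(spinlock_t *lock)
    {
        for (;;) {
            preempt_disable();
            local_irq_disable();
            if (likely(_raw_spin_trylock(lock)))
                return;                 /* acquired; irqs stay disabled */

            /* Contended: spin with irqs and preemption enabled. */
            local_irq_enable();
            preempt_enable();
            lock->break_lock = 1;       /* request a lock-break from the holder */
            while (spin_is_locked(lock))
                cpu_relax();
        }
    }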
Testing done by Mark H Johnson and myself indicates SMP latencies comparable
to the UP kernel - while they were essentially unbounded without this
patch.
I successfully test-compiled and test-booted this patch on top of BK-curr
using the following .config combinations: SMP && PREEMPT, !SMP && PREEMPT,
SMP && !PREEMPT and !SMP && !PREEMPT on x86, and !SMP && !PREEMPT and SMP &&
PREEMPT on x64. I also test-booted x86 with the generic_raw_read_trylock()
function to check that it works fine. Essentially the same patch has been
in testing as part of the voluntary-preempt patches for some time already.
NOTE to architecture maintainers: generic_raw_read_trylock() is a crude
version that should be replaced with the proper arch-optimized version
ASAP.
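For reference, such a crude fallback can be as simple as a "trylock" that
never fails because it just takes the read lock; the sketch below shows that
shape (architectures should provide a real _raw_read_trylock() that can
actually fail).

    #include <linux/spinlock.h>

    /* Crude generic fallback: spins until the read lock is taken, then
     * reports success - hence "should be replaced ... ASAP". */
    int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
    {
        _raw_read_lock(lock);
        return 1;
    }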
From: Hugh Dickins <hugh@veritas.com>
The i386 and x86_64 _raw_read_trylocks in preempt-smp.patch are too
successful: atomic_read() returns a signed integer.
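To illustrate the pitfall being pointed at (a simplified example, not the
actual i386/x86_64 code): with the usual RW_LOCK_BIAS counter scheme a held
write lock drives the counter to zero or below, so after the reader's
speculative decrement the counter is negative exactly when a writer owns the
lock - and that condition is only visible if the test is done on the signed
value that atomic_read() returns.

    #include <linux/spinlock.h>
    #include <asm/atomic.h>

    /* Illustrative only - example_read_trylock() is a hypothetical name. */
    static inline int example_read_trylock(rwlock_t *lock)
    {
        atomic_t *count = (atomic_t *)lock; /* i386-style: lock word as a counter */

        atomic_dec(count);
        if (atomic_read(count) >= 0)        /* signed test: negative => writer */
            return 1;
        atomic_inc(count);                  /* back off, a writer owns the lock */
        return 0;
    }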
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include/linux')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | include/linux/sched.h | 18 |
| -rw-r--r-- | include/linux/spinlock.h | 23 |

2 files changed, 18 insertions, 23 deletions

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26ee93b76485..425ee5e7c4b1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1072,23 +1072,7 @@ static inline void cond_resched(void)
 	__cond_resched();
 }
 
-/*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-static inline void cond_resched_lock(spinlock_t * lock)
-{
-	if (need_resched()) {
-		_raw_spin_unlock(lock);
-		preempt_enable_no_resched();
-		__cond_resched();
-		spin_lock(lock);
-	}
-}
+extern int cond_resched_lock(spinlock_t * lock);
 
 /* Reevaluate whether the task has signals pending delivery.
    This is required every time the blocked sigset_t changes.
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 027c06f522bd..d6a455ed2dcb 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -41,6 +41,7 @@
 #include <asm/spinlock.h>
 
 int __lockfunc _spin_trylock(spinlock_t *lock);
+int __lockfunc _read_trylock(rwlock_t *lock);
 int __lockfunc _write_trylock(rwlock_t *lock);
 
 void __lockfunc _spin_lock(spinlock_t *lock)	__acquires(spinlock_t);
@@ -73,6 +74,7 @@
 void __lockfunc _write_unlock_irq(rwlock_t *lock)	__releases(rwlock_t);
 void __lockfunc _write_unlock_bh(rwlock_t *lock)	__releases(rwlock_t);
 int __lockfunc _spin_trylock_bh(spinlock_t *lock);
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock);
 int in_lock_functions(unsigned long addr);
 
 #else
@@ -219,11 +221,15 @@ typedef struct {
 #define _raw_read_unlock(lock)	do { (void)(lock); } while(0)
 #define _raw_write_lock(lock)	do { (void)(lock); } while(0)
 #define _raw_write_unlock(lock)	do { (void)(lock); } while(0)
+#define _raw_read_trylock(lock)	({ (void)(lock); (1); })
 #define _raw_write_trylock(lock) ({ (void)(lock); (1); })
 
 #define _spin_trylock(lock)	({preempt_disable(); _raw_spin_trylock(lock) ? \
 				1 : ({preempt_enable(); 0;});})
 
+#define _read_trylock(lock)	({preempt_disable();_raw_read_trylock(lock) ? \
+				1 : ({preempt_enable(); 0;});})
+
 #define _write_trylock(lock)	({preempt_disable(); _raw_write_trylock(lock) ? \
 				1 : ({preempt_enable(); 0;});})
 
@@ -425,16 +431,12 @@ do { \
  * methods are defined as nops in the case they are not required.
  */
 #define spin_trylock(lock)	__cond_lock(_spin_trylock(lock))
+#define read_trylock(lock)	__cond_lock(_read_trylock(lock))
 #define write_trylock(lock)	__cond_lock(_write_trylock(lock))
 
-/* Where's read_trylock? */
-
 #define spin_lock(lock)		_spin_lock(lock)
 #define write_lock(lock)	_write_lock(lock)
 #define read_lock(lock)		_read_lock(lock)
-#define spin_unlock(lock)	_spin_unlock(lock)
-#define write_unlock(lock)	_write_unlock(lock)
-#define read_unlock(lock)	_read_unlock(lock)
 
 #ifdef CONFIG_SMP
 #define spin_lock_irqsave(lock, flags)	flags = _spin_lock_irqsave(lock)
@@ -454,6 +456,11 @@ do { \
 #define write_lock_irq(lock)		_write_lock_irq(lock)
 #define write_lock_bh(lock)		_write_lock_bh(lock)
+
+#define spin_unlock(lock)	_spin_unlock(lock)
+#define write_unlock(lock)	_write_unlock(lock)
+#define read_unlock(lock)	_read_unlock(lock)
+
 #define spin_unlock_irqrestore(lock, flags)	_spin_unlock_irqrestore(lock, flags)
 #define spin_unlock_irq(lock)		_spin_unlock_irq(lock)
 #define spin_unlock_bh(lock)		_spin_unlock_bh(lock)
@@ -490,6 +497,7 @@ extern void _metered_read_lock    (rwlock_t *lock);
 extern void _metered_read_unlock  (rwlock_t *lock);
 extern void _metered_write_lock   (rwlock_t *lock);
 extern void _metered_write_unlock (rwlock_t *lock);
+extern int  _metered_read_trylock (rwlock_t *lock);
 extern int  _metered_write_trylock(rwlock_t *lock);
 #endif
 
@@ -519,8 +527,11 @@ static inline void bit_spin_lock(int bitnum, unsigned long *addr)
 	preempt_disable();
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 	while (test_and_set_bit(bitnum, addr)) {
-		while (test_bit(bitnum, addr))
+		while (test_bit(bitnum, addr)) {
+			preempt_enable();
 			cpu_relax();
+			preempt_disable();
+		}
 	}
 #endif
 	__acquire(bitlock);
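With the diff above, read_trylock() finally exists alongside spin_trylock()
and write_trylock(). A minimal, hypothetical caller (try_peek_state() is an
illustrative name, not from the patch) takes the read side opportunistically
and tells its caller to come back later if a writer is in the way.

    #include <linux/spinlock.h>

    /* Hypothetical example of the newly-added read_trylock(). */
    static int try_peek_state(rwlock_t *lock, const int *state, int *out)
    {
        if (!read_trylock(lock))
            return 0;           /* contended - caller retries later */
        *out = *state;          /* read shared data under the read lock */
        read_unlock(lock);
        return 1;
    }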
