author     Rusty Russell <rusty@rustcorp.com.au>           2002-03-09 19:50:14 -0800
committer  Linus Torvalds <torvalds@home.transmeta.com>    2002-03-09 19:50:14 -0800
commit     882ad449046cec136c484dd2b3659fb4c683e0a3
tree       363f84f0dc65135b84accd91a32e82e8f4810e68 /kernel/futex.c
parent     ee7ae3cac7e27634267cde1159e84b069d3c7d6f
[PATCH] Futexes IV (Fast Lightweight Userspace Semaphores)
Fast user-space mutex implementation, allowing user space to do all
of the normal handling and falling back to kernel space only when
there is lock contention.
The kernel-space implementation keeps no per-lock data structures;
instead, when a lock is contended, it does a fast hash on the physical
page and offset of the user-space lock to find the wait queue. There are
thus no setup/teardown costs, and no scalability costs with respect to
metadata.
Updated the syscall numbers for 2.5.6, and changed the FUTEX_UP/FUTEX_DOWN
definitions to be more logical for future expansion (e.g. r/w locks).
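For context, the user-space fast path this message describes might look like
the minimal sketch below. It is not part of the patch: the `futex()` wrapper is
a hypothetical helper around the raw syscall, and the GCC `__sync` builtins
stand in for whatever atomic primitive the platform actually provides. The
invariant is the one the kernel side relies on: the count is 1 when free,
0 when held, and negative when there are (or may be) waiters.

```c
/* Hypothetical user-space fast path (not part of this patch). */
#include <unistd.h>
#include <sys/syscall.h>        /* syscall(); __NR_futex is assumed here */
#include <linux/futex.h>        /* FUTEX_UP, FUTEX_DOWN */

/* Assumed wrapper around the new syscall: sys_futex(uaddr, op). */
static long futex(int *uaddr, int op)
{
        return syscall(__NR_futex, uaddr, op);
}

static void futex_lock(int *count)
{
        /* Fast path: take the count from 1 to 0 with no syscall at all. */
        if (__sync_sub_and_fetch(count, 1) == 0)
                return;
        /* Contended: the count is now below zero, marking a waiter.
         * The kernel queues us and retries the decrement-to-zero.
         * (A real implementation would also retry on -EINTR.) */
        futex(count, FUTEX_DOWN);
}

static void futex_unlock(int *count)
{
        /* Fast path: 0 -> 1 means nobody pushed the count negative. */
        if (__sync_add_and_fetch(count, 1) != 1)
                /* There is a waiter: the kernel resets the count to 1
                 * and wakes exactly one sleeping task. */
                futex(count, FUTEX_UP);
}
```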
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--  kernel/futex.c  232
1 file changed, 232 insertions(+), 0 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
new file mode 100644
index 000000000000..4ef5ff0ff81b
--- /dev/null
+++ b/kernel/futex.c
@@ -0,0 +1,232 @@
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/futex.h>
+#include <linux/highmem.h>
+#include <asm/atomic.h>
+
+/* These mutexes are a very simple counter: the winner is the one who
+   decrements from 1 to 0.  The counter starts at 1 when the lock is
+   free.  A value other than 0 or 1 means someone may be sleeping.
+   This is simple enough to work on all architectures, but has the
+   problem that if we never "up" the semaphore it could eventually
+   wrap around. */
+
+/* FIXME: This may be way too small. --RR */
+#define FUTEX_HASHBITS 6
+
+/* We use this instead of a normal wait_queue_t, so we can wake only
+   the relevant ones (hashed queues may be shared) */
+struct futex_q {
+        struct list_head list;
+        struct task_struct *task;
+        /* Page struct and offset within it. */
+        struct page *page;
+        unsigned int offset;
+};
+
+/* The key for the hash is the address + index + offset within page */
+static struct list_head futex_queues[1<<FUTEX_HASHBITS];
+static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
+
+static inline struct list_head *hash_futex(struct page *page,
+                                           unsigned long offset)
+{
+        unsigned long h;
+
+        /* struct page is shared, so we can hash on its address */
+        h = (unsigned long)page + offset;
+        return &futex_queues[hash_long(h, FUTEX_HASHBITS)];
+}
+
+static inline void wake_one_waiter(struct list_head *head,
+                                   struct page *page,
+                                   unsigned int offset)
+{
+        struct list_head *i;
+
+        spin_lock(&futex_lock);
+        list_for_each(i, head) {
+                struct futex_q *this = list_entry(i, struct futex_q, list);
+
+                if (this->page == page && this->offset == offset) {
+                        wake_up_process(this->task);
+                        break;
+                }
+        }
+        spin_unlock(&futex_lock);
+}
+
+/* Add at end to avoid starvation */
+static inline void queue_me(struct list_head *head,
+                            struct futex_q *q,
+                            struct page *page,
+                            unsigned int offset)
+{
+        q->task = current;
+        q->page = page;
+        q->offset = offset;
+
+        spin_lock(&futex_lock);
+        list_add_tail(&q->list, head);
+        spin_unlock(&futex_lock);
+}
+
+static inline void unqueue_me(struct futex_q *q)
+{
+        spin_lock(&futex_lock);
+        list_del(&q->list);
+        spin_unlock(&futex_lock);
+}
+
+/* Get kernel address of the user page and pin it. */
+static struct page *pin_page(unsigned long page_start)
+{
+        struct mm_struct *mm = current->mm;
+        struct page *page;
+        int err;
+
+        down_read(&mm->mmap_sem);
+        err = get_user_pages(current, current->mm, page_start,
+                             1 /* one page */,
+                             1 /* writable */,
+                             0 /* don't force */,
+                             &page,
+                             NULL /* don't return vmas */);
+        up_read(&mm->mmap_sem);
+
+        if (err < 0)
+                return ERR_PTR(err);
+        return page;
+}
+
+/* Try to decrement the user count to zero. */
+static int decrement_to_zero(struct page *page, unsigned int offset)
+{
+        atomic_t *count;
+        int ret = 0;
+
+        count = kmap(page) + offset;
+        /* If we take the semaphore from 1 to 0, it's ours.  If it's
+           zero, decrement anyway, to indicate we are waiting.  If
+           it's negative, don't decrement so we don't wrap... */
+        if (atomic_read(count) >= 0 && atomic_dec_and_test(count))
+                ret = 1;
+        kunmap(page);
+        return ret;
+}
+
+/* Simplified from arch/ppc/kernel/semaphore.c: Paul M. is a genius. */
+static int futex_down(struct list_head *head, struct page *page, int offset)
+{
+        int retval = 0;
+        struct futex_q q;
+
+        current->state = TASK_INTERRUPTIBLE;
+        queue_me(head, &q, page, offset);
+
+        while (!decrement_to_zero(page, offset)) {
+                if (signal_pending(current)) {
+                        retval = -EINTR;
+                        break;
+                }
+                schedule();
+                current->state = TASK_INTERRUPTIBLE;
+        }
+        current->state = TASK_RUNNING;
+        unqueue_me(&q);
+        /* If we were signalled, we might have just been woken: we
+           must wake another one.  Otherwise we need to wake someone
+           else (if they are waiting) so they drop the count below 0,
+           and when we "up" in userspace, we know there is a
+           waiter. */
+        wake_one_waiter(head, page, offset);
+        return retval;
+}
+
+static int futex_up(struct list_head *head, struct page *page, int offset)
+{
+        atomic_t *count;
+
+        count = kmap(page) + offset;
+        atomic_set(count, 1);
+        smp_wmb();
+        kunmap(page);
+        wake_one_waiter(head, page, offset);
+        return 0;
+}
+
+asmlinkage int sys_futex(void *uaddr, int op)
+{
+        int ret;
+        unsigned long pos_in_page;
+        struct list_head *head;
+        struct page *page;
+
+        pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE;
+
+        /* Must be "naturally" aligned, and not on page boundary. */
+        if ((pos_in_page % __alignof__(atomic_t)) != 0
+            || pos_in_page + sizeof(atomic_t) > PAGE_SIZE)
+                return -EINVAL;
+
+        /* Simpler if it doesn't vanish underneath us. */
+        page = pin_page((unsigned long)uaddr - pos_in_page);
+        if (IS_ERR(page))
+                return PTR_ERR(page);
+
+        head = hash_futex(page, pos_in_page);
+        switch (op) {
+        case FUTEX_UP:
+                ret = futex_up(head, page, pos_in_page);
+                break;
+        case FUTEX_DOWN:
+                ret = futex_down(head, page, pos_in_page);
+                break;
+        /* Add other lock types here... */
+        default:
+                ret = -EINVAL;
+        }
+        put_page(page);
+
+        return ret;
+}
+
+static int __init init(void)
+{
+        unsigned int i;
+
+        for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
+                INIT_LIST_HEAD(&futex_queues[i]);
+        return 0;
+}
+__initcall(init);
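Continuing the hypothetical sketch above, a caller would keep the lock word in
memory shared between the contending tasks and initialize it to 1; only the
contended operations ever enter the kernel. Note that an mmap()ed word is
page-aligned, so it trivially satisfies sys_futex()'s alignment check.

```c
/* Hypothetical usage of the futex_lock()/futex_unlock() helpers
 * sketched earlier; forked children would contend on the same word. */
#include <sys/mman.h>

int main(void)
{
        int *lock = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        if (lock == MAP_FAILED)
                return 1;
        *lock = 1;      /* 1 == free, 0 == held, < 0 == held with waiters */

        futex_lock(lock);
        /* ... critical section: at most one task runs this at a time ... */
        futex_unlock(lock);
        return 0;
}
```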
