path: root/kernel/futex.c
blob: 4ef5ff0ff81b05487dfa3d47b9f3cc54a1598c77
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/futex.h>
#include <linux/highmem.h>
#include <asm/atomic.h>

/* Each of these mutexes is a very simple counter: the winner is the one who
   decrements from 1 to 0.  The counter starts at 1 when the lock is
   free.  A value other than 0 or 1 means someone may be sleeping.
   This is simple enough to work on all architectures, but has the
   problem that if we never "up" the semaphore it could eventually
   wrap around. */
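
/* Illustrative userspace fast path (a sketch, not part of this file:
   user_atomic_dec_return()/user_atomic_inc_return() are hypothetical
   userspace atomics, and futex() is a stub that enters sys_futex()
   below):

	int futex = 1;					// 1 == free

	void futex_lock(int *f)
	{
		if (user_atomic_dec_return(f) != 0)	// didn't go 1 -> 0
			futex(f, FUTEX_DOWN);		// sleep in the kernel
	}

	void futex_unlock(int *f)
	{
		if (user_atomic_inc_return(f) != 1)	// waiters left it < 0
			futex(f, FUTEX_UP);		// reset to 1, wake one
	}
*/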

/* FIXME: This may be way too small. --RR */
#define FUTEX_HASHBITS 6

/* We use this instead of a normal wait_queue_t, so we can wake only
   the relevant ones (hashed queues may be shared). */
struct futex_q {
	struct list_head list;
	struct task_struct *task;
	/* Page struct and offset within it. */
	struct page *page;
	unsigned int offset;
};

/* The key for the hash is the page address + the offset within the page */
static struct list_head futex_queues[1<<FUTEX_HASHBITS];
static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;

static inline struct list_head *hash_futex(struct page *page,
					   unsigned long offset)
{
	unsigned long h;

	/* struct page is shared, so we can hash on its address */
	h = (unsigned long)page + offset;
	return &futex_queues[hash_long(h, FUTEX_HASHBITS)];
}
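
/* Two different futexes can hash to the same bucket; the (page, offset)
   pair stored in each futex_q is what lets wake_one_waiter() below pick
   out only the matching sleepers. */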

static inline void wake_one_waiter(struct list_head *head,
				   struct page *page,
				   unsigned int offset)
{
	struct list_head *i;

	spin_lock(&futex_lock);
	list_for_each(i, head) {
		struct futex_q *this = list_entry(i, struct futex_q, list);

		if (this->page == page && this->offset == offset) {
			wake_up_process(this->task);
			break;
		}
	}
	spin_unlock(&futex_lock);
}

/* Add at end to avoid starvation */
static inline void queue_me(struct list_head *head,
			    struct futex_q *q,
			    struct page *page,
			    unsigned int offset)
{
	q->task = current;
	q->page = page;
	q->offset = offset;

	spin_lock(&futex_lock);
	list_add_tail(&q->list, head);
	spin_unlock(&futex_lock);
}

static inline void unqueue_me(struct futex_q *q)
{
	spin_lock(&futex_lock);
	list_del(&q->list);
	spin_unlock(&futex_lock);
}

/* Pin the user page so it cannot vanish beneath us (kmap() later
   turns it into a kernel address). */
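/* get_user_pages() returns the number of pages pinned (at most one
   here) or a negative error; sys_futex() drops the reference again
   with put_page(). */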
static struct page *pin_page(unsigned long page_start)
{
	struct mm_struct *mm = current->mm;
	struct page *page;
	int err;

	down_read(&mm->mmap_sem);
	err = get_user_pages(current, mm, page_start,
			     1 /* one page */,
			     1 /* writable */,
			     0 /* don't force */,
			     &page,
			     NULL /* don't return vmas */);
	up_read(&mm->mmap_sem);

	if (err < 0)
		return ERR_PTR(err);
	return page;
}

/* Try to decrement the user count to zero. */
static int decrement_to_zero(struct page *page, unsigned int offset)
{
	atomic_t *count;
	int ret = 0;

	count = kmap(page) + offset;
	/* If we take the semaphore from 1 to 0, it's ours.  If it's
	   zero, decrement anyway, to indicate we are waiting.  If
	   it's negative, don't decrement so we don't wrap... */
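	/* Worked transitions for the test below:
	      1 -> 0    we took the lock (return 1)
	      0 -> -1   lock held; our decrement records a waiter (return 0)
	     -1 -> -1   left alone, so the count cannot wrap (return 0) */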
	if (atomic_read(count) >= 0 && atomic_dec_and_test(count))
		ret = 1;
	kunmap(page);
	return ret;
}

/* Simplified from arch/ppc/kernel/semaphore.c: Paul M. is a genius. */
static int futex_down(struct list_head *head, struct page *page, int offset)
{
	int retval = 0;
	struct futex_q q;

	current->state = TASK_INTERRUPTIBLE;
	queue_me(head, &q, page, offset);

	while (!decrement_to_zero(page, offset)) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}
		schedule();
		current->state = TASK_INTERRUPTIBLE;
	}
	current->state = TASK_RUNNING;
	unqueue_me(&q);
	/* If we were signalled, we might have consumed a wakeup meant
	   for someone else, so pass it on.  If we got the lock, wake
	   another waiter (if any) so they pull the count below 0; that
	   way the eventual "up" in userspace knows it must enter the
	   kernel to hand the lock on. */
	wake_one_waiter(head, page, offset);
	return retval;
}

static int futex_up(struct list_head *head, struct page *page, int offset)
{
	atomic_t *count;

	count = kmap(page) + offset;
	atomic_set(count, 1);
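	/* Make the new count visible before any waiter is woken. */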
	smp_wmb();
	kunmap(page);
	wake_one_waiter(head, page, offset);
	return 0;
}

asmlinkage int sys_futex(void *uaddr, int op)
{
	int ret;
	unsigned long pos_in_page;
	struct list_head *head;
	struct page *page;

	pos_in_page = ((unsigned long)uaddr) % PAGE_SIZE;

	/* Must be "naturally" aligned, and not on page boundary. */
	if ((pos_in_page % __alignof__(atomic_t)) != 0
	    || pos_in_page + sizeof(atomic_t) > PAGE_SIZE)
		return -EINVAL;

	/* Simpler if it doesn't vanish underneath us. */
	page = pin_page((unsigned long)uaddr - pos_in_page);
	if (IS_ERR(page))
		return PTR_ERR(page);

	head = hash_futex(page, pos_in_page);
	switch (op) {
	case FUTEX_UP:
		ret = futex_up(head, page, pos_in_page);
		break;
	case FUTEX_DOWN:
		ret = futex_down(head, page, pos_in_page);
		break;
	/* Add other lock types here... */
	default:
		ret = -EINVAL;
	}
	put_page(page);

	return ret;
}
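
/* Hypothetical invocation from userspace (a sketch; syscall(),
   __NR_futex and the FUTEX_* constants are assumed to come from libc,
   the arch syscall table and <linux/futex.h>, not from this file):

	int lock = 1;				// 1 == free

	syscall(__NR_futex, &lock, FUTEX_DOWN);	// block until we own it
	syscall(__NR_futex, &lock, FUTEX_UP);	// release; wake one waiter
*/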

static int __init init(void)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(futex_queues); i++)
		INIT_LIST_HEAD(&futex_queues[i]);
	return 0;
}
__initcall(init);