* (C) Copyright 2003 Red Hat Inc, All Rights Reserved
*
* Removed page pinning, fixed privately mapped COW pages and other cleanups
- * (C) Copyright 2003 Jamie Lokier
+ * (C) Copyright 2003, 2004 Jamie Lokier
*
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
+#include <linux/signal.h>
+#include <asm/futex.h>
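+
+/* On CONFIG_BASE_SMALL systems the hash table shrinks to 16 buckets to
+ * save memory; everyone else keeps the original 256 buckets. */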
-#define FUTEX_HASHBITS 8
+#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
* Futexes are matched on equal values of this key.
*/
struct futex_hash_bucket {
spinlock_t lock;
- unsigned int nqueued;
struct list_head chain;
};
* from swap. But that's a lot of code to duplicate here
* for a rare case, so we simply fetch the page.
*/
-
- /*
- * Do a quick atomic lookup first - this is the fastpath.
- */
- spin_lock(&current->mm->page_table_lock);
- page = follow_page(mm, uaddr, 0);
- if (likely(page != NULL)) {
- key->shared.pgoff =
- page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- spin_unlock(&current->mm->page_table_lock);
- return 0;
- }
- spin_unlock(&current->mm->page_table_lock);
-
- /*
- * Do it the general way.
- */
err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
if (err >= 0) {
key->shared.pgoff =
}
}
+static inline int get_futex_value_locked(int *dest, int __user *from)
+{
+ int ret;
+
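+ /*
+ * The hash bucket lock is held here, so we must not sleep in a
+ * page fault. Raising the preempt count makes the in-atomic copy
+ * fail with -EFAULT on a missing page instead of faulting it in;
+ * callers then drop their locks and retry with get_user().
+ */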
+ inc_preempt_count();
+ ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+ dec_preempt_count();
+
+ return ret ? -EFAULT : 0;
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
/*
* The waiting task can free the futex_q as soon as this is written,
* without taking any locks. This must come last.
+ *
+ * A memory barrier is required here to prevent the following store
+ * to lock_ptr from getting ahead of the wakeup. Clearing the lock
+ * at the end of wake_up_all() does not prevent this store from
+ * moving.
*/
+ wmb();
q->lock_ptr = NULL;
}
return ret;
}
+/*
+ * Wake up to nr_wake waiters hashed on uaddr1, atomically apply the
+ * operation encoded in op to *uaddr2 and, if the encoded comparison
+ * on the old value holds, also wake up to nr_wake2 waiters on uaddr2:
+ */
+static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2,
+			 int nr_wake, int nr_wake2, int op)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *bh1, *bh2;
+ struct list_head *head;
+ struct futex_q *this, *next;
+ int ret, op_ret, attempt = 0;
+
+retryfull:
+ down_read(&current->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr1, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ bh1 = hash_futex(&key1);
+ bh2 = hash_futex(&key2);
+
+retry:
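+ /*
+ * Take both bucket locks in address order to avoid an ABBA deadlock
+ * against another caller locking the same pair of buckets in the
+ * opposite order. If both futexes hash to the same bucket, only
+ * one lock is taken.
+ */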
+ if (bh1 < bh2)
+ spin_lock(&bh1->lock);
+ spin_lock(&bh2->lock);
+ if (bh1 > bh2)
+ spin_lock(&bh1->lock);
+
+ op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+ if (unlikely(op_ret < 0)) {
+ int dummy;
+
+ spin_unlock(&bh1->lock);
+ if (bh1 != bh2)
+ spin_unlock(&bh2->lock);
+
+#ifndef CONFIG_MMU
+ /* We don't get EFAULT from MMU faults if we don't have an MMU,
+ * but we might get it from range checking. */
+ ret = op_ret;
+ goto out;
+#endif
+
+ if (unlikely(op_ret != -EFAULT)) {
+ ret = op_ret;
+ goto out;
+ }
+
+ /* futex_atomic_op_inuser needs to both read and write
+ * *(int __user *)uaddr2, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem. */
+ if (attempt++) {
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+
+ ret = -EFAULT;
+ if (attempt >= 2 ||
+ !(vma = find_vma(mm, uaddr2)) ||
+ vma->vm_start > uaddr2 ||
+ !(vma->vm_flags & VM_WRITE))
+ goto out;
+
+ switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
+ case VM_FAULT_MINOR:
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ current->maj_flt++;
+ break;
+ default:
+ goto out;
+ }
+ goto retry;
+ }
+
+ /* If we would have faulted, release mmap_sem,
+ * fault it in and start all over again. */
+ up_read(&current->mm->mmap_sem);
+
+ ret = get_user(dummy, (int __user *)uaddr2);
+ if (ret)
+ return ret;
+
+ goto retryfull;
+ }
+
+ head = &bh1->chain;
+
+ list_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &key1)) {
+ wake_futex(this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ if (op_ret > 0) {
+ head = &bh2->chain;
+
+ op_ret = 0;
+ list_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &key2)) {
+ wake_futex(this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
+
+ spin_unlock(&bh1->lock);
+ if (bh1 != bh2)
+ spin_unlock(&bh2->lock);
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
+}
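+
+/*
+ * An illustrative userspace sketch of the new operation, assuming the
+ * FUTEX_OP() encoding and the FUTEX_OP_* constants from <linux/futex.h>:
+ *
+ *	#include <linux/futex.h>
+ *	#include <sys/syscall.h>
+ *	#include <unistd.h>
+ *
+ *	int futex1, futex2;
+ *
+ *	// In a single syscall, atomically perform
+ *	//	oldval = futex2; futex2 = oldval + 1;
+ *	// then wake one waiter on futex1 and, if oldval > 0, also one
+ *	// waiter on futex2. The timeout argument slot carries nr_wake2.
+ *	syscall(SYS_futex, &futex1, FUTEX_WAKE_OP, 1, 1, &futex2,
+ *		FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0));
+ */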
+
/*
* Requeue all waiters hashed on one physical page to another
* physical page.
struct list_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
- unsigned int nqueued;
+ retry:
down_read(&current->mm->mmap_sem);
ret = get_futex_key(uaddr1, &key1);
bh1 = hash_futex(&key1);
bh2 = hash_futex(&key2);
- nqueued = bh1->nqueued;
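+ /* As in futex_wake_op(), take both bucket locks in address order: */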
+ if (bh1 < bh2)
+ spin_lock(&bh1->lock);
+ spin_lock(&bh2->lock);
+ if (bh1 > bh2)
+ spin_lock(&bh1->lock);
+
if (likely(valp != NULL)) {
int curval;
- /* In order to avoid doing get_user while
- holding bh1->lock and bh2->lock, nqueued
- (monotonically increasing field) must be first
- read, then *uaddr1 fetched from userland and
- after acquiring lock nqueued field compared with
- the stored value. The smp_mb () below
- makes sure that bh1->nqueued is read from memory
- before *uaddr1. */
- smp_mb();
-
- if (get_user(curval, (int __user *)uaddr1) != 0) {
- ret = -EFAULT;
- goto out;
+ ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&bh1->lock);
+ if (bh1 != bh2)
+ spin_unlock(&bh2->lock);
+
+ /* If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ up_read(&current->mm->mmap_sem);
+
+ ret = get_user(curval, (int __user *)uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
}
if (curval != *valp) {
ret = -EAGAIN;
- goto out;
+ goto out_unlock;
}
}
- if (bh1 < bh2)
- spin_lock(&bh1->lock);
- spin_lock(&bh2->lock);
- if (bh1 > bh2)
- spin_lock(&bh1->lock);
-
- if (unlikely(nqueued != bh1->nqueued && valp != NULL)) {
- ret = -EAGAIN;
- goto out_unlock;
- }
-
head1 = &bh1->chain;
list_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
return ret;
}
-/*
- * queue_me and unqueue_me must be called as a pair, each
- * exactly once. They are called with the hashed spinlock held.
- */
-
/* The key must be already stored in q->key. */
-static void queue_me(struct futex_q *q, int fd, struct file *filp)
+static inline struct futex_hash_bucket *
+queue_lock(struct futex_q *q, int fd, struct file *filp)
{
struct futex_hash_bucket *bh;
q->lock_ptr = &bh->lock;
spin_lock(&bh->lock);
- bh->nqueued++;
+ return bh;
+}
+
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+{
list_add_tail(&q->list, &bh->chain);
spin_unlock(&bh->lock);
}
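+
+/*
+ * queue_lock() takes the hash bucket lock and key references;
+ * __queue_me() commits the futex_q to the chain and drops the lock;
+ * queue_unlock() backs out of an uncommitted queue_lock(), releasing
+ * the lock and the key references again.
+ */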
+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+{
+ spin_unlock(&bh->lock);
+ drop_key_refs(&q->key);
+}
+
+/*
+ * queue_me and unqueue_me must be called as a pair, each
+ * exactly once. Each takes and releases the hash bucket lock
+ * internally.
+ */
+
+/* The key must be already stored in q->key. */
+static void queue_me(struct futex_q *q, int fd, struct file *filp)
+{
+ struct futex_hash_bucket *bh;
+ bh = queue_lock(q, fd, filp);
+ __queue_me(q, bh);
+}
+
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
/* In the common case we don't take the spinlock, which is nice. */
retry:
lock_ptr = q->lock_ptr;
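+ /*
+ * The barrier keeps the compiler from reloading q->lock_ptr after
+ * the NULL check below: a waker can clear it at any time, and we
+ * must test and lock one consistent snapshot of the pointer.
+ */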
+ barrier();
if (lock_ptr != 0) {
spin_lock(lock_ptr);
/*
DECLARE_WAITQUEUE(wait, current);
int ret, curval;
struct futex_q q;
+ struct futex_hash_bucket *bh;
+ retry:
down_read(&current->mm->mmap_sem);
ret = get_futex_key(uaddr, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
- queue_me(&q, -1, NULL);
+ bh = queue_lock(&q, -1, NULL);
/*
- * Access the page after the futex is queued.
+ * Access the page AFTER the futex is queued.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we queued after testing *uaddr, that would open
+ * a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * A consequence is that futex_wait() can return zero and absorb
+ * a wakeup when *uaddr != val on entry to the syscall. This is
+ * rare, but normal.
+ *
* We hold the mmap semaphore, so the mapping cannot have changed
- * since we looked it up.
+ * since we looked it up in get_futex_key.
*/
- if (get_user(curval, (int __user *)uaddr) != 0) {
- ret = -EFAULT;
- goto out_unqueue;
+
+ ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+
+ if (unlikely(ret)) {
+ queue_unlock(&q, bh);
+
+ /* If we would have faulted, release mmap_sem, fault it in and
+ * start all over again.
+ */
+ up_read(&current->mm->mmap_sem);
+
+ ret = get_user(curval, (int __user *)uaddr);
+
+ if (!ret)
+ goto retry;
+ return ret;
}
if (curval != val) {
ret = -EWOULDBLOCK;
- goto out_unqueue;
+ queue_unlock(&q, bh);
+ goto out_release_sem;
}
+ /* Only actually queue if *uaddr contained val. */
+ __queue_me(&q, bh);
+
/*
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
return 0;
if (time == 0)
return -ETIMEDOUT;
- /* A spurious wakeup should never happen. */
- WARN_ON(!signal_pending(current));
+ /* We expect signal_pending(current), but another thread may
+ * have handled it for us already. */
return -EINTR;
- out_unqueue:
- /* If we were woken (and unqueued), we succeeded, whatever. */
- if (!unqueue_me(&q))
- ret = 0;
out_release_sem:
up_read(&current->mm->mmap_sem);
return ret;
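+
+/*
+ * An illustrative userspace sketch of the wait/wake protocol that the
+ * ordering comment in futex_wait() relies on (a hypothetical one-shot
+ * event flag; a real implementation would use atomic accesses):
+ *
+ *	#include <linux/futex.h>
+ *	#include <limits.h>
+ *	#include <sys/syscall.h>
+ *	#include <unistd.h>
+ *
+ *	int event;
+ *
+ *	void wait_for_event(void)
+ *	{
+ *		while (event == 0)
+ *			// Sleeps only if event was still 0 when queued;
+ *			// fails with EWOULDBLOCK if it changed first.
+ *			syscall(SYS_futex, &event, FUTEX_WAIT, 0, NULL);
+ *	}
+ *
+ *	void set_event(void)
+ *	{
+ *		event = 1;
+ *		syscall(SYS_futex, &event, FUTEX_WAKE, INT_MAX, NULL);
+ *	}
+ */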
int ret, err;
ret = -EINVAL;
- if (signal < 0 || signal > _NSIG)
+ if (!valid_signal(signal))
goto out;
ret = get_unused_fd();
filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
if (signal) {
- int err;
err = f_setown(filp, current->pid, 1);
if (err < 0) {
- put_unused_fd(ret);
- put_filp(filp);
- ret = err;
- goto out;
+ goto error;
}
filp->f_owner.signum = signal;
}
q = kmalloc(sizeof(*q), GFP_KERNEL);
if (!q) {
- put_unused_fd(ret);
- put_filp(filp);
- ret = -ENOMEM;
- goto out;
+ err = -ENOMEM;
+ goto error;
}
down_read(&current->mm->mmap_sem);
if (unlikely(err != 0)) {
up_read(&current->mm->mmap_sem);
- put_unused_fd(ret);
- put_filp(filp);
kfree(q);
- return err;
+ goto error;
}
/*
fd_install(ret, filp);
out:
return ret;
+error:
+ put_unused_fd(ret);
+ put_filp(filp);
+ ret = err;
+ goto out;
}
long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
case FUTEX_CMP_REQUEUE:
ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
break;
+ case FUTEX_WAKE_OP:
+ ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+ break;
default:
ret = -ENOSYS;
}
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
INIT_LIST_HEAD(&futex_queues[i].chain);
- futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&futex_queues[i].lock);
}
return 0;
}