* (C) Copyright 2003 Red Hat Inc, All Rights Reserved
*
* Removed page pinning, fix privately mapped COW pages and other cleanups
- * (C) Copyright 2003 Jamie Lokier
+ * (C) Copyright 2003, 2004 Jamie Lokier
*
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
+#include <linux/syscalls.h>
#define FUTEX_HASHBITS 8
*/
struct futex_hash_bucket {
spinlock_t lock;
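+	/* Monotonically increasing count of enqueue operations on this
+	 * bucket, used by futex_requeue() to detect waiters arriving
+	 * between its unlocked userspace read and taking the lock. */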
+ unsigned int nqueued;
struct list_head chain;
};
}
}
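+/*
+ * Fetch an int from userspace without sleeping: with the preempt
+ * count raised, a page fault makes __copy_from_user_inatomic() fail
+ * instead of blocking, so this can be called while holding a hash
+ * bucket spinlock.  On -EFAULT the callers below drop their locks,
+ * fault the page in with get_user() and retry.
+ */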
+static inline int get_futex_value_locked(int *dest, int __user *from)
+{
+ int ret;
+
+ inc_preempt_count();
+ ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+ dec_preempt_count();
+ preempt_check_resched();
+
+ return ret ? -EFAULT : 0;
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
* The waiting task can free the futex_q as soon as this is written,
* without taking any locks. This must come last.
*/
- q->lock_ptr = 0;
+ q->lock_ptr = NULL;
}
/*
* physical page.
*/
static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
- int nr_wake, int nr_requeue)
+ int nr_wake, int nr_requeue, int *valp)
{
union futex_key key1, key2;
struct futex_hash_bucket *bh1, *bh2;
struct list_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
+ unsigned int nqueued;
+ retry:
down_read(&current->mm->mmap_sem);
ret = get_futex_key(uaddr1, &key1);
bh1 = hash_futex(&key1);
bh2 = hash_futex(&key2);
+ nqueued = bh1->nqueued;
+ if (likely(valp != NULL)) {
+ int curval;
+
+		/* To avoid doing get_user() while holding bh1->lock and
+		 * bh2->lock: read nqueued (a monotonically increasing
+		 * field) first, then fetch *uaddr1 from userland, and
+		 * finally, after taking the locks, compare nqueued with
+		 * the value read before.  If it changed, a new waiter may
+		 * have been queued on this bucket in the meantime and the
+		 * comparison is stale, so fail with -EAGAIN.  The smp_mb()
+		 * below makes sure that bh1->nqueued is read from memory
+		 * before *uaddr1. */
+ smp_mb();
+
+ ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+
+ if (unlikely(ret)) {
+ /* If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+			up_read(&current->mm->mmap_sem);
+
+ ret = get_user(curval, (int __user *)uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *valp) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
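+	/* Lock both buckets in address order to avoid deadlock; if the
+	 * futexes hash to the same bucket, take its lock only once. */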
if (bh1 < bh2)
spin_lock(&bh1->lock);
spin_lock(&bh2->lock);
if (bh1 > bh2)
spin_lock(&bh1->lock);
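+	/* A changed nqueued means someone queued on this bucket after we
+	 * sampled *uaddr1 above, so the comparison against *valp may be
+	 * stale; make userspace re-read the futex value and retry. */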
+ if (unlikely(nqueued != bh1->nqueued && valp != NULL)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
head1 = &bh1->chain;
list_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
}
}
+out_unlock:
spin_unlock(&bh1->lock);
if (bh1 != bh2)
spin_unlock(&bh2->lock);
q->lock_ptr = &bh->lock;
spin_lock(&bh->lock);
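+	/* Advertise this enqueue to concurrent requeue operations, which
+	 * compare nqueued snapshots around their unlocked userspace read. */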
+ bh->nqueued++;
list_add_tail(&q->list, &bh->chain);
spin_unlock(&bh->lock);
}
int ret, curval;
struct futex_q q;
+ retry:
down_read(&current->mm->mmap_sem);
ret = get_futex_key(uaddr, &q.key);
queue_me(&q, -1, NULL);
/*
- * Access the page after the futex is queued.
+ * Access the page AFTER the futex is queued.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we queued after testing *uaddr, that would open
+ * a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * A consequence is that futex_wait() can return zero and absorb
+ * a wakeup when *uaddr != val on entry to the syscall. This is
+ * rare, but normal.
+ *
* We hold the mmap semaphore, so the mapping cannot have changed
- * since we looked it up.
+ * since we looked it up in get_futex_key.
*/
- if (get_user(curval, (int *)uaddr) != 0) {
- ret = -EFAULT;
- goto out_unqueue;
+
+ ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+
+ if (unlikely(ret)) {
+ /* If we would have faulted, release mmap_sem, fault it in and
+ * start all over again.
+ */
+		up_read(&current->mm->mmap_sem);
+
+ if (!unqueue_me(&q)) /* There's a chance we got woken already */
+ return 0;
+
+ ret = get_user(curval, (int __user *)uaddr);
+
+ if (!ret)
+ goto retry;
+ return ret;
}
if (curval != val) {
ret = -EWOULDBLOCK;
return 0;
if (time == 0)
return -ETIMEDOUT;
- /* A spurious wakeup should never happen. */
- WARN_ON(!signal_pending(current));
+ /* We expect signal_pending(current), but another thread may
+ * have handled it for us already. */
return -EINTR;
out_unqueue:
}
long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
- unsigned long uaddr2, int val2)
+ unsigned long uaddr2, int val2, int val3)
{
int ret;
ret = futex_fd(uaddr, val);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2);
+ ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+ break;
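+	/* As FUTEX_REQUEUE, but fail with -EAGAIN unless *uaddr still
+	 * holds val3 at requeue time. */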
+ case FUTEX_CMP_REQUEUE:
+ ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
break;
default:
ret = -ENOSYS;
asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
- struct timespec __user *utime, u32 __user *uaddr2)
+ struct timespec __user *utime, u32 __user *uaddr2,
+ int val3)
{
struct timespec t;
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
/*
- * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+ * requeue parameter in 'utime' if op >= FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE)
- val2 = (int) (long) utime;
+ if (op >= FUTEX_REQUEUE)
+ val2 = (int) (unsigned long) utime;
return do_futex((unsigned long)uaddr, op, val, timeout,
- (unsigned long)uaddr2, val2);
+ (unsigned long)uaddr2, val2, val3);
}
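+/*
+ * Illustrative sketch only, not part of this patch: a condvar-style
+ * broadcast in userspace could wake one waiter and requeue the rest
+ * onto the mutex word, retrying with a freshly read value whenever
+ * the kernel reports EAGAIN (the futex changed under us).  The cond
+ * and mutex fields below are hypothetical; note that nr_requeue is
+ * passed in the timeout argument slot, as decoded above.
+ *
+ *	for (;;) {
+ *		int val = cond->futex;
+ *		if (syscall(__NR_futex, &cond->futex, FUTEX_CMP_REQUEUE,
+ *			    1, (unsigned long) INT_MAX, &mutex->futex,
+ *			    val) != -1 || errno != EAGAIN)
+ *			break;
+ *	}
+ */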
static struct super_block *
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
INIT_LIST_HEAD(&futex_queues[i].chain);
- futex_queues[i].lock = SPIN_LOCK_UNLOCKED;
+ spin_lock_init(&futex_queues[i].lock);
}
return 0;
}