linux 2.6.16.38 w/ vs2.0.3-rc1
[linux-2.6.git] / kernel / futex.c
index 0977ce9..94d7d77 100644 (file)
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
+#include <linux/signal.h>
+#include <asm/futex.h>
 
-#define FUTEX_HASHBITS 8
+#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
  * Futexes are matched on equal values of this key.
@@ -97,7 +99,6 @@ struct futex_q {
  */
 struct futex_hash_bucket {
        spinlock_t              lock;
-       unsigned int        nqueued;
        struct list_head       chain;
 };
 
@@ -200,23 +201,6 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
         * from swap.  But that's a lot of code to duplicate here
         * for a rare case, so we simply fetch the page.
         */
-
-       /*
-        * Do a quick atomic lookup first - this is the fastpath.
-        */
-       spin_lock(&current->mm->page_table_lock);
-       page = follow_page(mm, uaddr, 0);
-       if (likely(page != NULL)) {
-               key->shared.pgoff =
-                       page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-               spin_unlock(&current->mm->page_table_lock);
-               return 0;
-       }
-       spin_unlock(&current->mm->page_table_lock);
-
-       /*
-        * Do it the general way.
-        */
        err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
        if (err >= 0) {
                key->shared.pgoff =
@@ -265,7 +249,6 @@ static inline int get_futex_value_locked(int *dest, int __user *from)
        inc_preempt_count();
        ret = __copy_from_user_inatomic(dest, from, sizeof(int));
        dec_preempt_count();
-       preempt_check_resched();
 
        return ret ? -EFAULT : 0;
 }
@@ -287,7 +270,13 @@ static void wake_futex(struct futex_q *q)
        /*
         * The waiting task can free the futex_q as soon as this is written,
         * without taking any locks.  This must come last.
+        *
+        * A memory barrier is required here to prevent the following store
+        * to lock_ptr from getting ahead of the wakeup. Clearing the lock
+        * at the end of wake_up_all() does not prevent this store from
+        * moving.
         */
+       wmb();
        q->lock_ptr = NULL;
 }
 
@@ -327,6 +316,130 @@ out:
        return ret;
 }
 
+/*
+ * Wake up to nr_wake waiters on uaddr1; if the atomic op performed on the
+ * futex word at uaddr2 yields true, also wake up to nr_wake2 waiters on uaddr2:
+ */
+static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+{
+       union futex_key key1, key2;
+       struct futex_hash_bucket *bh1, *bh2;
+       struct list_head *head;
+       struct futex_q *this, *next;
+       int ret, op_ret, attempt = 0;
+
+retryfull:
+       down_read(&current->mm->mmap_sem);
+
+       ret = get_futex_key(uaddr1, &key1);
+       if (unlikely(ret != 0))
+               goto out;
+       ret = get_futex_key(uaddr2, &key2);
+       if (unlikely(ret != 0))
+               goto out;
+
+       bh1 = hash_futex(&key1);
+       bh2 = hash_futex(&key2);
+
+retry:
+       if (bh1 < bh2)
+               spin_lock(&bh1->lock);
+       spin_lock(&bh2->lock);
+       if (bh1 > bh2)
+               spin_lock(&bh1->lock);
+
+       op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+       if (unlikely(op_ret < 0)) {
+               int dummy;
+
+               spin_unlock(&bh1->lock);
+               if (bh1 != bh2)
+                       spin_unlock(&bh2->lock);
+
+#ifndef CONFIG_MMU
+               /* we don't get EFAULT from MMU faults if we don't have an MMU,
+                * but we might get them from range checking */
+               ret = op_ret;
+               goto out;
+#endif
+
+               if (unlikely(op_ret != -EFAULT)) {
+                       ret = op_ret;
+                       goto out;
+               }
+
+               /* futex_atomic_op_inuser needs to both read and write
+                * *(int __user *)uaddr2, but we can't modify it
+                * non-atomically.  Therefore, if get_user below is not
+                * enough, we need to handle the fault ourselves, while
+                * still holding the mmap_sem.  */
+               if (attempt++) {
+                       struct vm_area_struct * vma;
+                       struct mm_struct *mm = current->mm;
+
+                       ret = -EFAULT;
+                       if (attempt >= 2 ||
+                           !(vma = find_vma(mm, uaddr2)) ||
+                           vma->vm_start > uaddr2 ||
+                           !(vma->vm_flags & VM_WRITE))
+                               goto out;
+
+                       switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
+                       case VM_FAULT_MINOR:
+                               current->min_flt++;
+                               break;
+                       case VM_FAULT_MAJOR:
+                               current->maj_flt++;
+                               break;
+                       default:
+                               goto out;
+                       }
+                       goto retry;
+               }
+
+               /* If we would have faulted, release mmap_sem,
+                * fault it in and start all over again.  */
+               up_read(&current->mm->mmap_sem);
+
+               ret = get_user(dummy, (int __user *)uaddr2);
+               if (ret)
+                       return ret;
+
+               goto retryfull;
+       }
+
+       head = &bh1->chain;
+
+       list_for_each_entry_safe(this, next, head, list) {
+               if (match_futex (&this->key, &key1)) {
+                       wake_futex(this);
+                       if (++ret >= nr_wake)
+                               break;
+               }
+       }
+
+       if (op_ret > 0) {
+               head = &bh2->chain;
+
+               op_ret = 0;
+               list_for_each_entry_safe(this, next, head, list) {
+                       if (match_futex (&this->key, &key2)) {
+                               wake_futex(this);
+                               if (++op_ret >= nr_wake2)
+                                       break;
+                       }
+               }
+               ret += op_ret;
+       }
+
+       spin_unlock(&bh1->lock);
+       if (bh1 != bh2)
+               spin_unlock(&bh2->lock);
+out:
+       up_read(&current->mm->mmap_sem);
+       return ret;
+}
+
 /*
  * Requeue all waiters hashed on one physical page to another
  * physical page.
@@ -339,7 +452,6 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
        struct list_head *head1;
        struct futex_q *this, *next;
        int ret, drop_count = 0;
-       unsigned int nqueued;
 
  retry:
        down_read(&current->mm->mmap_sem);
@@ -354,23 +466,22 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
        bh1 = hash_futex(&key1);
        bh2 = hash_futex(&key2);
 
-       nqueued = bh1->nqueued;
+       if (bh1 < bh2)
+               spin_lock(&bh1->lock);
+       spin_lock(&bh2->lock);
+       if (bh1 > bh2)
+               spin_lock(&bh1->lock);
+
        if (likely(valp != NULL)) {
                int curval;
 
-               /* In order to avoid doing get_user while
-                  holding bh1->lock and bh2->lock, nqueued
-                  (monotonically increasing field) must be first
-                  read, then *uaddr1 fetched from userland and
-                  after acquiring lock nqueued field compared with
-                  the stored value.  The smp_mb () below
-                  makes sure that bh1->nqueued is read from memory
-                  before *uaddr1.  */
-               smp_mb();
-
                ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
 
                if (unlikely(ret)) {
+                       spin_unlock(&bh1->lock);
+                       if (bh1 != bh2)
+                               spin_unlock(&bh2->lock);
+
                        /* If we would have faulted, release mmap_sem, fault
                         * it in and start all over again.
                         */
@@ -385,21 +496,10 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
                }
                if (curval != *valp) {
                        ret = -EAGAIN;
-                       goto out;
+                       goto out_unlock;
                }
        }
 
-       if (bh1 < bh2)
-               spin_lock(&bh1->lock);
-       spin_lock(&bh2->lock);
-       if (bh1 > bh2)
-               spin_lock(&bh1->lock);
-
-       if (unlikely(nqueued != bh1->nqueued && valp != NULL)) {
-               ret = -EAGAIN;
-               goto out_unlock;
-       }
-
        head1 = &bh1->chain;
        list_for_each_entry_safe(this, next, head1, list) {
                if (!match_futex (&this->key, &key1))
@@ -435,13 +535,9 @@ out:
        return ret;
 }
 
-/*
- * queue_me and unqueue_me must be called as a pair, each
- * exactly once.  They are called with the hashed spinlock held.
- */
-
 /* The key must be already stored in q->key. */
-static void queue_me(struct futex_q *q, int fd, struct file *filp)
+static inline struct futex_hash_bucket *
+queue_lock(struct futex_q *q, int fd, struct file *filp)
 {
        struct futex_hash_bucket *bh;
 
@@ -455,11 +551,35 @@ static void queue_me(struct futex_q *q, int fd, struct file *filp)
        q->lock_ptr = &bh->lock;
 
        spin_lock(&bh->lock);
-       bh->nqueued++;
+       return bh;
+}
+
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+{
        list_add_tail(&q->list, &bh->chain);
        spin_unlock(&bh->lock);
 }
 
+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+{
+       spin_unlock(&bh->lock);
+       drop_key_refs(&q->key);
+}
+
+/*
+ * queue_me and unqueue_me must be called as a pair, each
+ * exactly once.  They are called with the hashed spinlock held.
+ */
+
+/* The key must be already stored in q->key. */
+static void queue_me(struct futex_q *q, int fd, struct file *filp)
+{
+       struct futex_hash_bucket *bh;
+       bh = queue_lock(q, fd, filp);
+       __queue_me(q, bh);
+}
+
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
@@ -469,6 +589,7 @@ static int unqueue_me(struct futex_q *q)
        /* In the common case we don't take the spinlock, which is nice. */
  retry:
        lock_ptr = q->lock_ptr;
+       barrier();
        if (lock_ptr != 0) {
                spin_lock(lock_ptr);
                /*
@@ -503,6 +624,7 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
        DECLARE_WAITQUEUE(wait, current);
        int ret, curval;
        struct futex_q q;
+       struct futex_hash_bucket *bh;
 
  retry:
        down_read(&current->mm->mmap_sem);
@@ -511,7 +633,7 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
        if (unlikely(ret != 0))
                goto out_release_sem;
 
-       queue_me(&q, -1, NULL);
+       bh = queue_lock(&q, -1, NULL);
 
        /*
         * Access the page AFTER the futex is queued.
@@ -537,14 +659,13 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
        ret = get_futex_value_locked(&curval, (int __user *)uaddr);
 
        if (unlikely(ret)) {
+               queue_unlock(&q, bh);
+
                /* If we would have faulted, release mmap_sem, fault it in and
                 * start all over again.
                 */
                up_read(&current->mm->mmap_sem);
 
-               if (!unqueue_me(&q)) /* There's a chance we got woken already */
-                       return 0;
-
                ret = get_user(curval, (int __user *)uaddr);
 
                if (!ret)
@@ -553,9 +674,13 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
        }
        if (curval != val) {
                ret = -EWOULDBLOCK;
-               goto out_unqueue;
+               queue_unlock(&q, bh);
+               goto out_release_sem;
        }
 
+       /* Only actually queue if *uaddr contained val.  */
+       __queue_me(&q, bh);
+
        /*
         * Now the futex is queued and we have checked the data, we
         * don't want to hold mmap_sem while we sleep.
@@ -596,10 +721,6 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
         * have handled it for us already. */
        return -EINTR;
 
- out_unqueue:
-       /* If we were woken (and unqueued), we succeeded, whatever. */
-       if (!unqueue_me(&q))
-               ret = 0;
  out_release_sem:
        up_read(&current->mm->mmap_sem);
        return ret;
@@ -649,7 +770,7 @@ static int futex_fd(unsigned long uaddr, int signal)
        int ret, err;
 
        ret = -EINVAL;
-       if (signal < 0 || signal > _NSIG)
+       if (!valid_signal(signal))
                goto out;
 
        ret = get_unused_fd();
@@ -667,23 +788,17 @@ static int futex_fd(unsigned long uaddr, int signal)
        filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
 
        if (signal) {
-               int err;
                err = f_setown(filp, current->pid, 1);
                if (err < 0) {
-                       put_unused_fd(ret);
-                       put_filp(filp);
-                       ret = err;
-                       goto out;
+                       goto error;
                }
                filp->f_owner.signum = signal;
        }
 
        q = kmalloc(sizeof(*q), GFP_KERNEL);
        if (!q) {
-               put_unused_fd(ret);
-               put_filp(filp);
-               ret = -ENOMEM;
-               goto out;
+               err = -ENOMEM;
+               goto error;
        }
 
        down_read(&current->mm->mmap_sem);
@@ -691,10 +806,8 @@ static int futex_fd(unsigned long uaddr, int signal)
 
        if (unlikely(err != 0)) {
                up_read(&current->mm->mmap_sem);
-               put_unused_fd(ret);
-               put_filp(filp);
                kfree(q);
-               return err;
+               goto error;
        }
 
        /*
@@ -710,6 +823,11 @@ static int futex_fd(unsigned long uaddr, int signal)
        fd_install(ret, filp);
 out:
        return ret;
+error:
+       put_unused_fd(ret);
+       put_filp(filp);
+       ret = err;
+       goto out;
 }
 
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -734,6 +852,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
        case FUTEX_CMP_REQUEUE:
                ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
                break;
+       case FUTEX_WAKE_OP:
+               ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+               break;
        default:
                ret = -ENOSYS;
        }