X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Ffutex.c;h=c27bb560b469052fcbd579e6ed0d04c782c5ba0b;hb=9464c7cf61b9433057924c36e6e02f303a00e768;hp=9d260e838cffdca6f951d6625626ab9d62f19d81;hpb=41689045f6a3cbe0550e1d34e9cc20d2e8c432ba;p=linux-2.6.git

diff --git a/kernel/futex.c b/kernel/futex.c
index 9d260e838..c27bb560b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,10 +12,6 @@
  * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  *
- * PI-futex support started by Ingo Molnar and Thomas Gleixner
- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner
- *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -48,10 +44,9 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
-#include "rtmutex_common.h"
-
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -69,7 +64,7 @@ union futex_key {
 		int offset;
 	} shared;
 	struct {
-		unsigned long address;
+		unsigned long uaddr;
 		struct mm_struct *mm;
 		int offset;
 	} private;
@@ -80,27 +75,6 @@ union futex_key {
 	} both;
 };
 
-/*
- * Priority Inheritance state:
- */
-struct futex_pi_state {
-	/*
-	 * list of 'owned' pi_state instances - these have to be
-	 * cleaned up in do_exit() if the task exits prematurely:
-	 */
-	struct list_head list;
-
-	/*
-	 * The PI object:
-	 */
-	struct rt_mutex pi_mutex;
-
-	struct task_struct *owner;
-	atomic_t refcount;
-
-	union futex_key key;
-};
-
 /*
  * We use this hashed waitqueue instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
@@ -114,19 +88,15 @@ struct futex_q {
 	struct list_head list;
 	wait_queue_head_t waiters;
 
-	/* Which hash list lock to use: */
+	/* Which hash list lock to use. */
 	spinlock_t *lock_ptr;
 
-	/* Key which the futex is hashed on: */
+	/* Key which the futex is hashed on. */
 	union futex_key key;
 
-	/* For fd, sigio sent using these: */
+	/* For fd, sigio sent using these. */
 	int fd;
 	struct file *filp;
-
-	/* Optional priority inheritance state: */
-	struct futex_pi_state *pi_state;
-	struct task_struct *task;
 };
 
 /*
@@ -175,9 +145,8 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
  *
  * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
  */
-static int get_futex_key(u32 __user *uaddr, union futex_key *key)
+static int get_futex_key(unsigned long uaddr, union futex_key *key)
 {
-	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct page *page;
@@ -186,16 +155,16 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
-	key->both.offset = address % PAGE_SIZE;
+	key->both.offset = uaddr % PAGE_SIZE;
 	if (unlikely((key->both.offset % sizeof(u32)) != 0))
 		return -EINVAL;
-	address -= key->both.offset;
+	uaddr -= key->both.offset;
 
 	/*
 	 * The futex is hashed differently depending on whether
 	 * it's in a shared or private mapping. So check vma first.
 	 */
-	vma = find_extend_vma(mm, address);
+	vma = find_extend_vma(mm, uaddr);
 	if (unlikely(!vma))
 		return -EFAULT;
 
@@ -216,7 +185,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 */
 	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
 		key->private.mm = mm;
-		key->private.address = address;
+		key->private.uaddr = uaddr;
 		return 0;
 	}
 
@@ -226,7 +195,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	key->shared.inode = vma->vm_file->f_dentry->d_inode;
 	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+		key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
 				     + vma->vm_pgoff);
 		return 0;
 	}
@@ -237,7 +206,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * from swap. But that's a lot of code to duplicate here
 	 * for a rare case, so we simply fetch the page.
	 */
-	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
+	err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
 	if (err >= 0) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -278,258 +247,17 @@ static void drop_key_refs(union futex_key *key)
 	}
 }
 
-static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
+static inline int get_futex_value_locked(int *dest, int __user *from)
 {
 	int ret;
 
 	inc_preempt_count();
-	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+	ret = __copy_from_user_inatomic(dest, from, sizeof(int));
 	dec_preempt_count();
 
 	return ret ? -EFAULT : 0;
 }
 
-/*
- * Fault handling. Called with current->mm->mmap_sem held.
- */
-static int futex_handle_fault(unsigned long address, int attempt)
-{
-	struct vm_area_struct * vma;
-	struct mm_struct *mm = current->mm;
-
-	if (attempt > 2 || !(vma = find_vma(mm, address)) ||
-	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
-		return -EFAULT;
-
-	switch (handle_mm_fault(mm, vma, address, 1)) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	default:
-		return -EFAULT;
-	}
-	return 0;
-}
-
-/*
- * PI code:
- */
-static int refill_pi_state_cache(void)
-{
-	struct futex_pi_state *pi_state;
-
-	if (likely(current->pi_state_cache))
-		return 0;
-
-	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
-
-	if (!pi_state)
-		return -ENOMEM;
-
-	memset(pi_state, 0, sizeof(*pi_state));
-	INIT_LIST_HEAD(&pi_state->list);
-	/* pi_mutex gets initialized later */
-	pi_state->owner = NULL;
-	atomic_set(&pi_state->refcount, 1);
-
-	current->pi_state_cache = pi_state;
-
-	return 0;
-}
-
-static struct futex_pi_state * alloc_pi_state(void)
-{
-	struct futex_pi_state *pi_state = current->pi_state_cache;
-
-	WARN_ON(!pi_state);
-	current->pi_state_cache = NULL;
-
-	return pi_state;
-}
-
-static void free_pi_state(struct futex_pi_state *pi_state)
-{
-	if (!atomic_dec_and_test(&pi_state->refcount))
-		return;
-
-	/*
-	 * If pi_state->owner is NULL, the owner is most probably dying
-	 * and has cleaned up the pi_state already
-	 */
-	if (pi_state->owner) {
-		spin_lock_irq(&pi_state->owner->pi_lock);
-		list_del_init(&pi_state->list);
-		spin_unlock_irq(&pi_state->owner->pi_lock);
-
-		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
-	}
-
-	if (current->pi_state_cache)
-		kfree(pi_state);
-	else {
-		/*
-		 * pi_state->list is already empty.
-		 * clear pi_state->owner.
-		 * refcount is at 0 - put it back to 1.
-		 */
-		pi_state->owner = NULL;
-		atomic_set(&pi_state->refcount, 1);
-		current->pi_state_cache = pi_state;
-	}
-}
-
-/*
- * Look up the task based on what TID userspace gave us.
- * We dont trust it.
- */
-static struct task_struct * futex_find_get_task(pid_t pid)
-{
-	struct task_struct *p;
-
-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	if (!p)
-		goto out_unlock;
-	if ((current->euid != p->euid) && (current->euid != p->uid)) {
-		p = NULL;
-		goto out_unlock;
-	}
-	if (p->exit_state != 0) {
-		p = NULL;
-		goto out_unlock;
-	}
-	get_task_struct(p);
-out_unlock:
-	read_unlock(&tasklist_lock);
-
-	return p;
-}
-
-/*
- * This task is holding PI mutexes at exit time => bad.
- * Kernel cleans up PI-state, but userspace is likely hosed.
- * (Robust-futex cleanup is separate and might save the day for userspace.)
- */
-void exit_pi_state_list(struct task_struct *curr)
-{
-	struct list_head *next, *head = &curr->pi_state_list;
-	struct futex_pi_state *pi_state;
-	struct futex_hash_bucket *hb;
-	union futex_key key;
-
-	/*
-	 * We are a ZOMBIE and nobody can enqueue itself on
-	 * pi_state_list anymore, but we have to be careful
-	 * versus waiters unqueueing themselves:
-	 */
-	spin_lock_irq(&curr->pi_lock);
-	while (!list_empty(head)) {
-
-		next = head->next;
-		pi_state = list_entry(next, struct futex_pi_state, list);
-		key = pi_state->key;
-		hb = hash_futex(&key);
-		spin_unlock_irq(&curr->pi_lock);
-
-		spin_lock(&hb->lock);
-
-		spin_lock_irq(&curr->pi_lock);
-		/*
-		 * We dropped the pi-lock, so re-check whether this
-		 * task still owns the PI-state:
-		 */
-		if (head->next != next) {
-			spin_unlock(&hb->lock);
-			continue;
-		}
-
-		WARN_ON(pi_state->owner != curr);
-		WARN_ON(list_empty(&pi_state->list));
-		list_del_init(&pi_state->list);
-		pi_state->owner = NULL;
-		spin_unlock_irq(&curr->pi_lock);
-
-		rt_mutex_unlock(&pi_state->pi_mutex);
-
-		spin_unlock(&hb->lock);
-
-		spin_lock_irq(&curr->pi_lock);
-	}
-	spin_unlock_irq(&curr->pi_lock);
-}
-
-static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
-{
-	struct futex_pi_state *pi_state = NULL;
-	struct futex_q *this, *next;
-	struct list_head *head;
-	struct task_struct *p;
-	pid_t pid;
-
-	head = &hb->chain;
-
-	list_for_each_entry_safe(this, next, head, list) {
-		if (match_futex(&this->key, &me->key)) {
-			/*
-			 * Another waiter already exists - bump up
-			 * the refcount and return its pi_state:
-			 */
-			pi_state = this->pi_state;
-			/*
-			 * Userspace might have messed up non PI and PI futexes
-			 */
-			if (unlikely(!pi_state))
-				return -EINVAL;
-
-			WARN_ON(!atomic_read(&pi_state->refcount));
-
-			atomic_inc(&pi_state->refcount);
-			me->pi_state = pi_state;
-
-			return 0;
-		}
-	}
-
-	/*
-	 * We are the first waiter - try to look up the real owner and attach
-	 * the new pi_state to it, but bail out when the owner died bit is set
-	 * and TID = 0:
-	 */
-	pid = uval & FUTEX_TID_MASK;
-	if (!pid && (uval & FUTEX_OWNER_DIED))
-		return -ESRCH;
-	p = futex_find_get_task(pid);
-	if (!p)
-		return -ESRCH;
-
-	pi_state = alloc_pi_state();
-
-	/*
-	 * Initialize the pi_mutex in locked state and make 'p'
-	 * the owner of it:
-	 */
-	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-
-	/* Store the key for possible exit cleanups: */
-	pi_state->key = me->key;
-
-	spin_lock_irq(&p->pi_lock);
-	WARN_ON(!list_empty(&pi_state->list));
-	list_add(&pi_state->list, &p->pi_state_list);
-	pi_state->owner = p;
-	spin_unlock_irq(&p->pi_lock);
-
-	put_task_struct(p);
-
-	me->pi_state = pi_state;
-
-	return 0;
-}
-
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
@@ -557,105 +285,16 @@ static void wake_futex(struct futex_q *q)
 	q->lock_ptr = NULL;
 }
 
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
-{
-	struct task_struct *new_owner;
-	struct futex_pi_state *pi_state = this->pi_state;
-	u32 curval, newval;
-
-	if (!pi_state)
-		return -EINVAL;
-
-	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-
-	/*
-	 * This happens when we have stolen the lock and the original
-	 * pending owner did not enqueue itself back on the rt_mutex.
-	 * Thats not a tragedy. We know that way, that a lock waiter
-	 * is on the fly. We make the futex_q waiter the pending owner.
-	 */
-	if (!new_owner)
-		new_owner = this->task;
-
-	/*
-	 * We pass it to the next owner. (The WAITERS bit is always
-	 * kept enabled while there is PI state around. We must also
-	 * preserve the owner died bit.)
-	 */
-	if (!(uval & FUTEX_OWNER_DIED)) {
-		newval = FUTEX_WAITERS | new_owner->pid;
-
-		inc_preempt_count();
-		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-		dec_preempt_count();
-		if (curval == -EFAULT)
-			return -EFAULT;
-		if (curval != uval)
-			return -EINVAL;
-	}
-
-	spin_lock_irq(&pi_state->owner->pi_lock);
-	WARN_ON(list_empty(&pi_state->list));
-	list_del_init(&pi_state->list);
-	spin_unlock_irq(&pi_state->owner->pi_lock);
-
-	spin_lock_irq(&new_owner->pi_lock);
-	WARN_ON(!list_empty(&pi_state->list));
-	list_add(&pi_state->list, &new_owner->pi_state_list);
-	pi_state->owner = new_owner;
-	spin_unlock_irq(&new_owner->pi_lock);
-
-	rt_mutex_unlock(&pi_state->pi_mutex);
-
-	return 0;
-}
-
-static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
-{
-	u32 oldval;
-
-	/*
-	 * There is no waiter, so we unlock the futex. The owner died
-	 * bit has not to be preserved here. We are the owner:
-	 */
-	inc_preempt_count();
-	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
-	dec_preempt_count();
-
-	if (oldval == -EFAULT)
-		return oldval;
-	if (oldval != uval)
-		return -EAGAIN;
-
-	return 0;
-}
-
-/*
- * Express the locking dependencies for lockdep:
- */
-static inline void
-double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
-	if (hb1 <= hb2) {
-		spin_lock(&hb1->lock);
-		if (hb1 < hb2)
-			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
-	} else { /* hb1 > hb2 */
-		spin_lock(&hb2->lock);
-		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
-	}
-}
-
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(unsigned long uaddr, int nr_wake)
 {
-	struct futex_hash_bucket *hb;
-	struct futex_q *this, *next;
-	struct list_head *head;
 	union futex_key key;
+	struct futex_hash_bucket *bh;
+	struct list_head *head;
+	struct futex_q *this, *next;
 	int ret;
 
 	down_read(&current->mm->mmap_sem);
@@ -664,23 +303,19 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 	if (unlikely(ret != 0))
 		goto out;
 
-	hb = hash_futex(&key);
-	spin_lock(&hb->lock);
-	head = &hb->chain;
+	bh = hash_futex(&key);
+	spin_lock(&bh->lock);
+	head = &bh->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
-			if (this->pi_state) {
-				ret = -EINVAL;
-				break;
-			}
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
 		}
 	}
 
-	spin_unlock(&hb->lock);
+	spin_unlock(&bh->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -690,12 +325,10 @@ out:
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
-	      int nr_wake, int nr_wake2, int op)
+static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1, key2;
-	struct futex_hash_bucket *hb1, *hb2;
+	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret, attempt = 0;
@@ -710,25 +343,27 @@ retryfull:
 	if (unlikely(ret != 0))
 		goto out;
 
-	hb1 = hash_futex(&key1);
-	hb2 = hash_futex(&key2);
+	bh1 = hash_futex(&key1);
+	bh2 = hash_futex(&key2);
 
 retry:
-	double_lock_hb(hb1, hb2);
+	if (bh1 < bh2)
+		spin_lock(&bh1->lock);
+	spin_lock(&bh2->lock);
+	if (bh1 > bh2)
+		spin_lock(&bh1->lock);
 
-	op_ret = futex_atomic_op_inuser(op, uaddr2);
+	op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
 	if (unlikely(op_ret < 0)) {
-		u32 dummy;
+		int dummy;
 
-		spin_unlock(&hb1->lock);
-		if (hb1 != hb2)
-			spin_unlock(&hb2->lock);
+		spin_unlock(&bh1->lock);
+		if (bh1 != bh2)
+			spin_unlock(&bh2->lock);
 
 #ifndef CONFIG_MMU
-		/*
-		 * we don't get EFAULT from MMU faults if we don't have an MMU,
-		 * but we might get them from range checking
-		 */
+		/* we don't get EFAULT from MMU faults if we don't have an MMU,
+		 * but we might get them from range checking */
 		ret = op_ret;
 		goto out;
 #endif
@@ -738,36 +373,47 @@ retry:
 			goto out;
 		}
 
-		/*
-		 * futex_atomic_op_inuser needs to both read and write
+		/* futex_atomic_op_inuser needs to both read and write
 		 * *(int __user *)uaddr2, but we can't modify it
 		 * non-atomically. Therefore, if get_user below is not
 		 * enough, we need to handle the fault ourselves, while
-		 * still holding the mmap_sem.
-		 */
+		 * still holding the mmap_sem. */
 		if (attempt++) {
-			if (futex_handle_fault((unsigned long)uaddr2,
-					       attempt)) {
-				ret = -EFAULT;
+			struct vm_area_struct * vma;
+			struct mm_struct *mm = current->mm;
+
+			ret = -EFAULT;
+			if (attempt >= 2 ||
+			    !(vma = find_vma(mm, uaddr2)) ||
+			    vma->vm_start > uaddr2 ||
+			    !(vma->vm_flags & VM_WRITE))
+				goto out;
+
+			switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
+			case VM_FAULT_MINOR:
+				current->min_flt++;
+				break;
+			case VM_FAULT_MAJOR:
+				current->maj_flt++;
+				break;
+			default:
 				goto out;
 			}
 			goto retry;
 		}
 
-		/*
-		 * If we would have faulted, release mmap_sem,
-		 * fault it in and start all over again.
-		 */
+		/* If we would have faulted, release mmap_sem,
		 * fault it in and start all over again. */
 		up_read(&current->mm->mmap_sem);
 
-		ret = get_user(dummy, uaddr2);
+		ret = get_user(dummy, (int __user *)uaddr2);
 		if (ret)
 			return ret;
 
 		goto retryfull;
 	}
 
-	head = &hb1->chain;
+	head = &bh1->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key1)) {
@@ -778,7 +424,7 @@ retry:
 	}
 
 	if (op_ret > 0) {
-		head = &hb2->chain;
+		head = &bh2->chain;
 
 		op_ret = 0;
 		list_for_each_entry_safe(this, next, head, list) {
@@ -791,9 +437,9 @@ retry:
 		ret += op_ret;
 	}
 
-	spin_unlock(&hb1->lock);
-	if (hb1 != hb2)
-		spin_unlock(&hb2->lock);
+	spin_unlock(&bh1->lock);
+	if (bh1 != bh2)
+		spin_unlock(&bh2->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -803,11 +449,11 @@ out:
  * Requeue all waiters hashed on one physical page to another
 * physical page.
 */
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
-			 int nr_wake, int nr_requeue, u32 *cmpval)
+static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
+			 int nr_wake, int nr_requeue, int *valp)
 {
 	union futex_key key1, key2;
-	struct futex_hash_bucket *hb1, *hb2;
+	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
@@ -822,68 +468,68 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
 	if (unlikely(ret != 0))
 		goto out;
 
-	hb1 = hash_futex(&key1);
-	hb2 = hash_futex(&key2);
+	bh1 = hash_futex(&key1);
+	bh2 = hash_futex(&key2);
 
-	double_lock_hb(hb1, hb2);
+	if (bh1 < bh2)
+		spin_lock(&bh1->lock);
+	spin_lock(&bh2->lock);
+	if (bh1 > bh2)
+		spin_lock(&bh1->lock);
 
-	if (likely(cmpval != NULL)) {
-		u32 curval;
+	if (likely(valp != NULL)) {
+		int curval;
 
-		ret = get_futex_value_locked(&curval, uaddr1);
+		ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
 
 		if (unlikely(ret)) {
-			spin_unlock(&hb1->lock);
-			if (hb1 != hb2)
-				spin_unlock(&hb2->lock);
+			spin_unlock(&bh1->lock);
+			if (bh1 != bh2)
+				spin_unlock(&bh2->lock);
 
-			/*
-			 * If we would have faulted, release mmap_sem, fault
+			/* If we would have faulted, release mmap_sem, fault
			 * it in and start all over again.
			 */
 			up_read(&current->mm->mmap_sem);
 
-			ret = get_user(curval, uaddr1);
+			ret = get_user(curval, (int __user *)uaddr1);
 
 			if (!ret)
 				goto retry;
 
 			return ret;
 		}
-		if (curval != *cmpval) {
+		if (curval != *valp) {
 			ret = -EAGAIN;
 			goto out_unlock;
 		}
 	}
 
-	head1 = &hb1->chain;
+	head1 = &bh1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
 			continue;
 		if (++ret <= nr_wake) {
 			wake_futex(this);
 		} else {
-			/*
-			 * If key1 and key2 hash to the same bucket, no need to
-			 * requeue.
-			 */
-			if (likely(head1 != &hb2->chain)) {
-				list_move_tail(&this->list, &hb2->chain);
-				this->lock_ptr = &hb2->lock;
-			}
+			list_move_tail(&this->list, &bh2->chain);
+			this->lock_ptr = &bh2->lock;
 			this->key = key2;
 			get_key_refs(&key2);
 			drop_count++;
 
 			if (ret - nr_wake >= nr_requeue)
 				break;
+			/* Make sure to stop if key1 == key2 */
+			if (head1 == &bh2->chain && head1 != &next->list)
+				head1 = &this->list;
 		}
 	}
 
out_unlock:
-	spin_unlock(&hb1->lock);
-	if (hb1 != hb2)
-		spin_unlock(&hb2->lock);
+	spin_unlock(&bh1->lock);
+	if (bh1 != bh2)
+		spin_unlock(&bh2->lock);
 
 	/* drop_key_refs() must be called outside the spinlocks. */
 	while (--drop_count >= 0)
@@ -898,7 +544,7 @@ out:
 static inline struct futex_hash_bucket *
 queue_lock(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *hb;
+	struct futex_hash_bucket *bh;
 
 	q->fd = fd;
 	q->filp = filp;
@@ -906,24 +552,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
 	init_waitqueue_head(&q->waiters);
 
 	get_key_refs(&q->key);
-	hb = hash_futex(&q->key);
-	q->lock_ptr = &hb->lock;
+	bh = hash_futex(&q->key);
+	q->lock_ptr = &bh->lock;
 
-	spin_lock(&hb->lock);
-	return hb;
+	spin_lock(&bh->lock);
+	return bh;
 }
 
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
 {
-	list_add_tail(&q->list, &hb->chain);
-	q->task = current;
-	spin_unlock(&hb->lock);
+	list_add_tail(&q->list, &bh->chain);
+	spin_unlock(&bh->lock);
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
 {
-	spin_unlock(&hb->lock);
+	spin_unlock(&bh->lock);
 	drop_key_refs(&q->key);
 }
@@ -935,17 +580,16 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
 /* The key must be already stored in q->key. */
 static void queue_me(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *hb;
-
-	hb = queue_lock(q, fd, filp);
-	__queue_me(q, hb);
+	struct futex_hash_bucket *bh;
+	bh = queue_lock(q, fd, filp);
+	__queue_me(q, bh);
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
-	spinlock_t *lock_ptr;
 	int ret = 0;
+	spinlock_t *lock_ptr;
 
 	/* In the common case we don't take the spinlock, which is nice. */
retry:
@@ -972,9 +616,6 @@ static int unqueue_me(struct futex_q *q)
 		}
 		WARN_ON(list_empty(&q->list));
 		list_del(&q->list);
-
-		BUG_ON(q->pi_state);
-
 		spin_unlock(lock_ptr);
 		ret = 1;
 	}
@@ -983,42 +624,21 @@ static int unqueue_me(struct futex_q *q)
 	return ret;
 }
 
-/*
- * PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
- */
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
-{
-	WARN_ON(list_empty(&q->list));
-	list_del(&q->list);
-
-	BUG_ON(!q->pi_state);
-	free_pi_state(q->pi_state);
-	q->pi_state = NULL;
-
-	spin_unlock(&hb->lock);
-
-	drop_key_refs(&q->key);
-}
-
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+static int futex_wait(unsigned long uaddr, int val, unsigned long time)
 {
-	struct task_struct *curr = current;
-	DECLARE_WAITQUEUE(wait, curr);
-	struct futex_hash_bucket *hb;
+	DECLARE_WAITQUEUE(wait, current);
+	int ret, curval;
 	struct futex_q q;
-	u32 uval;
-	int ret;
+	struct futex_hash_bucket *bh;
 
-	q.pi_state = NULL;
 retry:
-	down_read(&curr->mm->mmap_sem);
+	down_read(&current->mm->mmap_sem);
 
 	ret = get_futex_key(uaddr, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
-	hb = queue_lock(&q, -1, NULL);
+	bh = queue_lock(&q, -1, NULL);
 
 	/*
 	 * Access the page AFTER the futex is queued.
@@ -1040,35 +660,37 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	 * We hold the mmap semaphore, so the mapping cannot have changed
 	 * since we looked it up in get_futex_key.
 	 */
-	ret = get_futex_value_locked(&uval, uaddr);
+
+	ret = get_futex_value_locked(&curval, (int __user *)uaddr);
 
 	if (unlikely(ret)) {
-		queue_unlock(&q, hb);
+		queue_unlock(&q, bh);
 
-		/*
-		 * If we would have faulted, release mmap_sem, fault it in and
+		/* If we would have faulted, release mmap_sem, fault it in and
 		 * start all over again.
 		 */
-		up_read(&curr->mm->mmap_sem);
+		up_read(&current->mm->mmap_sem);
 
-		ret = get_user(uval, uaddr);
+		ret = get_user(curval, (int __user *)uaddr);
 
 		if (!ret)
 			goto retry;
 		return ret;
 	}
-	ret = -EWOULDBLOCK;
-	if (uval != val)
-		goto out_unlock_release_sem;
+	if (curval != val) {
+		ret = -EWOULDBLOCK;
+		queue_unlock(&q, bh);
+		goto out_release_sem;
+	}
 
 	/* Only actually queue if *uaddr contained val. */
-	__queue_me(&q, hb);
+	__queue_me(&q, bh);
 
 	/*
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
-	 */
-	up_read(&curr->mm->mmap_sem);
+	 */
+	up_read(&current->mm->mmap_sem);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -1100,367 +722,12 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 		return 0;
 	if (time == 0)
 		return -ETIMEDOUT;
-	/*
-	 * We expect signal_pending(current), but another thread may
-	 * have handled it for us already.
-	 */
+	/* We expect signal_pending(current), but another thread may
+	 * have handled it for us already. */
 	return -EINTR;
 
- out_unlock_release_sem:
-	queue_unlock(&q, hb);
-
- out_release_sem:
-	up_read(&curr->mm->mmap_sem);
-	return ret;
-}
-
-/*
- * Userspace tried a 0 -> TID atomic transition of the futex value
- * and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block, it does PI, etc. (Due to
- * races the kernel might see a 0 value of the futex too.)
- */
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
-			 long nsec, int trylock)
-{
-	struct hrtimer_sleeper timeout, *to = NULL;
-	struct task_struct *curr = current;
-	struct futex_hash_bucket *hb;
-	u32 uval, newval, curval;
-	struct futex_q q;
-	int ret, attempt = 0;
-
-	if (refill_pi_state_cache())
-		return -ENOMEM;
-
-	if (sec != MAX_SCHEDULE_TIMEOUT) {
-		to = &timeout;
-		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
-		hrtimer_init_sleeper(to, current);
-		to->timer.expires = ktime_set(sec, nsec);
-	}
-
-	q.pi_state = NULL;
- retry:
-	down_read(&curr->mm->mmap_sem);
-
-	ret = get_futex_key(uaddr, &q.key);
-	if (unlikely(ret != 0))
-		goto out_release_sem;
-
-	hb = queue_lock(&q, -1, NULL);
-
- retry_locked:
-	/*
-	 * To avoid races, we attempt to take the lock here again
-	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
-	 * the locks. It will most likely not succeed.
-	 */
-	newval = current->pid;
-
-	inc_preempt_count();
-	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
-	dec_preempt_count();
-
-	if (unlikely(curval == -EFAULT))
-		goto uaddr_faulted;
-
-	/* We own the lock already */
-	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
-		if (!detect && 0)
-			force_sig(SIGKILL, current);
-		ret = -EDEADLK;
-		goto out_unlock_release_sem;
-	}
-
-	/*
-	 * Surprise - we got the lock. Just return
-	 * to userspace:
-	 */
-	if (unlikely(!curval))
-		goto out_unlock_release_sem;
-
-	uval = curval;
-	newval = uval | FUTEX_WAITERS;
-
-	inc_preempt_count();
-	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-	dec_preempt_count();
-
-	if (unlikely(curval == -EFAULT))
-		goto uaddr_faulted;
-	if (unlikely(curval != uval))
-		goto retry_locked;
-
-	/*
-	 * We dont have the lock. Look up the PI state (or create it if
-	 * we are the first waiter):
-	 */
-	ret = lookup_pi_state(uval, hb, &q);
-
-	if (unlikely(ret)) {
-		/*
-		 * There were no waiters and the owner task lookup
-		 * failed. When the OWNER_DIED bit is set, then we
-		 * know that this is a robust futex and we actually
-		 * take the lock. This is safe as we are protected by
-		 * the hash bucket lock. We also set the waiters bit
-		 * unconditionally here, to simplify glibc handling of
-		 * multiple tasks racing to acquire the lock and
-		 * cleanup the problems which were left by the dead
-		 * owner.
-		 */
-		if (curval & FUTEX_OWNER_DIED) {
-			uval = newval;
-			newval = current->pid |
-				FUTEX_OWNER_DIED | FUTEX_WAITERS;
-
-			inc_preempt_count();
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			dec_preempt_count();
-
-			if (unlikely(curval == -EFAULT))
-				goto uaddr_faulted;
-			if (unlikely(curval != uval))
-				goto retry_locked;
-			ret = 0;
-		}
-		goto out_unlock_release_sem;
-	}
-
-	/*
-	 * Only actually queue now that the atomic ops are done:
-	 */
-	__queue_me(&q, hb);
-
-	/*
-	 * Now the futex is queued and we have checked the data, we
-	 * don't want to hold mmap_sem while we sleep.
-	 */
-	up_read(&curr->mm->mmap_sem);
-
-	WARN_ON(!q.pi_state);
-	/*
-	 * Block on the PI mutex:
-	 */
-	if (!trylock)
-		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
-	else {
-		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
-		/* Fixup the trylock return value: */
-		ret = ret ? 0 : -EWOULDBLOCK;
-	}
-
-	down_read(&curr->mm->mmap_sem);
-	spin_lock(q.lock_ptr);
-
-	/*
-	 * Got the lock. We might not be the anticipated owner if we
-	 * did a lock-steal - fix up the PI-state in that case.
-	 */
-	if (!ret && q.pi_state->owner != curr) {
-		u32 newtid = current->pid | FUTEX_WAITERS;
-
-		/* Owner died? */
-		if (q.pi_state->owner != NULL) {
-			spin_lock_irq(&q.pi_state->owner->pi_lock);
-			WARN_ON(list_empty(&q.pi_state->list));
-			list_del_init(&q.pi_state->list);
-			spin_unlock_irq(&q.pi_state->owner->pi_lock);
-		} else
-			newtid |= FUTEX_OWNER_DIED;
-
-		q.pi_state->owner = current;
-
-		spin_lock_irq(&current->pi_lock);
-		WARN_ON(!list_empty(&q.pi_state->list));
-		list_add(&q.pi_state->list, &current->pi_state_list);
-		spin_unlock_irq(&current->pi_lock);
-
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
-		up_read(&curr->mm->mmap_sem);
-		/*
-		 * We own it, so we have to replace the pending owner
-		 * TID. This must be atomic as we have preserve the
-		 * owner died bit here.
-		 */
-		ret = get_user(uval, uaddr);
-		while (!ret) {
-			newval = (uval & FUTEX_OWNER_DIED) | newtid;
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			if (curval == -EFAULT)
-				ret = -EFAULT;
-			if (curval == uval)
-				break;
-			uval = curval;
-		}
-	} else {
-		/*
-		 * Catch the rare case, where the lock was released
-		 * when we were on the way back before we locked
-		 * the hash bucket.
-		 */
-		if (ret && q.pi_state->owner == curr) {
-			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-				ret = 0;
-		}
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
-		up_read(&curr->mm->mmap_sem);
-	}
-
-	if (!detect && ret == -EDEADLK && 0)
-		force_sig(SIGKILL, current);
-
-	return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
- out_unlock_release_sem:
-	queue_unlock(&q, hb);
- out_release_sem:
-	up_read(&curr->mm->mmap_sem);
-	return ret;
-
- uaddr_faulted:
-	/*
-	 * We have to r/w *(int __user *)uaddr, but we can't modify it
-	 * non-atomically. Therefore, if get_user below is not
-	 * enough, we need to handle the fault ourselves, while
-	 * still holding the mmap_sem.
-	 */
-	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
-			goto out_unlock_release_sem;
-		}
-		goto retry_locked;
-	}
-
-	queue_unlock(&q, hb);
-	up_read(&curr->mm->mmap_sem);
-
-	ret = get_user(uval, uaddr);
-	if (!ret && (uval != -EFAULT))
-		goto retry;
-
-	return ret;
-}
-
-/*
- * Userspace attempted a TID -> 0 atomic transition, and failed.
- * This is the in-kernel slowpath: we look up the PI state (if any),
- * and do the rt-mutex unlock.
- */
-static int futex_unlock_pi(u32 __user *uaddr)
-{
-	struct futex_hash_bucket *hb;
-	struct futex_q *this, *next;
-	u32 uval;
-	struct list_head *head;
-	union futex_key key;
-	int ret, attempt = 0;
-
-retry:
-	if (get_user(uval, uaddr))
-		return -EFAULT;
-	/*
-	 * We release only a lock we actually own:
-	 */
-	if ((uval & FUTEX_TID_MASK) != current->pid)
-		return -EPERM;
-	/*
-	 * First take all the futex related locks:
-	 */
-	down_read(&current->mm->mmap_sem);
-
-	ret = get_futex_key(uaddr, &key);
-	if (unlikely(ret != 0))
-		goto out;
-
-	hb = hash_futex(&key);
-	spin_lock(&hb->lock);
-
-retry_locked:
-	/*
-	 * To avoid races, try to do the TID -> 0 atomic transition
-	 * again. If it succeeds then we can return without waking
-	 * anyone else up:
-	 */
-	if (!(uval & FUTEX_OWNER_DIED)) {
-		inc_preempt_count();
-		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-		dec_preempt_count();
-	}
-
-	if (unlikely(uval == -EFAULT))
-		goto pi_faulted;
-	/*
-	 * Rare case: we managed to release the lock atomically,
-	 * no need to wake anyone else up:
-	 */
-	if (unlikely(uval == current->pid))
-		goto out_unlock;
-
-	/*
-	 * Ok, other tasks may need to be woken up - check waiters
-	 * and do the wakeup if necessary:
-	 */
-	head = &hb->chain;
-
-	list_for_each_entry_safe(this, next, head, list) {
-		if (!match_futex (&this->key, &key))
-			continue;
-		ret = wake_futex_pi(uaddr, uval, this);
-		/*
-		 * The atomic access to the futex value
-		 * generated a pagefault, so retry the
-		 * user-access and the wakeup:
-		 */
-		if (ret == -EFAULT)
-			goto pi_faulted;
-		goto out_unlock;
-	}
-	/*
-	 * No waiters - kernel unlocks the futex:
-	 */
-	if (!(uval & FUTEX_OWNER_DIED)) {
-		ret = unlock_futex_pi(uaddr, uval);
-		if (ret == -EFAULT)
-			goto pi_faulted;
-	}
-
-out_unlock:
-	spin_unlock(&hb->lock);
 out:
 	up_read(&current->mm->mmap_sem);
-
-	return ret;
-
-pi_faulted:
-	/*
-	 * We have to r/w *(int __user *)uaddr, but we can't modify it
-	 * non-atomically. Therefore, if get_user below is not
-	 * enough, we need to handle the fault ourselves, while
-	 * still holding the mmap_sem.
-	 */
-	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
-			goto out_unlock;
-		}
-		goto retry_locked;
-	}
-
-	spin_unlock(&hb->lock);
-	up_read(&current->mm->mmap_sem);
-
-	ret = get_user(uval, uaddr);
-	if (!ret && (uval != -EFAULT))
-		goto retry;
 	return ret;
 }
@@ -1470,7 +737,6 @@ static int futex_close(struct inode *inode, struct file *filp)
 
 	unqueue_me(q);
 	kfree(q);
-
 	return 0;
 }
 
@@ -1502,7 +768,7 @@ static struct file_operations futex_fops = {
 * Signal allows caller to avoid the race which would occur if they
 * set the sigio stuff up afterwards.
 */
-static int futex_fd(u32 __user *uaddr, int signal)
+static int futex_fd(unsigned long uaddr, int signal)
 {
 	struct futex_q *q;
 	struct file *filp;
@@ -1539,7 +805,6 @@ static int futex_fd(u32 __user *uaddr, int signal)
 		err = -ENOMEM;
 		goto error;
 	}
-	q->pi_state = NULL;
 
 	down_read(&current->mm->mmap_sem);
 	err = get_futex_key(uaddr, &q->key);
@@ -1577,7 +842,7 @@
 * Implementation: user-space maintains a per-thread list of locks it
 * is holding. Upon do_exit(), the kernel carefully walks this list,
 * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
 * always manipulated with the lock held, so the list is private and
 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
 * field, to allow the kernel to clean up if the thread dies after
@@ -1650,9 +915,9 @@ err_unlock:
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	u32 uval, nval, mval;
+	u32 uval;
 
retry:
 	if (get_user(uval, uaddr))
@@ -1669,44 +934,16 @@ retry:
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
-		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
-		if (nval == -EFAULT)
-			return -1;
-
-		if (nval != uval)
+		if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
+					uval | FUTEX_OWNER_DIED) != uval)
 			goto retry;
 
-		/*
-		 * Wake robust non-PI futexes here. The wakeup of
-		 * PI futexes happens in exit_pi_state():
-		 */
-		if (!pi) {
-			if (uval & FUTEX_WAITERS)
-				futex_wake(uaddr, 1);
-		}
+		if (uval & FUTEX_WAITERS)
+			futex_wake((unsigned long)uaddr, 1);
 	}
 	return 0;
 }
 
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int fetch_robust_entry(struct robust_list __user **entry,
-				     struct robust_list __user **head, int *pi)
-{
-	unsigned long uentry;
-
-	if (get_user(uentry, (unsigned long *)head))
-		return -EFAULT;
-
-	*entry = (void *)(uentry & ~1UL);
-	*pi = uentry & 1;
-
-	return 0;
-}
-
 /*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
@@ -1717,14 +954,14 @@ void exit_robust_list(struct task_struct *curr)
 {
 	struct robust_list_head __user *head = curr->robust_list;
 	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+	unsigned int limit = ROBUST_LIST_LIMIT;
 	unsigned long futex_offset;
 
 	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
-	if (fetch_robust_entry(&entry, &head->list.next, &pi))
+	if (get_user(entry, &head->list.next))
 		return;
 	/*
	 * Fetch the relative futex offset:
@@ -1735,25 +972,24 @@ void exit_robust_list(struct task_struct *curr)
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
-	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+	if (get_user(pending, &head->list_op_pending))
 		return;
-	if (pending)
-		handle_futex_death((void *)pending + futex_offset, curr, pip);
+	handle_futex_death((void *)pending + futex_offset, curr);
 
 	while (entry != &head->list) {
 		/*
		 * A pending lock might already be on the list, so
-		 * don't process it twice:
+		 * dont process it twice:
		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
-						curr, pi))
+						curr))
 				return;
 		/*
		 * Fetch the next entry in the list:
		 */
-		if (fetch_robust_entry(&entry, &entry->next, &pi))
+		if (get_user(entry, &entry->next))
 			return;
 		/*
		 * Avoid excessively long or circular lists:
@@ -1765,8 +1001,8 @@ void exit_robust_list(struct task_struct *curr)
 	}
 }
 
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
-		u32 __user *uaddr2, u32 val2, u32 val3)
+long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
+		unsigned long uaddr2, int val2, int val3)
 {
 	int ret;
 
@@ -1790,15 +1026,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
 	case FUTEX_WAKE_OP:
 		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
 		break;
-	case FUTEX_LOCK_PI:
-		ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
-		break;
-	case FUTEX_UNLOCK_PI:
-		ret = futex_unlock_pi(uaddr);
-		break;
-	case FUTEX_TRYLOCK_PI:
-		ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
-		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -1806,40 +1033,36 @@
 }
 
-asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
			  struct timespec __user *utime, u32 __user *uaddr2,
-			  u32 val3)
+			  int val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
-	u32 val2 = 0;
+	int val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
+	if (utime && (op == FUTEX_WAIT)) {
		if (copy_from_user(&t, utime, sizeof(t)) != 0)
			return -EFAULT;
		if (!timespec_valid(&t))
			return -EINVAL;
-		if (op == FUTEX_WAIT)
-			timeout = timespec_to_jiffies(&t) + 1;
-		else {
-			timeout = t.tv_sec;
-			val2 = t.tv_nsec;
-		}
+		timeout = timespec_to_jiffies(&t) + 1;
	}
	/*
	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
	 */
-	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
-		val2 = (u32) (unsigned long) utime;
+	if (op >= FUTEX_REQUEUE)
+		val2 = (int) (unsigned long) utime;

-	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+	return do_futex((unsigned long)uaddr, op, val, timeout,
+			(unsigned long)uaddr2, val2, val3);
 }

-static int futexfs_get_sb(struct file_system_type *fs_type,
-			  int flags, const char *dev_name, void *data,
-			  struct vfsmount *mnt)
+static struct super_block *
+futexfs_get_sb(struct file_system_type *fs_type,
+	       int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
+	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
 }

 static struct file_system_type futex_fs_type = {