Diffstat (limited to 'kernel/futex.c')
 -rw-r--r--   kernel/futex.c | 732
 1 file changed, 532 insertions(+), 200 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be20173..b632b5f3f09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -59,14 +59,121 @@
 #include <linux/magic.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
+#include <linux/freezer.h>
+#include <linux/bootmem.h>
 
 #include <asm/futex.h>
 
-#include "rtmutex_common.h"
+#include "locking/rtmutex_common.h"
 
-int __read_mostly futex_cmpxchg_enabled;
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0                               CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *   uval = *futex;
+ *                                     *futex = newval;
+ *                                     sys_futex(WAKE, futex);
+ *                                       futex_wake(futex);
+ *                                       if (queue_empty())
+ *                                         return;
+ *   if (uval == val)
+ *      lock(hash_bucket(futex));
+ *      queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0                                 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *
+ *   waiters++; (a)
+ *   mb(); (A) <-- paired with -.
+ *                              |
+ *   lock(hash_bucket(futex));  |
+ *                              |
+ *   uval = *futex;             |
+ *                              |        *futex = newval;
+ *                              |        sys_futex(WAKE, futex);
+ *                              |          futex_wake(futex);
+ *                              |
+ *                              `------->  mb(); (B)
+ *   if (uval == val)
+ *     queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();                         if (waiters)
+ *                                           lock(hash_bucket(futex));
+ *   else                                    wake_waiters(futex);
+ *     waiters--; (b)                        unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers in
+ * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
+ * futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ *	X = Y = 0
+ *
+ *	w[X]=1		w[Y]=1
+ *	MB		MB
+ *	r[Y]=y		r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
 
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
+int __read_mostly futex_cmpxchg_enabled;
+#endif
 
 /*
  * Futex flags used to encode options to functions and preserve them across
@@ -143,11 +250,59 @@ static const struct futex_q futex_q_init = {
  * waiting on a futex.
  */
 struct futex_hash_bucket {
+	atomic_t waiters;
 	spinlock_t lock;
 	struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
+
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
 
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static inline void futex_get_mm(union futex_key *key)
+{
+	atomic_inc(&key->private.mm->mm_count);
+	/*
+	 * Ensure futex_get_mm() implies a full barrier such that
+	 * get_futex_key() implies a full barrier. This is relied upon
+	 * as full barrier (B), see the ordering comment above.
+	 */
+	smp_mb__after_atomic();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	atomic_inc(&hb->waiters);
+	/*
+	 * Full barrier (A), see the ordering comment above.
+	 */
+	smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	return atomic_read(&hb->waiters);
+#else
+	return 1;
+#endif
+}
 
 /*
  * We hash on the keys returned from get_futex_key (see below).
@@ -157,7 +312,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
 	u32 hash = jhash2((u32*)&key->both.word,
 			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 			  key->both.offset);
-	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+	return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
 /*
@@ -183,10 +338,10 @@ static void get_futex_key_refs(union futex_key *key)
 
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
-		ihold(key->shared.inode);
+		ihold(key->shared.inode); /* implies MB (B) */
 		break;
 	case FUT_OFF_MMSHARED:
-		atomic_inc(&key->private.mm->mm_count);
+		futex_get_mm(key); /* implies MB (B) */
 		break;
 	}
 }
@@ -221,10 +376,11 @@ static void drop_futex_key_refs(union futex_key *key)
  * @rw:		mapping needs to be read/write (values: VERIFY_READ,
  *		VERIFY_WRITE)
  *
- * Returns a negative error code or 0
+ * Return: a negative error code or 0
+ *
  * The key words are stored in *key on success.
 *
- * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
  * offset_within_page). For private mappings, it's (uaddr, current->mm).
  * We can usually work out the index without swapping in the page.
 *
@@ -246,6 +402,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 		return -EINVAL;
 	address -= key->both.offset;
 
+	if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+		return -EFAULT;
+
 	/*
 	 * PROCESS_PRIVATE futexes are fast.
 	 * As the mm cannot disappear under us and the 'key' only needs
@@ -254,11 +413,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 	 * but access_ok() should be faster than find_vma()
 	 */
 	if (!fshared) {
-		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
-			return -EFAULT;
 		key->private.mm = mm;
 		key->private.address = address;
-		get_futex_key_refs(key);
+		get_futex_key_refs(key); /* implies MB (B) */
 		return 0;
 	}
 
@@ -283,7 +440,7 @@ again:
 	put_page(page);
 	/* serialize against __split_huge_page_splitting() */
 	local_irq_disable();
-	if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+	if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
 		page_head = compound_head(page);
 		/*
 		 * page_head is valid pointer but we must pin
@@ -362,10 +519,10 @@ again:
 	} else {
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 		key->shared.inode = page_head->mapping->host;
-		key->shared.pgoff = page_head->index;
+		key->shared.pgoff = basepage_index(page);
 	}
 
-	get_futex_key_refs(key);
+	get_futex_key_refs(key); /* implies MB (B) */
 
 out:
 	unlock_page(page_head);
@@ -586,27 +743,74 @@ void exit_pi_state_list(struct task_struct *curr)
 	raw_spin_unlock_irq(&curr->pi_lock);
 }
 
+/*
+ * We need to check the following states:
+ *
+ *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
+ *
+ * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
+ * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
+ *
+ * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
+ *
+ * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
+ * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
+ *
+ * [6]  Found  | Found    | task      | 0         | 1      | Valid
+ *
+ * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
+ *
+ * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
+ * [9]  Found  | Found    | task      | 0         | 0      | Invalid
+ * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
+ *
+ * [1]	Indicates that the kernel can acquire the futex atomically. We
+ *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2]	Valid, if TID does not belong to a kernel thread. If no matching
+ *	thread is found then it indicates that the owner TID has died.
+ *
+ * [3]	Invalid. The waiter is queued on a non PI futex
+ *
+ * [4]	Valid state after exit_robust_list(), which sets the user space
+ *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5]	The user space value got manipulated between exit_robust_list()
+ *	and exit_pi_state_list()
+ *
+ * [6]	Valid state after exit_pi_state_list() which sets the new owner in
+ *	the pi_state but cannot access the user space value.
+ *
+ * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8]	Owner and user space value match
+ *
+ * [9]	There is no transient state which sets the user space TID to 0
+ *	except exit_robust_list(), but this is indicated by the
+ *	FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ *	TID out of sync.
+ */
 static int
 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 		union futex_key *key, struct futex_pi_state **ps)
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	struct task_struct *p;
 	pid_t pid = uval & FUTEX_TID_MASK;
 
-	head = &hb->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex(&this->key, key)) {
 			/*
-			 * Another waiter already exists - bump up
-			 * the refcount and return its pi_state:
+			 * Sanity check the waiter before increasing
+			 * the refcount and attaching to it.
 			 */
 			pi_state = this->pi_state;
 			/*
-			 * Userspace might have messed up non-PI and PI futexes
+			 * Userspace might have messed up non-PI and
+			 * PI futexes [3]
 			 */
 			if (unlikely(!pi_state))
 				return -EINVAL;
@@ -614,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 			WARN_ON(!atomic_read(&pi_state->refcount));
 
 			/*
-			 * When pi_state->owner is NULL then the owner died
-			 * and another waiter is on the fly. pi_state->owner
-			 * is fixed up by the task which acquires
-			 * pi_state->rt_mutex.
-			 *
-			 * We do not check for pid == 0 which can happen when
-			 * the owner died and robust_list_exit() cleared the
-			 * TID.
+			 * Handle the owner died case:
 			 */
-			if (pid && pi_state->owner) {
+			if (uval & FUTEX_OWNER_DIED) {
+				/*
+				 * exit_pi_state_list sets owner to NULL and
+				 * wakes the topmost waiter. The task which
+				 * acquires the pi_state->rt_mutex will fixup
+				 * owner.
+				 */
+				if (!pi_state->owner) {
+					/*
+					 * No pi state owner, but the user
+					 * space TID is not 0. Inconsistent
+					 * state. [5]
+					 */
+					if (pid)
+						return -EINVAL;
+					/*
+					 * Take a ref on the state and
+					 * return. [4]
+					 */
+					goto out_state;
+				}
+
+				/*
+				 * If TID is 0, then either the dying owner
+				 * has not yet executed exit_pi_state_list()
+				 * or some waiter acquired the rtmutex in the
+				 * pi state, but did not yet fixup the TID in
+				 * user space.
+				 *
+				 * Take a ref on the state and return. [6]
+				 */
+				if (!pid)
+					goto out_state;
+			} else {
 				/*
-				 * Bail out if user space manipulated the
-				 * futex value.
+				 * If the owner died bit is not set,
+				 * then the pi_state must have an
+				 * owner. [7]
 				 */
-				if (pid != task_pid_vnr(pi_state->owner))
+				if (!pi_state->owner)
 					return -EINVAL;
 			}
 
+			/*
+			 * Bail out if user space manipulated the
+			 * futex value. If pi state exists then the
+			 * owner TID must be the same as the user
+			 * space TID. [9/10]
+			 */
+			if (pid != task_pid_vnr(pi_state->owner))
+				return -EINVAL;
+
+		out_state:
 			atomic_inc(&pi_state->refcount);
 			*ps = pi_state;
-
 			return 0;
 		}
 	}
 
 	/*
 	 * We are the first waiter - try to look up the real owner and attach
-	 * the new pi_state to it, but bail out when TID = 0
+	 * the new pi_state to it, but bail out when TID = 0 [1]
 	 */
 	if (!pid)
 		return -ESRCH;
@@ -649,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	if (!p)
 		return -ESRCH;
 
+	if (!p->mm) {
+		put_task_struct(p);
+		return -EPERM;
+	}
+
 	/*
 	 * We need to look at the task state flags to figure out,
 	 * whether the task is exiting. To protect against the do_exit
@@ -669,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 		return ret;
 	}
 
+	/*
+	 * No existing pi state. First waiter. [2]
+	 */
 	pi_state = alloc_pi_state();
 
 	/*
@@ -703,9 +951,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 *			be "current" except in the case of requeue pi.
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
- * Returns:
- *  0 - ready to wait
- *  1 - acquired the lock
+ * Return:
+ *  0 - ready to wait;
+ *  1 - acquired the lock;
 * <0 - error
 *
 * The hb->lock and futex_key refs shall be held by the caller.
@@ -715,7 +963,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 				struct futex_pi_state **ps,
 				struct task_struct *task, int set_waiters)
 {
-	int lock_taken, ret, ownerdied = 0;
+	int lock_taken, ret, force_take = 0;
 	u32 uval, newval, curval, vpid = task_pid_vnr(task);
 
 retry:
@@ -740,10 +988,18 @@ retry:
 		return -EDEADLK;
 
 	/*
-	 * Surprise - we got the lock. Just return to userspace:
+	 * Surprise - we got the lock, but we do not trust user space at all.
 	 */
-	if (unlikely(!curval))
-		return 1;
+	if (unlikely(!curval)) {
+		/*
+		 * We verify whether there is kernel state for this
+		 * futex. If not, we can safely assume that the 0 ->
+		 * TID transition is correct. If state exists, we do
+		 * not bother to fixup the user space state as it was
+		 * corrupted already.
+		 */
+		return futex_top_waiter(hb, key) ? -EINVAL : 1;
+	}
 
 	uval = curval;
 
@@ -754,17 +1010,15 @@ retry:
 	newval = curval | FUTEX_WAITERS;
 
 	/*
-	 * There are two cases, where a futex might have no owner (the
-	 * owner TID is 0): OWNER_DIED. We take over the futex in this
-	 * case. We also do an unconditional take over, when the owner
-	 * of the futex died.
-	 *
-	 * This is safe as we are protected by the hash bucket lock !
+	 * Should we force take the futex? See below.
 	 */
-	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
-		/* Keep the OWNER_DIED bit */
+	if (unlikely(force_take)) {
+		/*
+		 * Keep the OWNER_DIED and the WAITERS bit and set the
+		 * new TID value.
+		 */
 		newval = (curval & ~FUTEX_TID_MASK) | vpid;
-		ownerdied = 0;
+		force_take = 0;
 		lock_taken = 1;
 	}
 
@@ -774,7 +1028,7 @@ retry:
 		goto retry;
 
 	/*
-	 * We took the lock due to owner died take over.
+	 * We took the lock due to forced take over.
 	 */
 	if (unlikely(lock_taken))
 		return 1;
@@ -789,20 +1043,25 @@ retry:
 		switch (ret) {
 		case -ESRCH:
 			/*
-			 * No owner found for this futex. Check if the
-			 * OWNER_DIED bit is set to figure out whether
-			 * this is a robust futex or not.
+			 * We failed to find an owner for this
+			 * futex. So we have no pi_state to block
+			 * on. This can happen in two cases:
+			 *
+			 * 1) The owner died
+			 * 2) A stale FUTEX_WAITERS bit
+			 *
+			 * Re-read the futex value.
 			 */
 			if (get_futex_value_locked(&curval, uaddr))
 				return -EFAULT;
 
 			/*
-			 * We simply start over in case of a robust
-			 * futex. The code above will take the futex
-			 * and return happy.
+			 * If the owner died or we have a stale
+			 * WAITERS bit the owner TID in the user space
+			 * futex is 0.
 			 */
-			if (curval & FUTEX_OWNER_DIED) {
-				ownerdied = 1;
+			if (!(curval & FUTEX_TID_MASK)) {
+				force_take = 1;
 				goto retry;
 			}
 		default:
@@ -829,6 +1088,7 @@ static void __unqueue_futex(struct futex_q *q)
 
 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 }
 
 /*
@@ -839,6 +1099,9 @@ static void wake_futex(struct futex_q *q)
 {
 	struct task_struct *p = q->task;
 
+	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+		return;
+
 	/*
 	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
 	 * a non-futex wake up happens on another CPU then the task
@@ -867,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	struct task_struct *new_owner;
 	struct futex_pi_state *pi_state = this->pi_state;
 	u32 uninitialized_var(curval), newval;
+	int ret = 0;
 
 	if (!pi_state)
 		return -EINVAL;
@@ -890,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 		new_owner = this->task;
 
 	/*
-	 * We pass it to the next owner. (The WAITERS bit is always
-	 * kept enabled while there is PI state around. We must also
-	 * preserve the owner died bit.)
+	 * We pass it to the next owner. The WAITERS bit is always
+	 * kept enabled while there is PI state around. We cleanup the
+	 * owner died bit, because we are the owner.
 	 */
-	if (!(uval & FUTEX_OWNER_DIED)) {
-		int ret = 0;
-
-		newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
-		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
-			ret = -EFAULT;
-		else if (curval != uval)
-			ret = -EINVAL;
-		if (ret) {
-			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-			return ret;
-		}
+	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+		ret = -EFAULT;
+	else if (curval != uval)
+		ret = -EINVAL;
+	if (ret) {
+		raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+		return ret;
 	}
 
 	raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -974,7 +1234,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
 
@@ -986,10 +1245,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 		goto out;
 
 	hb = hash_futex(&key);
+
+	/* Make sure we really have tasks to wakeup */
+	if (!hb_waiters_pending(hb))
+		goto out_put_key;
+
 	spin_lock(&hb->lock);
-	head = &hb->chain;
 
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex (&this->key, &key)) {
 			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
@@ -1007,6 +1270,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	}
 
 	spin_unlock(&hb->lock);
+out_put_key:
 	put_futex_key(&key);
out:
 	return ret;
@@ -1022,7 +1286,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct plist_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret;
 
@@ -1070,10 +1333,12 @@ retry_private:
 		goto retry;
 	}
 
-	head = &hb1->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
 		if (match_futex (&this->key, &key1)) {
+			if (this->pi_state || this->rt_waiter) {
+				ret = -EINVAL;
+				goto out_unlock;
+			}
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
@@ -1081,11 +1346,13 @@ retry_private:
 	}
 
 	if (op_ret > 0) {
-		head = &hb2->chain;
-
 		op_ret = 0;
-		plist_for_each_entry_safe(this, next, head, list) {
+		plist_for_each_entry_safe(this, next, &hb2->chain, list) {
 			if (match_futex (&this->key, &key2)) {
+				if (this->pi_state || this->rt_waiter) {
+					ret = -EINVAL;
+					goto out_unlock;
+				}
 				wake_futex(this);
 				if (++op_ret >= nr_wake2)
 					break;
@@ -1094,6 +1361,7 @@ retry_private:
 		ret += op_ret;
 	}
 
+out_unlock:
 	double_unlock_hb(hb1, hb2);
out_put_keys:
 	put_futex_key(&key2);
@@ -1121,7 +1389,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	 */
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
+		hb_waiters_dec(hb1);
 		plist_add(&q->list, &hb2->chain);
+		hb_waiters_inc(hb2);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
@@ -1174,9 +1444,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
 * hb1 and hb2 must be held by the caller.
 *
- * Returns:
- *  0 - failed to acquire the lock atomicly
- *  1 - acquired the lock
+ * Return:
+ *  0 - failed to acquire the lock atomically;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
 * <0 - error
 */
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1187,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 {
 	struct futex_q *top_waiter = NULL;
 	u32 curval;
-	int ret;
+	int ret, vpid;
 
 	if (get_futex_value_locked(&curval, pifutex))
 		return -EFAULT;
@@ -1215,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 	 * the contended case or if set_waiters is 1.  The pi_state is returned
 	 * in ps in contended cases.
 	 */
+	vpid = task_pid_vnr(top_waiter->task);
 	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
 				   set_waiters);
-	if (ret == 1)
+	if (ret == 1) {
 		requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+		return vpid;
+	}
 	return ret;
 }
 
@@ -1237,8 +1509,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
 * uaddr2 atomically on behalf of the top waiter.
 *
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
 *  <0 - on error
 */
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1249,12 +1521,17 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	int drop_count = 0, task_count = 0, ret;
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct plist_head *head1;
 	struct futex_q *this, *next;
-	u32 curval2;
 
 	if (requeue_pi) {
 		/*
+		 * Requeue PI only works on two distinct uaddrs. This
+		 * check is only valid for private futexes. See below.
+		 */
+		if (uaddr1 == uaddr2)
+			return -EINVAL;
+
+		/*
 		 * requeue_pi requires a pi_state, try to allocate it now
 		 * without any locks in case it fails.
 		 */
@@ -1292,10 +1569,20 @@ retry:
 	if (unlikely(ret != 0))
 		goto out_put_key1;
 
+	/*
+	 * The check above which compares uaddrs is not sufficient for
+	 * shared futexes. We need to compare the keys:
+	 */
+	if (requeue_pi && match_futex(&key1, &key2)) {
+		ret = -EINVAL;
+		goto out_put_keys;
+	}
+
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
 
retry_private:
+	hb_waiters_inc(hb2);
 	double_lock_hb(hb1, hb2);
 
 	if (likely(cmpval != NULL)) {
@@ -1305,6 +1592,7 @@ retry_private:
 
 		if (unlikely(ret)) {
 			double_unlock_hb(hb1, hb2);
+			hb_waiters_dec(hb2);
 
 			ret = get_user(curval, uaddr1);
 			if (ret)
@@ -1337,16 +1625,25 @@ retry_private:
 		 * At this point the top_waiter has either taken uaddr2 or is
 		 * waiting on it.  If the former, then the pi_state will not
 		 * exist yet, look it up one more time to ensure we have a
-		 * reference to it.
+		 * reference to it. If the lock was taken, ret contains the
+		 * vpid of the top waiter task.
 		 */
-		if (ret == 1) {
+		if (ret > 0) {
 			WARN_ON(pi_state);
 			drop_count++;
 			task_count++;
-			ret = get_futex_value_locked(&curval2, uaddr2);
-			if (!ret)
-				ret = lookup_pi_state(curval2, hb2, &key2,
-						      &pi_state);
+			/*
+			 * If we acquired the lock, then the user
+			 * space value of uaddr2 should be vpid. It
+			 * cannot be changed by the top waiter as it
+			 * is blocked on hb2 lock if it tries to do
+			 * so. If something fiddled with it behind our
+			 * back the pi state lookup might unearth
+			 * it. So we rather use the known value than
+			 * rereading and handing potential crap to
+			 * lookup_pi_state.
+			 */
+			ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
 		}
 
 		switch (ret) {
 		case 0:
 			break;
 		case -EFAULT:
 			double_unlock_hb(hb1, hb2);
+			hb_waiters_dec(hb2);
 			put_futex_key(&key2);
 			put_futex_key(&key1);
 			ret = fault_in_user_writeable(uaddr2);
@@ -1363,6 +1661,7 @@ retry_private:
 		case -EAGAIN:
 			/* The owner was exiting, try again. */
 			double_unlock_hb(hb1, hb2);
+			hb_waiters_dec(hb2);
 			put_futex_key(&key2);
 			put_futex_key(&key1);
 			cond_resched();
@@ -1372,8 +1671,7 @@ retry_private:
 		}
 	}
 
-	head1 = &hb1->chain;
-	plist_for_each_entry_safe(this, next, head1, list) {
+	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
 		if (task_count - nr_wake >= nr_requeue)
 			break;
 
@@ -1383,9 +1681,13 @@ retry_private:
 		/*
 		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
 		 * be paired with each other and no other futex ops.
+		 *
+		 * We should never be requeueing a futex_q with a pi_state,
+		 * which is awaiting a futex_unlock_pi().
 		 */
 		if ((requeue_pi && !this->rt_waiter) ||
-		    (!requeue_pi && this->rt_waiter)) {
+		    (!requeue_pi && this->rt_waiter) ||
+		    this->pi_state) {
 			ret = -EINVAL;
 			break;
 		}
@@ -1435,6 +1737,7 @@ retry_private:
 
out_unlock:
 	double_unlock_hb(hb1, hb2);
+	hb_waiters_dec(hb2);
 
 	/*
 	 * drop_futex_key_refs() must be called outside the spinlocks. During
@@ -1462,17 +1765,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 	struct futex_hash_bucket *hb;
 
 	hb = hash_futex(&q->key);
+
+	/*
+	 * Increment the counter before taking the lock so that
+	 * a potential waker won't miss a to-be-slept task that is
+	 * waiting for the spinlock. This is safe as all queue_lock()
+	 * users end up calling queue_me(). Similarly, for housekeeping,
+	 * decrement the counter at queue_unlock() when some error has
+	 * occurred and we don't end up adding the task to the list.
+	 */
+	hb_waiters_inc(hb);
+
 	q->lock_ptr = &hb->lock;
 
-	spin_lock(&hb->lock);
+	spin_lock(&hb->lock); /* implies MB (A) */
 	return hb;
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
 	spin_unlock(&hb->lock);
+	hb_waiters_dec(hb);
 }
 
 /**
@@ -1515,8 +1830,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
 * be paired with exactly one earlier call to queue_me().
 *
- * Returns:
- *   1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ *   1 - if the futex_q was still queued (and we unqueued it);
 *   0 - if the futex_q was already removed by the waking thread
 */
static int unqueue_me(struct futex_q *q)
@@ -1686,9 +2001,9 @@ static long futex_wait_restart(struct restart_block *restart);
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
- * Returns:
- *  1 - success, lock taken
- *  0 - success, lock not taken
+ * Return:
+ *  1 - success, lock taken;
+ *  0 - success, lock not taken;
 * <0 - on error (-EFAULT)
 */
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1785,7 +2100,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 		 * is no timeout, or if it has yet to expire.
 		 */
 		if (!timeout || timeout->task)
-			schedule();
+			freezable_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
 }
@@ -1803,8 +2118,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 * Return with the hb lock held and a q.key reference on success, and unlocked
 * with no q.key reference on failure.
 *
- * Returns:
- *  0 - uaddr contains val and hb has been locked
+ * Return:
+ *  0 - uaddr contains val and hb has been locked;
 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
 */
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -1842,7 +2157,7 @@ retry_private:
 	ret = get_futex_value_locked(&uval, uaddr);
 
 	if (ret) {
-		queue_unlock(q, *hb);
+		queue_unlock(*hb);
 
 		ret = get_user(uval, uaddr);
 		if (ret)
@@ -1856,7 +2171,7 @@ retry_private:
 	}
 
 	if (uval != val) {
-		queue_unlock(q, *hb);
+		queue_unlock(*hb);
 		ret = -EWOULDBLOCK;
 	}
 
@@ -2004,7 +2319,7 @@ retry_private:
 			 * Task is exiting and we just wait for the
 			 * exit to complete.
 			 */
-			queue_unlock(&q, hb);
+			queue_unlock(hb);
 			put_futex_key(&q.key);
 			cond_resched();
 			goto retry;
@@ -2056,7 +2371,7 @@ retry_private:
 	goto out_put_key;
 
out_unlock_put_key:
-	queue_unlock(&q, hb);
+	queue_unlock(hb);
 
out_put_key:
 	put_futex_key(&q.key);
@@ -2066,7 +2381,7 @@ out:
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
uaddr_faulted:
-	queue_unlock(&q, hb);
+	queue_unlock(hb);
 
 	ret = fault_in_user_writeable(uaddr);
 	if (ret)
@@ -2088,7 +2403,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
 	u32 uval, vpid = task_pid_vnr(current);
 	int ret;
@@ -2112,9 +2426,10 @@ retry:
 	/*
 	 * To avoid races, try to do the TID -> 0 atomic transition
 	 * again. If it succeeds then we can return without waking
-	 * anyone else up:
+	 * anyone else up. We only try this if neither the waiters nor
+	 * the owner died bit are set.
 	 */
-	if (!(uval & FUTEX_OWNER_DIED) &&
+	if (!(uval & ~FUTEX_TID_MASK) &&
 	    cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
 		goto pi_faulted;
 	/*
@@ -2128,9 +2443,7 @@ retry:
 	 * Ok, other tasks may need to be woken up - check waiters
 	 * and do the wakeup if necessary:
 	 */
-	head = &hb->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (!match_futex (&this->key, &key))
 			continue;
 		ret = wake_futex_pi(uaddr, uval, this);
@@ -2146,11 +2459,9 @@ retry:
 	/*
 	 * No waiters - kernel unlocks the futex:
 	 */
-	if (!(uval & FUTEX_OWNER_DIED)) {
-		ret = unlock_futex_pi(uaddr, uval);
-		if (ret == -EFAULT)
-			goto pi_faulted;
-	}
+	ret = unlock_futex_pi(uaddr, uval);
+	if (ret == -EFAULT)
+		goto pi_faulted;
 
out_unlock:
 	spin_unlock(&hb->lock);
@@ -2182,9 +2493,9 @@ pi_faulted:
 * the wakeup and return the appropriate error code to the caller. Must be
 * called with the hb lock held.
 *
- * Returns
- *  0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ *  0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
 */
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2207,6 +2518,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 		 * Unqueue the futex_q and determine which it was.
 		 */
 		plist_del(&q->list, &hb->chain);
+		hb_waiters_dec(hb);
 
 		/* Handle spurious wakeups gracefully */
 		ret = -EWOULDBLOCK;
@@ -2226,18 +2538,17 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 * @val:	the expected value of uaddr
 * @abs_time:	absolute timeout
 * @bitset:	32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt:	whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
 * @uaddr2:	the pi futex we will take prior to returning to user-space
 *
 * The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and distinct from uaddr. Normal wakeup will
+ * wake on uaddr2 and complete the acquisition of the rt_mutex prior to
+ * returning to userspace. This ensures the rt_mutex maintains an owner when it
+ * has waiters; without one, the pi logic would not know which task to
+ * boost/deboost, if there was a need to.
 *
 * We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
 * 2) wakeup on uaddr2 after a requeue
 * 3) signal
@@ -2255,8 +2566,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 *
 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
 *
- * Returns:
- *  0 - On success
+ * Return:
+ *  0 - On success;
 * <0 - On error
 */
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2271,6 +2582,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	struct futex_q q = futex_q_init;
 	int res, ret;
 
+	if (uaddr == uaddr2)
+		return -EINVAL;
+
 	if (!bitset)
 		return -EINVAL;
 
@@ -2289,6 +2603,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 * code while we sleep on uaddr.
 	 */
 	debug_rt_mutex_init_waiter(&rt_waiter);
+	RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+	RB_CLEAR_NODE(&rt_waiter.tree_entry);
 	rt_waiter.task = NULL;
 
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2307,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	if (ret)
 		goto out_key2;
 
+	/*
+	 * The check above which compares uaddrs is not sufficient for
+	 * shared futexes. We need to compare the keys:
+	 */
+	if (match_futex(&q.key, &key2)) {
+		ret = -EINVAL;
+		goto out_put_keys;
+	}
+
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
 	futex_wait_queue_me(hb, &q, to);
 
@@ -2342,7 +2667,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
 		 * the pi_state.
 		 */
-		WARN_ON(!&q.pi_state);
+		WARN_ON(!q.pi_state);
 		pi_mutex = &q.pi_state->pi_mutex;
 		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
 		debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2369,7 +2694,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 * fault, unlock the rt_mutex and return the fault to userspace.
 	 */
 	if (ret == -EFAULT) {
-		if (rt_mutex_owner(pi_mutex) == current)
+		if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
 			rt_mutex_unlock(pi_mutex);
 	} else if (ret == -EINTR) {
 		/*
@@ -2443,40 +2768,29 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
-	const struct cred *cred = current_cred(), *pcred;
+	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
+	rcu_read_lock();
+
+	ret = -ESRCH;
 	if (!pid)
-		head = current->robust_list;
+		p = current;
 	else {
-		struct task_struct *p;
-
-		ret = -ESRCH;
-		rcu_read_lock();
 		p = find_task_by_vpid(pid);
 		if (!p)
 			goto err_unlock;
-		ret = -EPERM;
-		pcred = __task_cred(p);
-		/* If victim is in different user_ns, then uids are not
-		   comparable, so we must have CAP_SYS_PTRACE */
-		if (cred->user->user_ns != pcred->user->user_ns) {
-			if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
-				goto err_unlock;
-			goto ok;
-		}
-		/* If victim is in same user_ns, then uids are comparable */
-		if (cred->euid != pcred->euid &&
-		    cred->euid != pcred->uid &&
-		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
-			goto err_unlock;
-ok:
-		head = p->robust_list;
-		rcu_read_unlock();
 	}
 
+	ret = -EPERM;
+	if (!ptrace_may_access(p, PTRACE_MODE_READ))
+		goto err_unlock;
+
+	head = p->robust_list;
+	rcu_read_unlock();
+
 	if (put_user(sizeof(*head), len_ptr))
 		return -EFAULT;
 	return put_user(head, head_ptr);
@@ -2628,7 +2942,7 @@ void exit_robust_list(struct task_struct *curr)
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
		u32 __user *uaddr2, u32 val2, u32 val3)
{
-	int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+	int cmd = op & FUTEX_CMD_MASK;
 	unsigned int flags = 0;
 
 	if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2641,49 +2955,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	}
 
 	switch (cmd) {
+	case FUTEX_LOCK_PI:
+	case FUTEX_UNLOCK_PI:
+	case FUTEX_TRYLOCK_PI:
+	case FUTEX_WAIT_REQUEUE_PI:
+	case FUTEX_CMP_REQUEUE_PI:
+		if (!futex_cmpxchg_enabled)
+			return -ENOSYS;
+	}
+
+	switch (cmd) {
 	case FUTEX_WAIT:
 		val3 = FUTEX_BITSET_MATCH_ANY;
 	case FUTEX_WAIT_BITSET:
-		ret = futex_wait(uaddr, flags, val, timeout, val3);
-		break;
+		return futex_wait(uaddr, flags, val, timeout, val3);
 	case FUTEX_WAKE:
 		val3 = FUTEX_BITSET_MATCH_ANY;
 	case FUTEX_WAKE_BITSET:
-		ret = futex_wake(uaddr, flags, val, val3);
-		break;
+		return futex_wake(uaddr, flags, val, val3);
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
-		break;
+		return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
 	case FUTEX_CMP_REQUEUE:
-		ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
-		break;
+		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
 	case FUTEX_WAKE_OP:
-		ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
-		break;
+		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
 	case FUTEX_LOCK_PI:
-		if (futex_cmpxchg_enabled)
-			ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
-		break;
+		return futex_lock_pi(uaddr, flags, val, timeout, 0);
 	case FUTEX_UNLOCK_PI:
-		if (futex_cmpxchg_enabled)
-			ret = futex_unlock_pi(uaddr, flags);
-		break;
+		return futex_unlock_pi(uaddr, flags);
 	case FUTEX_TRYLOCK_PI:
-		if (futex_cmpxchg_enabled)
-			ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
-		break;
+		return futex_lock_pi(uaddr, flags, 0, timeout, 1);
 	case FUTEX_WAIT_REQUEUE_PI:
 		val3 = FUTEX_BITSET_MATCH_ANY;
-		ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
-					    uaddr2);
-		break;
+		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+					     uaddr2);
 	case FUTEX_CMP_REQUEUE_PI:
-		ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
-		break;
-	default:
-		ret = -ENOSYS;
+		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
 	}
-	return ret;
+	return -ENOSYS;
}
 
 
@@ -2720,10 +3029,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
 
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
	u32 curval;
-	int i;
 
	/*
	 * This will fail and we want it. Some arch implementations do
@@ -2737,8 +3046,31 @@ static int __init futex_init(void)
	 */
	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
		futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+	unsigned int futex_shift;
+	unsigned long i;
+
+#if CONFIG_BASE_SMALL
+	futex_hashsize = 16;
+#else
+	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+					       futex_hashsize, 0,
+					       futex_hashsize < 256 ? HASH_SMALL : 0,
+					       &futex_shift, NULL,
+					       futex_hashsize, futex_hashsize);
+	futex_hashsize = 1UL << futex_shift;
+
+	futex_detect_cmpxchg();
 
-	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+	for (i = 0; i < futex_hashsize; i++) {
+		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
		spin_lock_init(&futex_queues[i].lock);
	}
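
The sketches below are user space C illustrations of the mechanisms this diff touches. They are models with invented names, not kernel code.

The ordering argument in the new header comment is the classic store-buffering pattern: with a full barrier between each side's write and read, both reads observing zero is impossible. A minimal model, assuming C11 seq_cst fences stand in for the kernel's mb() (A)/(B), with waiters playing hb->waiters and futex_val the futex word:

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

static atomic_int waiters;   /* X in the comment: hb->waiters */
static atomic_int futex_val; /* Y in the comment: the futex word */
static int x, y;

static void *waiter_side(void *arg)
{
	(void)arg;
	atomic_store_explicit(&waiters, 1, memory_order_relaxed);   /* w[X]=1 */
	atomic_thread_fence(memory_order_seq_cst);                  /* MB (A) */
	y = atomic_load_explicit(&futex_val, memory_order_relaxed); /* r[Y]=y */
	return NULL;
}

static void *waker_side(void *arg)
{
	(void)arg;
	atomic_store_explicit(&futex_val, 1, memory_order_relaxed); /* w[Y]=1 */
	atomic_thread_fence(memory_order_seq_cst);                  /* MB (B) */
	x = atomic_load_explicit(&waiters, memory_order_relaxed);   /* r[X]=x */
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, waiter_side, NULL);
	pthread_create(&t2, NULL, waker_side, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);

	/* The fences forbid x == 0 && y == 0 on every interleaving. */
	assert(x == 1 || y == 1);
	return 0;
}

Compile with cc -pthread. The assertion never fires, which is exactly the guarantee the comment derives: the waker cannot miss the waiter increment while the waiter also misses the value change.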
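
get_futex_key() builds private keys from (current->mm, address) and shared keys from the backing inode and page offset; user space chooses between the two with FUTEX_PRIVATE_FLAG. A hedged sketch of the raw syscall (the helper name is invented), also showing the value-mismatch case that futex_wait_setup() answers with EAGAIN instead of sleeping:

#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val,
		      const struct timespec *timeout, uint32_t *uaddr2,
		      uint32_t val3)
{
	return syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3);
}

int main(void)
{
	uint32_t word = 1;

	/*
	 * Private futexes skip find_vma/page pinning entirely: the key
	 * is (mm, uaddr). Shared ones (no FUTEX_PRIVATE_FLAG) key on
	 * the backing inode so distinct mappings of one file match.
	 *
	 * Expected value 0 != 1, so the kernel re-checks under the
	 * bucket lock and returns EAGAIN instead of blocking.
	 */
	long r = sys_futex(&word, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0,
			   NULL, NULL, 0);

	return (r == -1 && errno == EAGAIN) ? 0 : 1;
}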
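
The ten-row validity table documented above lookup_pi_state() can be read as a pure decision function. The transcription below is only a model for following the table; every name is local to the sketch (waiter_found/pi_state_found/owner_set stand for the lookup results, utid for the TID field of the user space value, odied for the FUTEX_OWNER_DIED bit):

#include <stdbool.h>

enum state_check { VALID, INVALID };

static enum state_check check_pi_state(bool waiter_found, bool pi_state_found,
				       bool owner_set, unsigned int utid,
				       bool odied, unsigned int owner_tid)
{
	if (!waiter_found)
		return VALID;				/* [1] and [2] */
	if (!pi_state_found)
		return INVALID;				/* [3] */
	if (odied) {
		if (!owner_set)
			return utid ? INVALID : VALID;	/* [5] / [4] */
		if (!utid)
			return VALID;			/* [6] */
	} else {
		if (!owner_set)
			return INVALID;			/* [7] */
		if (!utid)
			return INVALID;			/* [9] */
	}
	return utid == owner_tid ? VALID : INVALID;	/* [8] / [10] */
}

int main(void)
{
	/* [4]: owner died, no pi_state owner, user space TID cleared. */
	return check_pi_state(true, true, false, 0, true, 0) == VALID ? 0 : 1;
}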
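
futex_lock_pi_atomic() polices the user space side of the PI protocol, in which the uncontended fast path is a 0 -> TID cmpxchg that never enters the kernel. A hedged sketch of that protocol (error handling elided, helpers invented):

#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long futex_op(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

void pi_lock(_Atomic uint32_t *f)
{
	uint32_t zero = 0;
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Uncontended: 0 -> TID in user space, kernel not involved. */
	if (atomic_compare_exchange_strong(f, &zero, tid))
		return;
	/* Contended: the kernel sets FUTEX_WAITERS and boosts the owner. */
	futex_op((uint32_t *)f, FUTEX_LOCK_PI, 0);
}

void pi_unlock(_Atomic uint32_t *f)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Uncontended: TID -> 0; fails once FUTEX_WAITERS is set. */
	if (atomic_compare_exchange_strong(f, &tid, 0))
		return;
	futex_op((uint32_t *)f, FUTEX_UNLOCK_PI, 0);
}

The unlock side mirrors the new futex_unlock_pi() fast path: the kernel likewise attempts the TID -> 0 transition only when neither the waiters nor the owner died bit is set (!(uval & ~FUTEX_TID_MASK)).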
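
The non-PI wait/wake paths serve locks like the following three-state mutex (after Drepper's "Futexes Are Tricky"): 0 unlocked, 1 locked, 2 locked with waiters. The waiter announces contention before sleeping, and the waker only enters the kernel when it saw the contended state; this sketch assumes the futex_op() helper from the previous block:

#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long futex_wv(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

void mutex_lock(_Atomic uint32_t *f)
{
	uint32_t c = 0;

	if (atomic_compare_exchange_strong(f, &c, 1))
		return;
	do {
		/* Mark contended, then sleep while the word stays 2. */
		if (c == 2 || atomic_exchange(f, 2) != 0)
			futex_wv((uint32_t *)f,
				 FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 2);
		c = 0;
	} while (!atomic_compare_exchange_strong(f, &c, 2));
}

void mutex_unlock(_Atomic uint32_t *f)
{
	/* Only enter the kernel if someone announced contention. */
	if (atomic_exchange(f, 0) == 2)
		futex_wv((uint32_t *)f, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1);
}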
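
futex_requeue() exists so that a broadcast can move sleepers instead of waking them all to immediately collide on the mutex. A condvar-style broadcast with FUTEX_CMP_REQUEUE might look like the following sketch (glibc's condvars are the real user; names are invented):

#include <limits.h>
#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

_Atomic uint32_t cond_seq;    /* waiters sleep on this word */
_Atomic uint32_t mutex_word;  /* and get requeued onto this one */

void cond_broadcast(void)
{
	uint32_t expected = atomic_load(&cond_seq);

	/*
	 * Wake one waiter and requeue up to INT_MAX others from
	 * cond_seq onto mutex_word. Fails with EAGAIN if cond_seq no
	 * longer holds the expected value - the "CMP" in CMP_REQUEUE.
	 */
	syscall(SYS_futex, &cond_seq,
		FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG,
		1, (unsigned long)INT_MAX, &mutex_word, expected);
}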
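
The queue_lock()/queue_unlock() counter choreography is easiest to see in isolation: the increment must happen before the bucket lock is taken, so a concurrent futex_wake() that reads zero can safely skip the lock, and the error path must back the increment out. A user space model of the CONFIG_SMP behavior (names invented):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int model_waiters;  /* models hb->waiters */
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

void model_queue_lock(void)
{
	/* Increment first: barrier (A) orders it before the value read. */
	atomic_fetch_add(&model_waiters, 1);
	pthread_mutex_lock(&bucket_lock);
}

void model_queue_unlock(void)
{
	/* Error path: we never queued, so undo the accounting. */
	pthread_mutex_unlock(&bucket_lock);
	atomic_fetch_sub(&model_waiters, 1);
}

bool model_wake_fast_path(void)
{
	/* futex_wake(): no waiters accounted, skip the spinlock. */
	return atomic_load(&model_waiters) == 0;
}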
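
FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI pair up as a PI-aware condvar: the waiter sleeps on one word and wakes up owning the PI mutex behind the other, so the rt_mutex always has an owner to boost. A heavily hedged sketch of the two calls (the words must be distinct, per the new -EINVAL checks; wrapper names are invented):

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sleep on *cond (expected value val); wake up owning *pi_mutex. */
long pi_cond_wait(uint32_t *cond, uint32_t val, uint32_t *pi_mutex)
{
	return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI, val,
		       NULL /* no timeout */, pi_mutex, 0);
}

/* Hand *pi_mutex to exactly one waiter, requeue none (a "signal"). */
long pi_cond_signal(uint32_t *cond, uint32_t val, uint32_t *pi_mutex)
{
	return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI, 1,
		       0 /* nr_requeue */, pi_mutex, val);
}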
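
Finally, the boot-time sizing added in futex_init() replaces the fixed 256-bucket table with one that scales with CPU count and is rounded to a power of two, so hash_futex() can mask instead of taking a modulo. Modeled in user space with illustrative numbers:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long cpus, hashsize;

	for (cpus = 1; cpus <= 64; cpus *= 4) {
		hashsize = roundup_pow_of_two(256 * cpus);
		/* hash & (hashsize - 1) == hash % hashsize for 2^n sizes */
		printf("%3lu cpus -> %6lu buckets (mask 0x%lx)\n",
		       cpus, hashsize, hashsize - 1);
	}
	return 0;
}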
