aboutsummaryrefslogtreecommitdiff
path: root/kernel/futex.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--kernel/futex.c732
1 files changed, 532 insertions, 200 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be20173..b632b5f3f09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -59,14 +59,121 @@
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
+#include <linux/freezer.h>
+#include <linux/bootmem.h>
#include <asm/futex.h>
-#include "rtmutex_common.h"
+#include "locking/rtmutex_common.h"
-int __read_mostly futex_cmpxchg_enabled;
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `-------> mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers in
+ * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
+ * futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
+int __read_mostly futex_cmpxchg_enabled;
+#endif
/*
* Futex flags used to encode options to functions and preserve them across
@@ -143,11 +250,59 @@ static const struct futex_q futex_q_init = {
* waiting on a futex.
*/
struct futex_hash_bucket {
+ atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
+
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static inline void futex_get_mm(union futex_key *key)
+{
+ atomic_inc(&key->private.mm->mm_count);
+ /*
+ * Ensure futex_get_mm() implies a full barrier such that
+ * get_futex_key() implies a full barrier. This is relied upon
+ * as full barrier (B), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
/*
* We hash on the keys returned from get_futex_key (see below).
@@ -157,7 +312,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
u32 hash = jhash2((u32*)&key->both.word,
(sizeof(key->both.word)+sizeof(key->both.ptr))/4,
key->both.offset);
- return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+ return &futex_queues[hash & (futex_hashsize - 1)];
}
/*
@@ -183,10 +338,10 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode);
+ ihold(key->shared.inode); /* implies MB (B) */
break;
case FUT_OFF_MMSHARED:
- atomic_inc(&key->private.mm->mm_count);
+ futex_get_mm(key); /* implies MB (B) */
break;
}
}
@@ -221,10 +376,11 @@ static void drop_futex_key_refs(union futex_key *key)
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
- * Returns a negative error code or 0
+ * Return: a negative error code or 0
+ *
* The key words are stored in *key on success.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
@@ -246,6 +402,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
return -EINVAL;
address -= key->both.offset;
+ if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -254,11 +413,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
- return -EFAULT;
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
return 0;
}
@@ -283,7 +440,7 @@ again:
put_page(page);
/* serialize against __split_huge_page_splitting() */
local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+ if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
page_head = compound_head(page);
/*
* page_head is valid pointer but we must pin
@@ -362,10 +519,10 @@ again:
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = page_head->mapping->host;
- key->shared.pgoff = page_head->index;
+ key->shared.pgoff = basepage_index(page);
}
- get_futex_key_refs(key);
+ get_futex_key_refs(key); /* implies MB (B) */
out:
unlock_page(page_head);
@@ -586,27 +743,74 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
}
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync.
+ */
static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct plist_head *head;
struct task_struct *p;
pid_t pid = uval & FUTEX_TID_MASK;
- head = &hb->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (match_futex(&this->key, key)) {
/*
- * Another waiter already exists - bump up
- * the refcount and return its pi_state:
+ * Sanity check the waiter before increasing
+ * the refcount and attaching to it.
*/
pi_state = this->pi_state;
/*
- * Userspace might have messed up non-PI and PI futexes
+ * Userspace might have messed up non-PI and
+ * PI futexes [3]
*/
if (unlikely(!pi_state))
return -EINVAL;
@@ -614,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
WARN_ON(!atomic_read(&pi_state->refcount));
/*
- * When pi_state->owner is NULL then the owner died
- * and another waiter is on the fly. pi_state->owner
- * is fixed up by the task which acquires
- * pi_state->rt_mutex.
- *
- * We do not check for pid == 0 which can happen when
- * the owner died and robust_list_exit() cleared the
- * TID.
+ * Handle the owner died case:
*/
- if (pid && pi_state->owner) {
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and
+ * wakes the topmost waiter. The task which
+ * acquires the pi_state->rt_mutex will fixup
+ * owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user
+ * space TID is not 0. Inconsistent
+ * state. [5]
+ */
+ if (pid)
+ return -EINVAL;
+ /*
+ * Take a ref on the state and
+ * return. [4]
+ */
+ goto out_state;
+ }
+
+ /*
+ * If TID is 0, then either the dying owner
+ * has not yet executed exit_pi_state_list()
+ * or some waiter acquired the rtmutex in the
+ * pi state, but did not yet fixup the TID in
+ * user space.
+ *
+ * Take a ref on the state and return. [6]
+ */
+ if (!pid)
+ goto out_state;
+ } else {
/*
- * Bail out if user space manipulated the
- * futex value.
+ * If the owner died bit is not set,
+ * then the pi_state must have an
+ * owner. [7]
*/
- if (pid != task_pid_vnr(pi_state->owner))
+ if (!pi_state->owner)
return -EINVAL;
}
+ /*
+ * Bail out if user space manipulated the
+ * futex value. If pi state exists then the
+ * owner TID must be the same as the user
+ * space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+
+ out_state:
atomic_inc(&pi_state->refcount);
*ps = pi_state;
-
return 0;
}
}
/*
* We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0
+ * the new pi_state to it, but bail out when TID = 0 [1]
*/
if (!pid)
return -ESRCH;
@@ -649,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!p)
return -ESRCH;
+ if (!p->mm) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
/*
* We need to look at the task state flags to figure out,
* whether the task is exiting. To protect against the do_exit
@@ -669,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return ret;
}
+ /*
+ * No existing pi state. First waiter. [2]
+ */
pi_state = alloc_pi_state();
/*
@@ -703,9 +951,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* be "current" except in the case of requeue pi.
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
- * Returns:
- * 0 - ready to wait
- * 1 - acquired the lock
+ * Return:
+ * 0 - ready to wait;
+ * 1 - acquired the lock;
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
@@ -715,7 +963,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct futex_pi_state **ps,
struct task_struct *task, int set_waiters)
{
- int lock_taken, ret, ownerdied = 0;
+ int lock_taken, ret, force_take = 0;
u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
@@ -740,10 +988,18 @@ retry:
return -EDEADLK;
/*
- * Surprise - we got the lock. Just return to userspace:
+ * Surprise - we got the lock, but we do not trust user space at all.
*/
- if (unlikely(!curval))
- return 1;
+ if (unlikely(!curval)) {
+ /*
+ * We verify whether there is kernel state for this
+ * futex. If not, we can safely assume, that the 0 ->
+ * TID transition is correct. If state exists, we do
+ * not bother to fixup the user space state as it was
+ * corrupted already.
+ */
+ return futex_top_waiter(hb, key) ? -EINVAL : 1;
+ }
uval = curval;
@@ -754,17 +1010,15 @@ retry:
newval = curval | FUTEX_WAITERS;
/*
- * There are two cases, where a futex might have no owner (the
- * owner TID is 0): OWNER_DIED. We take over the futex in this
- * case. We also do an unconditional take over, when the owner
- * of the futex died.
- *
- * This is safe as we are protected by the hash bucket lock !
+ * Should we force take the futex? See below.
*/
- if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
- /* Keep the OWNER_DIED bit */
+ if (unlikely(force_take)) {
+ /*
+ * Keep the OWNER_DIED and the WAITERS bit and set the
+ * new TID value.
+ */
newval = (curval & ~FUTEX_TID_MASK) | vpid;
- ownerdied = 0;
+ force_take = 0;
lock_taken = 1;
}
@@ -774,7 +1028,7 @@ retry:
goto retry;
/*
- * We took the lock due to owner died take over.
+ * We took the lock due to forced take over.
*/
if (unlikely(lock_taken))
return 1;
@@ -789,20 +1043,25 @@ retry:
switch (ret) {
case -ESRCH:
/*
- * No owner found for this futex. Check if the
- * OWNER_DIED bit is set to figure out whether
- * this is a robust futex or not.
+ * We failed to find an owner for this
+ * futex. So we have no pi_state to block
+ * on. This can happen in two cases:
+ *
+ * 1) The owner died
+ * 2) A stale FUTEX_WAITERS bit
+ *
+ * Re-read the futex value.
*/
if (get_futex_value_locked(&curval, uaddr))
return -EFAULT;
/*
- * We simply start over in case of a robust
- * futex. The code above will take the futex
- * and return happy.
+ * If the owner died or we have a stale
+ * WAITERS bit the owner TID in the user space
+ * futex is 0.
*/
- if (curval & FUTEX_OWNER_DIED) {
- ownerdied = 1;
+ if (!(curval & FUTEX_TID_MASK)) {
+ force_take = 1;
goto retry;
}
default:
@@ -829,6 +1088,7 @@ static void __unqueue_futex(struct futex_q *q)
hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
}
/*
@@ -839,6 +1099,9 @@ static void wake_futex(struct futex_q *q)
{
struct task_struct *p = q->task;
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
* a non-futex wake up happens on another CPU then the task
@@ -867,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ int ret = 0;
if (!pi_state)
return -EINVAL;
@@ -890,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
new_owner = this->task;
/*
- * We pass it to the next owner. (The WAITERS bit is always
- * kept enabled while there is PI state around. We must also
- * preserve the owner died bit.)
+ * We pass it to the next owner. The WAITERS bit is always
+ * kept enabled while there is PI state around. We cleanup the
+ * owner died bit, because we are the owner.
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- int ret = 0;
-
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
- ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
- if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ ret = -EFAULT;
+ else if (curval != uval)
+ ret = -EINVAL;
+ if (ret) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -974,7 +1234,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
int ret;
@@ -986,10 +1245,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
goto out;
hb = hash_futex(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!hb_waiters_pending(hb))
+ goto out_put_key;
+
spin_lock(&hb->lock);
- head = &hb->chain;
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
@@ -1007,6 +1270,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
}
spin_unlock(&hb->lock);
+out_put_key:
put_futex_key(&key);
out:
return ret;
@@ -1022,7 +1286,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
- struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret;
@@ -1070,10 +1333,12 @@ retry_private:
goto retry;
}
- head = &hb1->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (match_futex (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++ret >= nr_wake)
break;
@@ -1081,11 +1346,13 @@ retry_private:
}
if (op_ret > 0) {
- head = &hb2->chain;
-
op_ret = 0;
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
if (match_futex (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++op_ret >= nr_wake2)
break;
@@ -1094,6 +1361,7 @@ retry_private:
ret += op_ret;
}
+out_unlock:
double_unlock_hb(hb1, hb2);
out_put_keys:
put_futex_key(&key2);
@@ -1121,7 +1389,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
*/
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
+ hb_waiters_dec(hb1);
plist_add(&q->list, &hb2->chain);
+ hb_waiters_inc(hb2);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -1174,9 +1444,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
- * Returns:
- * 0 - failed to acquire the lock atomicly
- * 1 - acquired the lock
+ * Return:
+ * 0 - failed to acquire the lock atomically;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1187,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret;
+ int ret, vpid;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -1215,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* the contended case or if set_waiters is 1. The pi_state is returned
* in ps in contended cases.
*/
+ vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
- if (ret == 1)
+ if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+ return vpid;
+ }
return ret;
}
@@ -1237,8 +1509,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
*
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
* <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1249,12 +1521,17 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
int drop_count = 0, task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
- struct plist_head *head1;
struct futex_q *this, *next;
- u32 curval2;
if (requeue_pi) {
/*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
* requeue_pi requires a pi_state, try to allocate it now
* without any locks in case it fails.
*/
@@ -1292,10 +1569,20 @@ retry:
if (unlikely(ret != 0))
goto out_put_key1;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && match_futex(&key1, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
retry_private:
+ hb_waiters_inc(hb2);
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -1305,6 +1592,7 @@ retry_private:
if (unlikely(ret)) {
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
ret = get_user(curval, uaddr1);
if (ret)
@@ -1337,16 +1625,25 @@ retry_private:
* At this point the top_waiter has either taken uaddr2 or is
* waiting on it. If the former, then the pi_state will not
* exist yet, look it up one more time to ensure we have a
- * reference to it.
+ * reference to it. If the lock was taken, ret contains the
+ * vpid of the top waiter task.
*/
- if (ret == 1) {
+ if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
- ret = get_futex_value_locked(&curval2, uaddr2);
- if (!ret)
- ret = lookup_pi_state(curval2, hb2, &key2,
- &pi_state);
+ /*
+ * If we acquired the lock, then the user
+ * space value of uaddr2 should be vpid. It
+ * cannot be changed by the top waiter as it
+ * is blocked on hb2 lock if it tries to do
+ * so. If something fiddled with it behind our
+ * back the pi state lookup might unearth
+ * it. So we rather use the known value than
+ * rereading and handing potential crap to
+ * lookup_pi_state.
+ */
+ ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1354,6 +1651,7 @@ retry_private:
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
@@ -1363,6 +1661,7 @@ retry_private:
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
cond_resched();
@@ -1372,8 +1671,7 @@ retry_private:
}
}
- head1 = &hb1->chain;
- plist_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
if (task_count - nr_wake >= nr_requeue)
break;
@@ -1383,9 +1681,13 @@ retry_private:
/*
* FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
* be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
*/
if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter)) {
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
ret = -EINVAL;
break;
}
@@ -1435,6 +1737,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
/*
* drop_futex_key_refs() must be called outside the spinlocks. During
@@ -1462,17 +1765,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
struct futex_hash_bucket *hb;
hb = hash_futex(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all queue_lock()
+ * users end up calling queue_me(). Similarly, for housekeeping,
+ * decrement the counter at queue_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ hb_waiters_inc(hb);
+
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock);
+ spin_lock(&hb->lock); /* implies MB (A) */
return hb;
}
static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
spin_unlock(&hb->lock);
+ hb_waiters_dec(hb);
}
/**
@@ -1515,8 +1830,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
- * Returns:
- * 1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ * 1 - if the futex_q was still queued (and we removed unqueued it);
* 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
@@ -1686,9 +2001,9 @@ static long futex_wait_restart(struct restart_block *restart);
* the pi_state owner as well as handle race conditions that may allow us to
* acquire the lock. Must be called with the hb lock held.
*
- * Returns:
- * 1 - success, lock taken
- * 0 - success, lock not taken
+ * Return:
+ * 1 - success, lock taken;
+ * 0 - success, lock not taken;
* <0 - on error (-EFAULT)
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1785,7 +2100,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
- schedule();
+ freezable_schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -1803,8 +2118,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* Return with the hb lock held and a q.key reference on success, and unlocked
* with no q.key reference on failure.
*
- * Returns:
- * 0 - uaddr contains val and hb has been locked
+ * Return:
+ * 0 - uaddr contains val and hb has been locked;
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -1842,7 +2157,7 @@ retry_private:
ret = get_futex_value_locked(&uval, uaddr);
if (ret) {
- queue_unlock(q, *hb);
+ queue_unlock(*hb);
ret = get_user(uval, uaddr);
if (ret)
@@ -1856,7 +2171,7 @@ retry_private:
}
if (uval != val) {
- queue_unlock(q, *hb);
+ queue_unlock(*hb);
ret = -EWOULDBLOCK;
}
@@ -2004,7 +2319,7 @@ retry_private:
* Task is exiting and we just wait for the
* exit to complete.
*/
- queue_unlock(&q, hb);
+ queue_unlock(hb);
put_futex_key(&q.key);
cond_resched();
goto retry;
@@ -2056,7 +2371,7 @@ retry_private:
goto out_put_key;
out_unlock_put_key:
- queue_unlock(&q, hb);
+ queue_unlock(hb);
out_put_key:
put_futex_key(&q.key);
@@ -2066,7 +2381,7 @@ out:
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
- queue_unlock(&q, hb);
+ queue_unlock(hb);
ret = fault_in_user_writeable(uaddr);
if (ret)
@@ -2088,7 +2403,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
u32 uval, vpid = task_pid_vnr(current);
int ret;
@@ -2112,9 +2426,10 @@ retry:
/*
* To avoid races, try to do the TID -> 0 atomic transition
* again. If it succeeds then we can return without waking
- * anyone else up:
+ * anyone else up. We only try this if neither the waiters nor
+ * the owner died bit are set.
*/
- if (!(uval & FUTEX_OWNER_DIED) &&
+ if (!(uval & ~FUTEX_TID_MASK) &&
cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
@@ -2128,9 +2443,7 @@ retry:
* Ok, other tasks may need to be woken up - check waiters
* and do the wakeup if necessary:
*/
- head = &hb->chain;
-
- plist_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -2146,11 +2459,9 @@ retry:
/*
* No waiters - kernel unlocks the futex:
*/
- if (!(uval & FUTEX_OWNER_DIED)) {
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
- goto pi_faulted;
- }
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
out_unlock:
spin_unlock(&hb->lock);
@@ -2182,9 +2493,9 @@ pi_faulted:
* the wakeup and return the appropriate error code to the caller. Must be
* called with the hb lock held.
*
- * Returns
- * 0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ * 0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2207,6 +2518,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* Unqueue the futex_q and determine which it was.
*/
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2226,18 +2538,17 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue
* 3) signal
@@ -2255,8 +2566,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
- * Returns:
- * 0 - On success
+ * Return:
+ * 0 - On success;
* <0 - On error
*/
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2271,6 +2582,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
if (!bitset)
return -EINVAL;
@@ -2289,6 +2603,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* code while we sleep on uaddr.
*/
debug_rt_mutex_init_waiter(&rt_waiter);
+ RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+ RB_CLEAR_NODE(&rt_waiter.tree_entry);
rt_waiter.task = NULL;
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2307,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (ret)
goto out_key2;
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (match_futex(&q.key, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
@@ -2342,7 +2667,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
* the pi_state.
*/
- WARN_ON(!&q.pi_state);
+ WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2369,7 +2694,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* fault, unlock the rt_mutex and return the fault to userspace.
*/
if (ret == -EFAULT) {
- if (rt_mutex_owner(pi_mutex) == current)
+ if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
@@ -2443,40 +2768,29 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
{
struct robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- rcu_read_lock();
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- /* If victim is in different user_ns, then uids are not
- comparable, so we must have CAP_SYS_PTRACE */
- if (cred->user->user_ns != pcred->user->user_ns) {
- if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
- goto ok;
- }
- /* If victim is in same user_ns, then uids are comparable */
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
-ok:
- head = p->robust_list;
- rcu_read_unlock();
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
@@ -2628,7 +2942,7 @@ void exit_robust_list(struct task_struct *curr)
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+ int cmd = op & FUTEX_CMD_MASK;
unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2641,49 +2955,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
}
switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_UNLOCK_PI:
+ case FUTEX_TRYLOCK_PI:
+ case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_CMP_REQUEUE_PI:
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, flags, val, timeout, val3);
- break;
+ return futex_wait(uaddr, flags, val, timeout, val3);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAKE_BITSET:
- ret = futex_wake(uaddr, flags, val, val3);
- break;
+ return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
- break;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
- break;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
- break;
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
- break;
+ return futex_lock_pi(uaddr, flags, val, timeout, 0);
case FUTEX_UNLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_unlock_pi(uaddr, flags);
- break;
+ return futex_unlock_pi(uaddr, flags);
case FUTEX_TRYLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
- break;
+ return futex_lock_pi(uaddr, flags, 0, timeout, 1);
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
- ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
- uaddr2);
- break;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
- break;
- default:
- ret = -ENOSYS;
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
}
- return ret;
+ return -ENOSYS;
}
@@ -2720,10 +3029,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
u32 curval;
- int i;
/*
* This will fail and we want it. Some arch implementations do
@@ -2737,8 +3046,31 @@ static int __init futex_init(void)
*/
if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+ unsigned int futex_shift;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0,
+ futex_hashsize < 256 ? HASH_SMALL : 0,
+ &futex_shift, NULL,
+ futex_hashsize, futex_hashsize);
+ futex_hashsize = 1UL << futex_shift;
+
+ futex_detect_cmpxchg();
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}