Diffstat (limited to 'kernel/futex.c')
-rw-r--r--	kernel/futex.c	516
1 file changed, 419 insertions(+), 97 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c index c3a1a55a521..b632b5f3f09 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -63,14 +63,117 @@  #include <linux/sched/rt.h>  #include <linux/hugetlb.h>  #include <linux/freezer.h> +#include <linux/bootmem.h>  #include <asm/futex.h> -#include "rtmutex_common.h" +#include "locking/rtmutex_common.h" -int __read_mostly futex_cmpxchg_enabled; +/* + * READ this before attempting to hack on futexes! + * + * Basic futex operation and ordering guarantees + * ============================================= + * + * The waiter reads the futex value in user space and calls + * futex_wait(). This function computes the hash bucket and acquires + * the hash bucket lock. After that it reads the futex user space value + * again and verifies that the data has not changed. If it has not changed + * it enqueues itself into the hash bucket, releases the hash bucket lock + * and schedules. + * + * The waker side modifies the user space value of the futex and calls + * futex_wake(). This function computes the hash bucket and acquires the + * hash bucket lock. Then it looks for waiters on that futex in the hash + * bucket and wakes them. + * + * In futex wake up scenarios where no tasks are blocked on a futex, taking + * the hb spinlock can be avoided and simply return. In order for this + * optimization to work, ordering guarantees must exist so that the waiter + * being added to the list is acknowledged when the list is concurrently being + * checked by the waker, avoiding scenarios like the following: + * + * CPU 0                               CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + *   futex_wait(futex, val); + *   uval = *futex; + *                                     *futex = newval; + *                                     sys_futex(WAKE, futex); + *                                       futex_wake(futex); + *                                       if (queue_empty()) + *                                         return; + *   if (uval == val) + *      lock(hash_bucket(futex)); + *      queue(); + *     unlock(hash_bucket(futex)); + *     schedule(); + * + * This would cause the waiter on CPU 0 to wait forever because it + * missed the transition of the user space value from val to newval + * and the waker did not find the waiter in the hash bucket queue. + * + * The correct serialization ensures that a waiter either observes + * the changed user space value before blocking or is woken by a + * concurrent waker: + * + * CPU 0                                 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + *   futex_wait(futex, val); + * + *   waiters++; (a) + *   mb(); (A) <-- paired with -. 
+ *                              | + *   lock(hash_bucket(futex));  | + *                              | + *   uval = *futex;             | + *                              |        *futex = newval; + *                              |        sys_futex(WAKE, futex); + *                              |          futex_wake(futex); + *                              | + *                              `------->  mb(); (B) + *   if (uval == val) + *     queue(); + *     unlock(hash_bucket(futex)); + *     schedule();                         if (waiters) + *                                           lock(hash_bucket(futex)); + *   else                                    wake_waiters(futex); + *     waiters--; (b)                        unlock(hash_bucket(futex)); + * + * Where (A) orders the waiters increment and the futex value read through + * atomic operations (see hb_waiters_inc) and where (B) orders the write + * to futex and the waiters read -- this is done by the barriers in + * get_futex_key_refs(), through either ihold or atomic_inc, depending on the + * futex type. + * + * This yields the following case (where X:=waiters, Y:=futex): + * + *	X = Y = 0 + * + *	w[X]=1		w[Y]=1 + *	MB		MB + *	r[Y]=y		r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible; which translates back into + * the guarantee that we cannot both miss the futex variable change and the + * enqueue. + * + * Note that a new waiter is accounted for in (a) even when it is possible that + * the wait call can return error, in which case we backtrack from it in (b). + * Refer to the comment in queue_lock(). + * + * Similarly, in order to account for waiters being requeued on another + * address we always increment the waiters for the destination bucket before + * acquiring the lock. It then decrements them again  after releasing it - + * the code that actually moves the futex(es) between hash buckets (requeue_futex) + * will do the additional required waiter count housekeeping. This is done for + * double_lock_hb() and double_unlock_hb(), respectively. + */ -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG +int __read_mostly futex_cmpxchg_enabled; +#endif  /*   * Futex flags used to encode options to functions and preserve them across @@ -147,11 +250,59 @@ static const struct futex_q futex_q_init = {   * waiting on a futex.   */  struct futex_hash_bucket { +	atomic_t waiters;  	spinlock_t lock;  	struct plist_head chain; -}; +} ____cacheline_aligned_in_smp; + +static unsigned long __read_mostly futex_hashsize; + +static struct futex_hash_bucket *futex_queues; + +static inline void futex_get_mm(union futex_key *key) +{ +	atomic_inc(&key->private.mm->mm_count); +	/* +	 * Ensure futex_get_mm() implies a full barrier such that +	 * get_futex_key() implies a full barrier. This is relied upon +	 * as full barrier (B), see the ordering comment above. +	 */ +	smp_mb__after_atomic(); +} + +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void hb_waiters_inc(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP +	atomic_inc(&hb->waiters); +	/* +	 * Full barrier (A), see the ordering comment above. +	 */ +	smp_mb__after_atomic(); +#endif +} -static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths. 
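A userspace model of the guarantee spelled out above may help when reading the barrier comments: the sketch below uses C11 seq_cst atomics and plain pthreads in place of the kernel's MB (A)/(B) and futexes, with X standing for the hash bucket's waiter count and Y for the futex word. It is only an illustration of the store-buffering argument, not kernel code.

/*
 * Userspace model of the waiter/waker ordering above, using C11
 * seq_cst fences in place of the kernel's MB (A) and MB (B).
 * X models the hash-bucket waiter count, Y models the futex value.
 * The property demonstrated: r0 == 0 && r1 == 0 is impossible.
 */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int X, Y;		/* X := waiters, Y := futex value */
static int r0, r1;		/* what each side observed */

static void *waiter(void *arg)
{
	(void)arg;
	atomic_store(&X, 1);				/* waiters++       w[X]=1 */
	atomic_thread_fence(memory_order_seq_cst);	/* MB (A) */
	r0 = atomic_load(&Y);				/* re-read futex   r[Y]   */
	return NULL;
}

static void *waker(void *arg)
{
	(void)arg;
	atomic_store(&Y, 1);				/* *futex = newval w[Y]=1 */
	atomic_thread_fence(memory_order_seq_cst);	/* MB (B) */
	r1 = atomic_load(&X);				/* check waiters   r[X]   */
	return NULL;
}

int main(void)
{
	for (int i = 0; i < 20000; i++) {
		pthread_t a, b;

		atomic_store(&X, 0);
		atomic_store(&Y, 0);
		pthread_create(&a, NULL, waiter, NULL);
		pthread_create(&b, NULL, waker, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);

		/* At least one side must see the other's store. */
		assert(!(r0 == 0 && r1 == 0));
	}
	printf("no lost-wakeup ordering observed\n");
	return 0;
}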
+ */ +static inline void hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP +	atomic_dec(&hb->waiters); +#endif +} + +static inline int hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP +	return atomic_read(&hb->waiters); +#else +	return 1; +#endif +}  /*   * We hash on the keys returned from get_futex_key (see below). @@ -161,7 +312,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)  	u32 hash = jhash2((u32*)&key->both.word,  			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,  			  key->both.offset); -	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; +	return &futex_queues[hash & (futex_hashsize - 1)];  }  /* @@ -187,10 +338,10 @@ static void get_futex_key_refs(union futex_key *key)  	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {  	case FUT_OFF_INODE: -		ihold(key->shared.inode); +		ihold(key->shared.inode); /* implies MB (B) */  		break;  	case FUT_OFF_MMSHARED: -		atomic_inc(&key->private.mm->mm_count); +		futex_get_mm(key); /* implies MB (B) */  		break;  	}  } @@ -251,6 +402,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)  		return -EINVAL;  	address -= key->both.offset; +	if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) +		return -EFAULT; +  	/*  	 * PROCESS_PRIVATE futexes are fast.  	 * As the mm cannot disappear under us and the 'key' only needs @@ -259,11 +413,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)  	 *        but access_ok() should be faster than find_vma()  	 */  	if (!fshared) { -		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) -			return -EFAULT;  		key->private.mm = mm;  		key->private.address = address; -		get_futex_key_refs(key); +		get_futex_key_refs(key);  /* implies MB (B) */  		return 0;  	} @@ -288,7 +440,7 @@ again:  		put_page(page);  		/* serialize against __split_huge_page_splitting() */  		local_irq_disable(); -		if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { +		if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {  			page_head = compound_head(page);  			/*  			 * page_head is valid pointer but we must pin @@ -370,7 +522,7 @@ again:  		key->shared.pgoff = basepage_index(page);  	} -	get_futex_key_refs(key); +	get_futex_key_refs(key); /* implies MB (B) */  out:  	unlock_page(page_head); @@ -591,27 +743,74 @@ void exit_pi_state_list(struct task_struct *curr)  	raw_spin_unlock_irq(&curr->pi_lock);  } +/* + * We need to check the following states: + * + *      Waiter | pi_state | pi->owner | uTID      | uODIED | ? + * + * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid + * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid + * + * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid + * + * [4]  Found  | Found    | NULL      | 0         | 1      | Valid + * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid + * + * [6]  Found  | Found    | task      | 0         | 1      | Valid + * + * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid + * + * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid + * [9]  Found  | Found    | task      | 0         | 0      | Invalid + * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid + * + * [1]	Indicates that the kernel can acquire the futex atomically. We + *	came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * + * [2]	Valid, if TID does not belong to a kernel thread. 
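hash_futex() above now masks the jhash2() result with (futex_hashsize - 1), which is only equivalent to a modulo because the table size is kept a power of two. A standalone sketch of that indexing follows; mix32() is a stand-in mixing function (jhash2() is kernel-internal) and the bucket struct is reduced to a stub.

/*
 * Illustration of power-of-two bucket indexing as used by hash_futex():
 * hash & (size - 1) equals hash % size when size is 2^n.
 * mix32() is an arbitrary stand-in for the kernel's jhash2().
 */
#include <stdint.h>
#include <stdio.h>

#define HASH_SIZE 256u				/* must be a power of two */

struct bucket { unsigned int waiters; };	/* stub for futex_hash_bucket */
static struct bucket buckets[HASH_SIZE];

static uint32_t mix32(uint32_t x)		/* not jhash2: illustrative only */
{
	x ^= x >> 16;
	x *= 0x7feb352d;
	x ^= x >> 15;
	x *= 0x846ca68b;
	x ^= x >> 16;
	return x;
}

static struct bucket *hash_to_bucket(uint32_t key)
{
	uint32_t hash = mix32(key);

	return &buckets[hash & (HASH_SIZE - 1)];	/* same as hash % HASH_SIZE */
}

int main(void)
{
	for (uint32_t key = 0; key < 8; key++)
		printf("key %u -> bucket %td\n", key,
		       hash_to_bucket(key) - buckets);
	return 0;
}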
If no matching + *      thread is found then it indicates that the owner TID has died. + * + * [3]	Invalid. The waiter is queued on a non PI futex + * + * [4]	Valid state after exit_robust_list(), which sets the user space + *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED. + * + * [5]	The user space value got manipulated between exit_robust_list() + *	and exit_pi_state_list() + * + * [6]	Valid state after exit_pi_state_list() which sets the new owner in + *	the pi_state but cannot access the user space value. + * + * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set. + * + * [8]	Owner and user space value match + * + * [9]	There is no transient state which sets the user space TID to 0 + *	except exit_robust_list(), but this is indicated by the + *	FUTEX_OWNER_DIED bit. See [4] + * + * [10] There is no transient state which leaves owner and user space + *	TID out of sync. + */  static int  lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  		union futex_key *key, struct futex_pi_state **ps)  {  	struct futex_pi_state *pi_state = NULL;  	struct futex_q *this, *next; -	struct plist_head *head;  	struct task_struct *p;  	pid_t pid = uval & FUTEX_TID_MASK; -	head = &hb->chain; - -	plist_for_each_entry_safe(this, next, head, list) { +	plist_for_each_entry_safe(this, next, &hb->chain, list) {  		if (match_futex(&this->key, key)) {  			/* -			 * Another waiter already exists - bump up -			 * the refcount and return its pi_state: +			 * Sanity check the waiter before increasing +			 * the refcount and attaching to it.  			 */  			pi_state = this->pi_state;  			/* -			 * Userspace might have messed up non-PI and PI futexes +			 * Userspace might have messed up non-PI and +			 * PI futexes [3]  			 */  			if (unlikely(!pi_state))  				return -EINVAL; @@ -619,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  			WARN_ON(!atomic_read(&pi_state->refcount));  			/* -			 * When pi_state->owner is NULL then the owner died -			 * and another waiter is on the fly. pi_state->owner -			 * is fixed up by the task which acquires -			 * pi_state->rt_mutex. -			 * -			 * We do not check for pid == 0 which can happen when -			 * the owner died and robust_list_exit() cleared the -			 * TID. +			 * Handle the owner died case:  			 */ -			if (pid && pi_state->owner) { +			if (uval & FUTEX_OWNER_DIED) { +				/* +				 * exit_pi_state_list sets owner to NULL and +				 * wakes the topmost waiter. The task which +				 * acquires the pi_state->rt_mutex will fixup +				 * owner. +				 */ +				if (!pi_state->owner) { +					/* +					 * No pi state owner, but the user +					 * space TID is not 0. Inconsistent +					 * state. [5] +					 */ +					if (pid) +						return -EINVAL; +					/* +					 * Take a ref on the state and +					 * return. [4] +					 */ +					goto out_state; +				} +  				/* -				 * Bail out if user space manipulated the -				 * futex value. +				 * If TID is 0, then either the dying owner +				 * has not yet executed exit_pi_state_list() +				 * or some waiter acquired the rtmutex in the +				 * pi state, but did not yet fixup the TID in +				 * user space. +				 * +				 * Take a ref on the state and return. [6]  				 */ -				if (pid != task_pid_vnr(pi_state->owner)) +				if (!pid) +					goto out_state; +			} else { +				/* +				 * If the owner died bit is not set, +				 * then the pi_state must have an +				 * owner. [7] +				 */ +				if (!pi_state->owner)  					return -EINVAL;  			} +			/* +			 * Bail out if user space manipulated the +			 * futex value. 
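The [1]-[10] table above reads naturally as a decision function. The following sketch encodes the same rows with plain booleans and a pid value; it is a hypothetical helper for checking the table against the in-line logic of lookup_pi_state(), not kernel code, and it ignores the "TID belongs to a kernel thread" refinement of [2].

/*
 * Standalone encoding of the lookup_pi_state() validity table [1]-[10].
 * Simplified: booleans instead of kernel pointers, and the kernel-thread
 * refinement of [2] is ignored.
 */
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

static bool pi_state_valid(bool waiter_found, bool have_pi_state,
			   bool have_owner, pid_t owner_tid,
			   pid_t utid, bool owner_died)
{
	if (!waiter_found)
		return true;				/* [1] [2] */
	if (!have_pi_state)
		return false;				/* [3] */
	if (!have_owner)
		return owner_died && utid == 0;		/* [4]; else [5]/[7] */
	if (owner_died && utid == 0)
		return true;				/* [6] */
	return utid == owner_tid;			/* [8]; else [9]/[10] */
}

int main(void)
{
	/* Row [4]: owner died, cleaned up by exit_robust_list(), TID == 0. */
	printf("[4]  -> %d\n", pi_state_valid(true, true, false, 0, 0, true));
	/* Row [10]: user space TID does not match the pi_state owner. */
	printf("[10] -> %d\n", pi_state_valid(true, true, true, 1234, 42, false));
	return 0;
}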
If pi state exists then the +			 * owner TID must be the same as the user +			 * space TID. [9/10] +			 */ +			if (pid != task_pid_vnr(pi_state->owner)) +				return -EINVAL; + +		out_state:  			atomic_inc(&pi_state->refcount);  			*ps = pi_state; -  			return 0;  		}  	}  	/*  	 * We are the first waiter - try to look up the real owner and attach -	 * the new pi_state to it, but bail out when TID = 0 +	 * the new pi_state to it, but bail out when TID = 0 [1]  	 */  	if (!pid)  		return -ESRCH; @@ -654,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  	if (!p)  		return -ESRCH; +	if (!p->mm) { +		put_task_struct(p); +		return -EPERM; +	} +  	/*  	 * We need to look at the task state flags to figure out,  	 * whether the task is exiting. To protect against the do_exit @@ -674,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  		return ret;  	} +	/* +	 * No existing pi state. First waiter. [2] +	 */  	pi_state = alloc_pi_state();  	/* @@ -745,10 +988,18 @@ retry:  		return -EDEADLK;  	/* -	 * Surprise - we got the lock. Just return to userspace: +	 * Surprise - we got the lock, but we do not trust user space at all.  	 */ -	if (unlikely(!curval)) -		return 1; +	if (unlikely(!curval)) { +		/* +		 * We verify whether there is kernel state for this +		 * futex. If not, we can safely assume, that the 0 -> +		 * TID transition is correct. If state exists, we do +		 * not bother to fixup the user space state as it was +		 * corrupted already. +		 */ +		return futex_top_waiter(hb, key) ? -EINVAL : 1; +	}  	uval = curval; @@ -837,6 +1088,7 @@ static void __unqueue_futex(struct futex_q *q)  	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);  	plist_del(&q->list, &hb->chain); +	hb_waiters_dec(hb);  }  /* @@ -878,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)  	struct task_struct *new_owner;  	struct futex_pi_state *pi_state = this->pi_state;  	u32 uninitialized_var(curval), newval; +	int ret = 0;  	if (!pi_state)  		return -EINVAL; @@ -901,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)  		new_owner = this->task;  	/* -	 * We pass it to the next owner. (The WAITERS bit is always -	 * kept enabled while there is PI state around. We must also -	 * preserve the owner died bit.) +	 * We pass it to the next owner. The WAITERS bit is always +	 * kept enabled while there is PI state around. We cleanup the +	 * owner died bit, because we are the owner.  	 
*/ -	if (!(uval & FUTEX_OWNER_DIED)) { -		int ret = 0; - -		newval = FUTEX_WAITERS | task_pid_vnr(new_owner); +	newval = FUTEX_WAITERS | task_pid_vnr(new_owner); -		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) -			ret = -EFAULT; -		else if (curval != uval) -			ret = -EINVAL; -		if (ret) { -			raw_spin_unlock(&pi_state->pi_mutex.wait_lock); -			return ret; -		} +	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) +		ret = -EFAULT; +	else if (curval != uval) +		ret = -EINVAL; +	if (ret) { +		raw_spin_unlock(&pi_state->pi_mutex.wait_lock); +		return ret;  	}  	raw_spin_lock_irq(&pi_state->owner->pi_lock); @@ -985,7 +1234,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)  {  	struct futex_hash_bucket *hb;  	struct futex_q *this, *next; -	struct plist_head *head;  	union futex_key key = FUTEX_KEY_INIT;  	int ret; @@ -997,10 +1245,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)  		goto out;  	hb = hash_futex(&key); + +	/* Make sure we really have tasks to wakeup */ +	if (!hb_waiters_pending(hb)) +		goto out_put_key; +  	spin_lock(&hb->lock); -	head = &hb->chain; -	plist_for_each_entry_safe(this, next, head, list) { +	plist_for_each_entry_safe(this, next, &hb->chain, list) {  		if (match_futex (&this->key, &key)) {  			if (this->pi_state || this->rt_waiter) {  				ret = -EINVAL; @@ -1018,6 +1270,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)  	}  	spin_unlock(&hb->lock); +out_put_key:  	put_futex_key(&key);  out:  	return ret; @@ -1033,7 +1286,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,  {  	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;  	struct futex_hash_bucket *hb1, *hb2; -	struct plist_head *head;  	struct futex_q *this, *next;  	int ret, op_ret; @@ -1081,9 +1333,7 @@ retry_private:  		goto retry;  	} -	head = &hb1->chain; - -	plist_for_each_entry_safe(this, next, head, list) { +	plist_for_each_entry_safe(this, next, &hb1->chain, list) {  		if (match_futex (&this->key, &key1)) {  			if (this->pi_state || this->rt_waiter) {  				ret = -EINVAL; @@ -1096,10 +1346,8 @@ retry_private:  	}  	if (op_ret > 0) { -		head = &hb2->chain; -  		op_ret = 0; -		plist_for_each_entry_safe(this, next, head, list) { +		plist_for_each_entry_safe(this, next, &hb2->chain, list) {  			if (match_futex (&this->key, &key2)) {  				if (this->pi_state || this->rt_waiter) {  					ret = -EINVAL; @@ -1141,7 +1389,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,  	 */  	if (likely(&hb1->chain != &hb2->chain)) {  		plist_del(&q->list, &hb1->chain); +		hb_waiters_dec(hb1);  		plist_add(&q->list, &hb2->chain); +		hb_waiters_inc(hb2);  		q->lock_ptr = &hb2->lock;  	}  	get_futex_key_refs(key2); @@ -1196,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,   *   * Return:   *  0 - failed to acquire the lock atomically; - *  1 - acquired the lock; + * >0 - acquired the lock, return value is vpid of the top_waiter   * <0 - error   */  static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1207,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,  {  	struct futex_q *top_waiter = NULL;  	u32 curval; -	int ret; +	int ret, vpid;  	if (get_futex_value_locked(&curval, pifutex))  		return -EFAULT; @@ -1235,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,  	 * the contended case or if set_waiters is 1.  The pi_state is returned  	 * in ps in contended cases.  	 
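wake_futex_pi() above now always installs FUTEX_WAITERS together with the new owner's TID, which also clears the owner-died bit. The bit layout it manipulates can be shown in a few lines; the constants below match the futex uapi header, while pi_unlock_handoff() is a made-up name for this sketch.

/*
 * Sketch of the PI futex word layout that wake_futex_pi() manipulates.
 * The handoff below mirrors newval = FUTEX_WAITERS | task_pid_vnr(new_owner),
 * which also drops FUTEX_OWNER_DIED since the new owner is alive.
 */
#include <stdint.h>
#include <stdio.h>

#define FUTEX_WAITERS		0x80000000u
#define FUTEX_OWNER_DIED	0x40000000u
#define FUTEX_TID_MASK		0x3fffffffu

static uint32_t pi_unlock_handoff(uint32_t uval, uint32_t new_owner_tid)
{
	(void)uval;	/* the old value only matters for the cmpxchg, not here */
	/* Keep WAITERS set, clear OWNER_DIED, install the new owner's TID. */
	return FUTEX_WAITERS | (new_owner_tid & FUTEX_TID_MASK);
}

int main(void)
{
	/* Old owner (TID 1000) died with a waiter queued. */
	uint32_t uval = FUTEX_OWNER_DIED | FUTEX_WAITERS | 1000;
	uint32_t newval = pi_unlock_handoff(uval, 2000);

	printf("old: 0x%08x (tid %u, waiters %d, owner_died %d)\n",
	       uval, uval & FUTEX_TID_MASK,
	       !!(uval & FUTEX_WAITERS), !!(uval & FUTEX_OWNER_DIED));
	printf("new: 0x%08x (tid %u, waiters %d, owner_died %d)\n",
	       newval, newval & FUTEX_TID_MASK,
	       !!(newval & FUTEX_WAITERS), !!(newval & FUTEX_OWNER_DIED));
	return 0;
}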
*/ +	vpid = task_pid_vnr(top_waiter->task);  	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,  				   set_waiters); -	if (ret == 1) +	if (ret == 1) {  		requeue_pi_wake_futex(top_waiter, key2, hb2); - +		return vpid; +	}  	return ret;  } @@ -1269,12 +1521,17 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,  	int drop_count = 0, task_count = 0, ret;  	struct futex_pi_state *pi_state = NULL;  	struct futex_hash_bucket *hb1, *hb2; -	struct plist_head *head1;  	struct futex_q *this, *next; -	u32 curval2;  	if (requeue_pi) {  		/* +		 * Requeue PI only works on two distinct uaddrs. This +		 * check is only valid for private futexes. See below. +		 */ +		if (uaddr1 == uaddr2) +			return -EINVAL; + +		/*  		 * requeue_pi requires a pi_state, try to allocate it now  		 * without any locks in case it fails.  		 */ @@ -1312,10 +1569,20 @@ retry:  	if (unlikely(ret != 0))  		goto out_put_key1; +	/* +	 * The check above which compares uaddrs is not sufficient for +	 * shared futexes. We need to compare the keys: +	 */ +	if (requeue_pi && match_futex(&key1, &key2)) { +		ret = -EINVAL; +		goto out_put_keys; +	} +  	hb1 = hash_futex(&key1);  	hb2 = hash_futex(&key2);  retry_private: +	hb_waiters_inc(hb2);  	double_lock_hb(hb1, hb2);  	if (likely(cmpval != NULL)) { @@ -1325,6 +1592,7 @@ retry_private:  		if (unlikely(ret)) {  			double_unlock_hb(hb1, hb2); +			hb_waiters_dec(hb2);  			ret = get_user(curval, uaddr1);  			if (ret) @@ -1357,16 +1625,25 @@ retry_private:  		 * At this point the top_waiter has either taken uaddr2 or is  		 * waiting on it.  If the former, then the pi_state will not  		 * exist yet, look it up one more time to ensure we have a -		 * reference to it. +		 * reference to it. If the lock was taken, ret contains the +		 * vpid of the top waiter task.  		 */ -		if (ret == 1) { +		if (ret > 0) {  			WARN_ON(pi_state);  			drop_count++;  			task_count++; -			ret = get_futex_value_locked(&curval2, uaddr2); -			if (!ret) -				ret = lookup_pi_state(curval2, hb2, &key2, -						      &pi_state); +			/* +			 * If we acquired the lock, then the user +			 * space value of uaddr2 should be vpid. It +			 * cannot be changed by the top waiter as it +			 * is blocked on hb2 lock if it tries to do +			 * so. If something fiddled with it behind our +			 * back the pi state lookup might unearth +			 * it. So we rather use the known value than +			 * rereading and handing potential crap to +			 * lookup_pi_state. +			 */ +			ret = lookup_pi_state(ret, hb2, &key2, &pi_state);  		}  		switch (ret) { @@ -1374,6 +1651,7 @@ retry_private:  			break;  		case -EFAULT:  			double_unlock_hb(hb1, hb2); +			hb_waiters_dec(hb2);  			put_futex_key(&key2);  			put_futex_key(&key1);  			ret = fault_in_user_writeable(uaddr2); @@ -1383,6 +1661,7 @@ retry_private:  		case -EAGAIN:  			/* The owner was exiting, try again. */  			double_unlock_hb(hb1, hb2); +			hb_waiters_dec(hb2);  			put_futex_key(&key2);  			put_futex_key(&key1);  			cond_resched(); @@ -1392,8 +1671,7 @@ retry_private:  		}  	} -	head1 = &hb1->chain; -	plist_for_each_entry_safe(this, next, head1, list) { +	plist_for_each_entry_safe(this, next, &hb1->chain, list) {  		if (task_count - nr_wake >= nr_requeue)  			break; @@ -1459,6 +1737,7 @@ retry_private:  out_unlock:  	double_unlock_hb(hb1, hb2); +	hb_waiters_dec(hb2);  	/*  	 * drop_futex_key_refs() must be called outside the spinlocks. 
During @@ -1486,17 +1765,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)  	struct futex_hash_bucket *hb;  	hb = hash_futex(&q->key); + +	/* +	 * Increment the counter before taking the lock so that +	 * a potential waker won't miss a to-be-slept task that is +	 * waiting for the spinlock. This is safe as all queue_lock() +	 * users end up calling queue_me(). Similarly, for housekeeping, +	 * decrement the counter at queue_unlock() when some error has +	 * occurred and we don't end up adding the task to the list. +	 */ +	hb_waiters_inc(hb); +  	q->lock_ptr = &hb->lock; -	spin_lock(&hb->lock); +	spin_lock(&hb->lock); /* implies MB (A) */  	return hb;  }  static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +queue_unlock(struct futex_hash_bucket *hb)  	__releases(&hb->lock)  {  	spin_unlock(&hb->lock); +	hb_waiters_dec(hb);  }  /** @@ -1866,7 +2157,7 @@ retry_private:  	ret = get_futex_value_locked(&uval, uaddr);  	if (ret) { -		queue_unlock(q, *hb); +		queue_unlock(*hb);  		ret = get_user(uval, uaddr);  		if (ret) @@ -1880,7 +2171,7 @@ retry_private:  	}  	if (uval != val) { -		queue_unlock(q, *hb); +		queue_unlock(*hb);  		ret = -EWOULDBLOCK;  	} @@ -2028,7 +2319,7 @@ retry_private:  			 * Task is exiting and we just wait for the  			 * exit to complete.  			 */ -			queue_unlock(&q, hb); +			queue_unlock(hb);  			put_futex_key(&q.key);  			cond_resched();  			goto retry; @@ -2080,7 +2371,7 @@ retry_private:  	goto out_put_key;  out_unlock_put_key: -	queue_unlock(&q, hb); +	queue_unlock(hb);  out_put_key:  	put_futex_key(&q.key); @@ -2090,7 +2381,7 @@ out:  	return ret != -EINTR ? ret : -ERESTARTNOINTR;  uaddr_faulted: -	queue_unlock(&q, hb); +	queue_unlock(hb);  	ret = fault_in_user_writeable(uaddr);  	if (ret) @@ -2112,7 +2403,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)  {  	struct futex_hash_bucket *hb;  	struct futex_q *this, *next; -	struct plist_head *head;  	union futex_key key = FUTEX_KEY_INIT;  	u32 uval, vpid = task_pid_vnr(current);  	int ret; @@ -2136,9 +2426,10 @@ retry:  	/*  	 * To avoid races, try to do the TID -> 0 atomic transition  	 * again. If it succeeds then we can return without waking -	 * anyone else up: +	 * anyone else up. We only try this if neither the waiters nor +	 * the owner died bit are set.  	 */ -	if (!(uval & FUTEX_OWNER_DIED) && +	if (!(uval & ~FUTEX_TID_MASK) &&  	    cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))  		goto pi_faulted;  	/* @@ -2152,9 +2443,7 @@ retry:  	 * Ok, other tasks may need to be woken up - check waiters  	 * and do the wakeup if necessary:  	 */ -	head = &hb->chain; - -	plist_for_each_entry_safe(this, next, head, list) { +	plist_for_each_entry_safe(this, next, &hb->chain, list) {  		if (!match_futex (&this->key, &key))  			continue;  		ret = wake_futex_pi(uaddr, uval, this); @@ -2170,11 +2459,9 @@ retry:  	/*  	 * No waiters - kernel unlocks the futex:  	 */ -	if (!(uval & FUTEX_OWNER_DIED)) { -		ret = unlock_futex_pi(uaddr, uval); -		if (ret == -EFAULT) -			goto pi_faulted; -	} +	ret = unlock_futex_pi(uaddr, uval); +	if (ret == -EFAULT) +		goto pi_faulted;  out_unlock:  	spin_unlock(&hb->lock); @@ -2231,6 +2518,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,  		 * Unqueue the futex_q and determine which it was.  		 
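The queue_lock()/queue_unlock() comment above defines the counter discipline: announce the waiter before taking the bucket lock, and take the announcement back on every path that gives up without enqueueing, so the waker-side hb_waiters_pending() fast path stays safe. Below is a userspace model of that discipline, with invented names and a pthread mutex plus a C11 atomic standing in for hb->lock and hb->waiters; it is not the kernel's code.

/*
 * Userspace model of the hb->waiters discipline: the waiter increments
 * the counter before taking the bucket lock (queue_lock) and decrements
 * it again if it backs out without enqueueing (queue_unlock); the waker
 * may skip the lock entirely when the counter reads zero.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct bucket {
	atomic_int waiters;
	pthread_mutex_t lock;
	int queued;			/* stand-in for the plist */
};

static void bucket_queue_lock(struct bucket *hb)
{
	atomic_fetch_add(&hb->waiters, 1);	/* before the lock: MB (A) role */
	pthread_mutex_lock(&hb->lock);
}

static void bucket_queue_unlock(struct bucket *hb)	/* error path: no enqueue */
{
	pthread_mutex_unlock(&hb->lock);
	atomic_fetch_sub(&hb->waiters, 1);
}

static bool bucket_wake(struct bucket *hb)
{
	if (atomic_load(&hb->waiters) == 0)	/* fast path: nothing to do */
		return false;
	pthread_mutex_lock(&hb->lock);
	/* ... walk the queue and wake matching waiters ... */
	pthread_mutex_unlock(&hb->lock);
	return true;
}

int main(void)
{
	static struct bucket hb = { .lock = PTHREAD_MUTEX_INITIALIZER };

	printf("wake, no waiter announced:    woke=%d\n", bucket_wake(&hb));

	bucket_queue_lock(&hb);			/* waiter announces itself */
	printf("waiters while enqueueing:     %d\n", atomic_load(&hb.waiters));
	bucket_queue_unlock(&hb);		/* value changed, back out */

	printf("wake after waiter backed out: woke=%d\n", bucket_wake(&hb));
	return 0;
}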
*/  		plist_del(&q->list, &hb->chain); +		hb_waiters_dec(hb);  		/* Handle spurious wakeups gracefully */  		ret = -EWOULDBLOCK; @@ -2315,6 +2603,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  	 * code while we sleep on uaddr.  	 */  	debug_rt_mutex_init_waiter(&rt_waiter); +	RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); +	RB_CLEAR_NODE(&rt_waiter.tree_entry);  	rt_waiter.task = NULL;  	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); @@ -2333,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  	if (ret)  		goto out_key2; +	/* +	 * The check above which compares uaddrs is not sufficient for +	 * shared futexes. We need to compare the keys: +	 */ +	if (match_futex(&q.key, &key2)) { +		ret = -EINVAL; +		goto out_put_keys; +	} +  	/* Queue the futex_q, drop the hb lock, wait for wakeup. */  	futex_wait_queue_me(hb, &q, to); @@ -2730,10 +3029,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,  	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);  } -static int __init futex_init(void) +static void __init futex_detect_cmpxchg(void)  { +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG  	u32 curval; -	int i;  	/*  	 * This will fail and we want it. Some arch implementations do @@ -2747,8 +3046,31 @@ static int __init futex_init(void)  	 */  	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)  		futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ +	unsigned int futex_shift; +	unsigned long i; + +#if CONFIG_BASE_SMALL +	futex_hashsize = 16; +#else +	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +#endif + +	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), +					       futex_hashsize, 0, +					       futex_hashsize < 256 ? HASH_SMALL : 0, +					       &futex_shift, NULL, +					       futex_hashsize, futex_hashsize); +	futex_hashsize = 1UL << futex_shift; + +	futex_detect_cmpxchg(); -	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { +	for (i = 0; i < futex_hashsize; i++) { +		atomic_set(&futex_queues[i].waiters, 0);  		plist_head_init(&futex_queues[i].chain);  		spin_lock_init(&futex_queues[i].lock);  	}  | 
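futex_init() now sizes the global hash from the possible CPU count: 256 buckets per CPU rounded up to a power of two (16 on CONFIG_BASE_SMALL), and alloc_large_system_hash() may still adjust the final size, which is why futex_hashsize is re-read from futex_shift. A quick arithmetic sketch of the default sizing, with roundup_pow_of_two() reimplemented here since it is a kernel helper:

/*
 * Sketch of the default futex hash sizing in futex_init():
 * 256 buckets per possible CPU, rounded up to a power of two.
 * In the kernel the final size may still be adjusted by
 * alloc_large_system_hash().
 */
#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	const unsigned int cpus[] = { 1, 4, 6, 64 };

	for (unsigned int i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
		printf("%3u possible CPUs -> %6lu buckets\n", cpus[i],
		       roundup_pow_of_two(256UL * cpus[i]));
	/* CONFIG_BASE_SMALL systems simply use 16 buckets. */
	return 0;
}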
