diff options
Diffstat (limited to 'kernel/locking')
| -rw-r--r-- | kernel/locking/Makefile | 5 | ||||
| -rw-r--r-- | kernel/locking/lockdep.c | 23 | ||||
| -rw-r--r-- | kernel/locking/lockdep_internals.h | 6 | ||||
| -rw-r--r-- | kernel/locking/locktorture.c | 454 | ||||
| -rw-r--r-- | kernel/locking/mcs_spinlock.c | 210 | ||||
| -rw-r--r-- | kernel/locking/mcs_spinlock.h | 130 | ||||
| -rw-r--r-- | kernel/locking/mutex-debug.c | 23 | ||||
| -rw-r--r-- | kernel/locking/mutex.c | 104 | ||||
| -rw-r--r-- | kernel/locking/qrwlock.c | 133 | ||||
| -rw-r--r-- | kernel/locking/rtmutex-debug.h | 5 | ||||
| -rw-r--r-- | kernel/locking/rtmutex.c | 285 | ||||
| -rw-r--r-- | kernel/locking/rtmutex.h | 5 | ||||
| -rw-r--r-- | kernel/locking/rwsem-spinlock.c | 28 | ||||
| -rw-r--r-- | kernel/locking/rwsem-xadd.c | 278 | ||||
| -rw-r--r-- | kernel/locking/rwsem.c | 31 |
15 files changed, 1545 insertions, 175 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f6..8541bfdfd23 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o +obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o @@ -23,3 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o +obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa..d24e4339b46 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; + hlock = curr->held_locks + depth - 1; /* * Only non-recursive-read entries get new dependencies * added: */ - if (hlock->read != 2) { + if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, distance, trylock_loop)) return 0; @@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ - if (!hlock->trylock && (hlock->check == 2) && + if (!hlock->trylock && hlock->check && lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: @@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); - if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) + if (!hlock->check) continue; if (!mark_lock(curr, hlock, usage_bit)) @@ -2557,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -void trace_hardirqs_on_caller(unsigned long ip) +__visible void trace_hardirqs_on_caller(unsigned long ip) { time_hardirqs_on(CALLER_ADDR0, ip); @@ -2610,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off_caller(unsigned long ip) +__visible void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; @@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; - if (!prove_locking) - check = 1; - if (unlikely(!debug_locks)) return 0; @@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (lock->key == &__lockdep_no_validate__) - check = 1; + if (!prove_locking || lock->key == &__lockdep_no_validate__) + check = 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->holdtime_stamp = lockstat_clock(); #endif - if (check == 2 && !mark_irqflags(curr, hlock)) + if (check && !mark_irqflags(curr, hlock)) return 0; /* mark it as used: */ @@ -4191,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); -void lockdep_sys_exit(void) +asmlinkage __visible void lockdep_sys_exit(void) { struct task_struct *curr = current; diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8..51c4b24b632 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_ENTRIES 32768UL -#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_LOCKDEP_CHAINS_BITS 16 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -65,7 +65,7 @@ enum { * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ -#define MAX_STACK_TRACE_ENTRIES 262144UL +#define MAX_STACK_TRACE_ENTRIES 524288UL extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c new file mode 100644 index 00000000000..0955b885d0d --- /dev/null +++ b/kernel/locking/locktorture.c @@ -0,0 +1,454 @@ +/* + * Module-based torture test facility for locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Based on kernel/rcu/torture.c. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/trace_clock.h> +#include <asm/byteorder.h> +#include <linux/torture.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); + +torture_param(int, nwriters_stress, -1, + "Number of write-locking stress-test threads"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, + "Number of jiffies between shuffles, 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); + +static char *torture_type = "spin_lock"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, + "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); + +static atomic_t n_lock_torture_errors; + +static struct task_struct *stats_task; +static struct task_struct **writer_tasks; + +static int nrealwriters_stress; +static bool lock_is_write_held; + +struct lock_writer_stress_stats { + long n_write_lock_fail; + long n_write_lock_acquired; +}; +static struct lock_writer_stress_stats *lwsa; + +#if defined(MODULE) +#define LOCKTORTURE_RUNNABLE_INIT 1 +#else +#define LOCKTORTURE_RUNNABLE_INIT 0 +#endif +int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(locktorture_runnable, int, 0444); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); + +/* Forward reference. */ +static void lock_torture_cleanup(void); + +/* + * Operations vector for selecting different types of tests. + */ +struct lock_torture_ops { + void (*init)(void); + int (*writelock)(void); + void (*write_delay)(struct torture_random_state *trsp); + void (*writeunlock)(void); + unsigned long flags; + const char *name; +}; + +static struct lock_torture_ops *cur_ops; + +/* + * Definitions for lock torture testing. + */ + +static int torture_lock_busted_write_lock(void) +{ + return 0; /* BUGGY, do not use in real life!!! */ +} + +static void torture_lock_busted_write_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_us = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_lock_busted_write_unlock(void) +{ + /* BUGGY, do not use in real life!!! */ +} + +static struct lock_torture_ops lock_busted_ops = { + .writelock = torture_lock_busted_write_lock, + .write_delay = torture_lock_busted_write_delay, + .writeunlock = torture_lock_busted_write_unlock, + .name = "lock_busted" +}; + +static DEFINE_SPINLOCK(torture_spinlock); + +static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +{ + spin_lock(&torture_spinlock); + return 0; +} + +static void torture_spin_lock_write_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_us = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); + if (!(torture_random(trsp) % + (nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +{ + spin_unlock(&torture_spinlock); +} + +static struct lock_torture_ops spin_lock_ops = { + .writelock = torture_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_spin_lock_write_unlock, + .name = "spin_lock" +}; + +static int torture_spin_lock_write_lock_irq(void) +__acquires(torture_spinlock_irq) +{ + unsigned long flags; + + spin_lock_irqsave(&torture_spinlock, flags); + cur_ops->flags = flags; + return 0; +} + +static void torture_lock_spin_write_unlock_irq(void) +__releases(torture_spinlock) +{ + spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); +} + +static struct lock_torture_ops spin_lock_irq_ops = { + .writelock = torture_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_lock_spin_write_unlock_irq, + .name = "spin_lock_irq" +}; + +/* + * Lock torture writer kthread. Repeatedly acquires and releases + * the lock, checking for duplicate acquisitions. + */ +static int lock_torture_writer(void *arg) +{ + struct lock_writer_stress_stats *lwsp = arg; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_TOROUT_STRING("lock_torture_writer task started"); + set_user_nice(current, MAX_NICE); + + do { + if ((torture_random(&rand) & 0xfffff) == 0) + schedule_timeout_uninterruptible(1); + cur_ops->writelock(); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_write_lock_fail++; + lock_is_write_held = 1; + lwsp->n_write_lock_acquired++; + cur_ops->write_delay(&rand); + lock_is_write_held = 0; + cur_ops->writeunlock(); + stutter_wait("lock_torture_writer"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_writer"); + return 0; +} + +/* + * Create an lock-torture-statistics message in the specified buffer. + */ +static void lock_torture_printk(char *page) +{ + bool fail = 0; + int i; + long max = 0; + long min = lwsa[0].n_write_lock_acquired; + long long sum = 0; + + for (i = 0; i < nrealwriters_stress; i++) { + if (lwsa[i].n_write_lock_fail) + fail = true; + sum += lwsa[i].n_write_lock_acquired; + if (max < lwsa[i].n_write_lock_fail) + max = lwsa[i].n_write_lock_fail; + if (min > lwsa[i].n_write_lock_fail) + min = lwsa[i].n_write_lock_fail; + } + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, + "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + sum, max, min, max / 2 > min ? "???" : "", + fail, fail ? "!!!" : ""); + if (fail) + atomic_inc(&n_lock_torture_errors); +} + +/* + * Print torture statistics. Caller must ensure that there is only one + * call to this function at a given time!!! This is normally accomplished + * by relying on the module system to only have one copy of the module + * loaded, and then by giving the lock_torture_stats kthread full control + * (or the init/cleanup functions when lock_torture_stats thread is not + * running). + */ +static void lock_torture_stats_print(void) +{ + int size = nrealwriters_stress * 200 + 8192; + char *buf; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("lock_torture_stats_print: Out of memory, need: %d", + size); + return; + } + lock_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. + */ +static int lock_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("lock_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + lock_torture_stats_print(); + torture_shutdown_absorb("lock_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_stats"); + return 0; +} + +static inline void +lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, + const char *tag) +{ + pr_alert("%s" TORTURE_FLAG + "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealwriters_stress, stat_interval, verbose, + shuffle_interval, stutter, shutdown_secs, + onoff_interval, onoff_holdoff); +} + +static void lock_torture_cleanup(void) +{ + int i; + + if (torture_cleanup()) + return; + + if (writer_tasks) { + for (i = 0; i < nrealwriters_stress; i++) + torture_stop_kthread(lock_torture_writer, + writer_tasks[i]); + kfree(writer_tasks); + writer_tasks = NULL; + } + + torture_stop_kthread(lock_torture_stats, stats_task); + lock_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_lock_torture_errors)) + lock_torture_print_module_parms(cur_ops, + "End of test: FAILURE"); + else if (torture_onoff_failures()) + lock_torture_print_module_parms(cur_ops, + "End of test: LOCK_HOTPLUG"); + else + lock_torture_print_module_parms(cur_ops, + "End of test: SUCCESS"); +} + +static int __init lock_torture_init(void) +{ + int i; + int firsterr = 0; + static struct lock_torture_ops *torture_ops[] = { + &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + }; + + if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) + return -EBUSY; + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("lock-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("lock-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + torture_init_end(); + return -EINVAL; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nwriters_stress >= 0) + nrealwriters_stress = nwriters_stress; + else + nrealwriters_stress = 2 * num_online_cpus(); + lock_torture_print_module_parms(cur_ops, "Start of test"); + + /* Initialize the statistics so that each run gets its own numbers. */ + + lock_is_write_held = 0; + lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); + if (lwsa == NULL) { + VERBOSE_TOROUT_STRING("lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + lwsa[i].n_write_lock_fail = 0; + lwsa[i].n_write_lock_acquired = 0; + } + + /* Start up the kthreads. */ + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, + onoff_interval * HZ); + if (firsterr) + goto unwind; + } + if (shuffle_interval > 0) { + firsterr = torture_shuffle_init(shuffle_interval); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, + lock_torture_cleanup); + if (firsterr) + goto unwind; + } + if (stutter > 0) { + firsterr = torture_stutter_init(stutter); + if (firsterr) + goto unwind; + } + + writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], + writer_tasks[i]); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(lock_torture_stats, NULL, + stats_task); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + lock_torture_cleanup(); + return firsterr; +} + +module_init(lock_torture_init); +module_exit(lock_torture_cleanup); diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c new file mode 100644 index 00000000000..be9ee1559fc --- /dev/null +++ b/kernel/locking/mcs_spinlock.c @@ -0,0 +1,210 @@ + +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include "mcs_spinlock.h" + +#ifdef CONFIG_SMP + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); + +/* + * We use the value 0 to represent "no CPU", thus the encoded value + * will be the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + + return per_cpu_ptr(&osq_node, cpu_nr); +} + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_queue *lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev) +{ + struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(smp_processor_id()); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; + + for (;;) { + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg(&lock->tail, curr, old) == curr) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + arch_mutex_cpu_relax(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(smp_processor_id()); + int old; + + node->locked = 0; + node->next = NULL; + node->cpu = curr; + + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) + return true; + + prev = decode_cpu(old); + node->prev = prev; + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + arch_mutex_cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + arch_mutex_cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node, *next; + int curr = encode_cpu(smp_processor_id()); + + /* + * Fast path for the uncontended case. + */ + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + return; + + /* + * Second most likely case. + */ + node = this_cpu_ptr(&osq_node); + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} + +#endif + diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h new file mode 100644 index 00000000000..74356dc0ce2 --- /dev/null +++ b/kernel/locking/mcs_spinlock.h @@ -0,0 +1,130 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. + * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H + +#include <asm/mcs_spinlock.h> + +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +#ifndef arch_mcs_spin_lock_contended +/* + * Using smp_load_acquire() provides a memory barrier that ensures + * subsequent operations happen after the lock is acquired. + */ +#define arch_mcs_spin_lock_contended(l) \ +do { \ + while (!(smp_load_acquire(l))) \ + arch_mutex_cpu_relax(); \ +} while (0) +#endif + +#ifndef arch_mcs_spin_unlock_contended +/* + * smp_store_release() provides a memory barrier to ensure all + * operations in the critical section has been completed before + * unlocking. + */ +#define arch_mcs_spin_unlock_contended(l) \ + smp_store_release((l), 1) +#endif + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. + * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ + +/* + * In order to acquire the lock, the caller should declare a local node and + * pass a reference of the node to this function in addition to the lock. + * If the lock has already been acquired, then this will proceed to spin + * on this node->locked until the previous lock holder sets the node->locked + * in mcs_spin_unlock(). + * + * We don't inline mcs_spin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* + * Lock acquired, don't need to set node->locked to 1. Threads + * only spin on its own node->locked value for lock acquisition. + * However, since this thread can immediately acquire the lock + * and does not proceed to spin on its own node->locked, this + * value won't be used. If a debug mode is needed to + * audit lock status, then set node->locked value here. + */ + return; + } + ACCESS_ONCE(prev->next) = node; + + /* Wait until the lock holder passes the lock down. */ + arch_mcs_spin_lock_contended(&node->locked); +} + +/* + * Releases the lock. The caller should pass in the corresponding node that + * was used to acquire the lock. + */ +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + + /* Pass lock to next waiter. */ + arch_mcs_spin_unlock_contended(&next->locked); +} + +/* + * Cancellable version of the MCS lock above. + * + * Intended for adaptive spinning of sleeping locks: + * mutex_lock()/rwsem_down_{read,write}() etc. + */ + +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # value */ +}; + +extern bool osq_lock(struct optimistic_spin_queue *lock); +extern void osq_unlock(struct optimistic_spin_queue *lock); + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index faf6f5b53e7..5cf6731b98e 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -71,18 +71,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, void debug_mutex_unlock(struct mutex *lock) { - if (unlikely(!debug_locks)) - return; + if (likely(debug_locks)) { + DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->magic != lock); + if (!lock->owner) + DEBUG_LOCKS_WARN_ON(!lock->owner); + else + DEBUG_LOCKS_WARN_ON(lock->owner != current); - if (!lock->owner) - DEBUG_LOCKS_WARN_ON(!lock->owner); - else - DEBUG_LOCKS_WARN_ON(lock->owner != current); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); + mutex_clear_owner(lock); + } - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); + /* + * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug + * mutexes so that we can do it here after we've verified state. + */ + atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4dd6e4c219d..acca2c1a3c5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/debug_locks.h> +#include "mcs_spinlock.h" /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -33,6 +34,13 @@ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" # include <asm-generic/mutex-null.h> +/* + * Must be 0 for the debug case so we do not do the unlock outside of the + * wait_lock region. debug_mutex_unlock() will do the actual unlock in this + * case. + */ +# undef __mutex_slowpath_needs_to_unlock +# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" # include <asm/mutex.h> @@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->spin_mlock = NULL; + osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); @@ -67,8 +75,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count); +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); /** * mutex_lock - acquire the mutex @@ -111,54 +118,7 @@ EXPORT_SYMBOL(mutex_lock); * more or less simultaneously, the spinners need to acquire a MCS lock * first before spinning on the owner field. * - * We don't inline mspin_lock() so that perf can correctly account for the - * time spent in this lock function. */ -struct mspin_node { - struct mspin_node *next ; - int locked; /* 1 if lock acquired */ -}; -#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) - -static noinline -void mspin_lock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* Lock acquired */ - node->locked = 1; - return; - } - ACCESS_ONCE(prev->next) = node; - smp_wmb(); - /* Wait until the lock holder passes the lock down */ - while (!ACCESS_ONCE(node->locked)) - arch_mutex_cpu_relax(); -} - -static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (cmpxchg(lock, node, NULL) == node) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - ACCESS_ONCE(next->locked) = 1; - smp_wmb(); -} /* * Mutex spinning code migrated from kernel/sched/core.c @@ -212,6 +172,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + if (need_resched()) + return 0; + rcu_read_lock(); owner = ACCESS_ONCE(lock->owner); if (owner) @@ -225,7 +188,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) } #endif -static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +__visible __used noinline +void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** * mutex_unlock - release the mutex @@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; + if (!osq_lock(&lock->osq)) + goto slowpath; + for (;;) { struct task_struct *owner; - struct mspin_node node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * performed the optimistic spinning cannot be done. */ if (ACCESS_ONCE(ww->ctx)) - goto slowpath; + break; } /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ - mspin_lock(MLOCK(lock), &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) { - mspin_unlock(MLOCK(lock), &node); - goto slowpath; - } + if (owner && !mutex_spin_on_owner(lock, owner)) + break; if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { @@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mspin_unlock(MLOCK(lock), &node); + osq_unlock(&lock->osq); preempt_enable(); return 0; } - mspin_unlock(MLOCK(lock), &node); /* * When there's no owner, we might have preempted between the @@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * the owner complete. */ if (!owner && (need_resched() || rt_task(task))) - goto slowpath; + break; /* * The cpu_relax() call is a compiler barrier which forces @@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } + osq_unlock(&lock->osq); slowpath: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) + schedule_preempt_disabled(); #endif spin_lock_mutex(&lock->wait_lock, flags); @@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - /* * some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to @@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); + spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); + if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = @@ -746,7 +716,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static __used noinline void +__visible void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -803,7 +773,7 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static __used noinline void __sched +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c new file mode 100644 index 00000000000..fb5b8ac411a --- /dev/null +++ b/kernel/locking/qrwlock.c @@ -0,0 +1,133 @@ +/* + * Queue read/write lock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long <waiman.long@hp.com> + */ +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <asm/qrwlock.h> + +/** + * rspin_until_writer_unlock - inc reader count & spin until writer is gone + * @lock : Pointer to queue rwlock structure + * @writer: Current queue rwlock writer status byte + * + * In interrupt context or at the head of the queue, the reader will just + * increment the reader count & wait until the writer releases the lock. + */ +static __always_inline void +rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) +{ + while ((cnts & _QW_WMASK) == _QW_LOCKED) { + arch_mutex_cpu_relax(); + cnts = smp_load_acquire((u32 *)&lock->cnts); + } +} + +/** + * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * @lock: Pointer to queue rwlock structure + */ +void queue_read_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* + * Readers come here when they cannot get the lock without waiting + */ + if (unlikely(in_interrupt())) { + /* + * Readers in interrupt context will spin until the lock is + * available without waiting in the queue. + */ + cnts = smp_load_acquire((u32 *)&lock->cnts); + rspin_until_writer_unlock(lock, cnts); + return; + } + atomic_sub(_QR_BIAS, &lock->cnts); + + /* + * Put the reader into the wait queue + */ + arch_spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state + * goes to 0 and then try to increment the reader count and get + * the lock. It is possible that an incoming writer may steal the + * lock in the interim, so it is necessary to check the writer byte + * to make sure that the write lock isn't taken. + */ + while (atomic_read(&lock->cnts) & _QW_WMASK) + arch_mutex_cpu_relax(); + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + rspin_until_writer_unlock(lock, cnts); + + /* + * Signal the next one in queue to become queue head + */ + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_read_lock_slowpath); + +/** + * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +void queue_write_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* Put the writer into the wait queue */ + arch_spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present */ + if (!atomic_read(&lock->cnts) && + (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + goto unlock; + + /* + * Set the waiting flag to notify readers that a writer is pending, + * or wait for a previous writer to go away. + */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if (!(cnts & _QW_WMASK) && + (atomic_cmpxchg(&lock->cnts, cnts, + cnts | _QW_WAITING) == cnts)) + break; + + arch_mutex_cpu_relax(); + } + + /* When no more readers, set the locked flag */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if ((cnts == _QW_WAITING) && + (atomic_cmpxchg(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) + break; + + arch_mutex_cpu_relax(); + } +unlock: + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_write_lock_slowpath); diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d7..ab29b6a2266 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, { return (waiter != NULL); } + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + debug_rt_mutex_print_deadlock(w); +} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2e960a2bab8..fc605941b9b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) owner = *p; } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); } + +/* + * Safe fastpath aware unlock: + * 1) Clear the waiters bit + * 2) Drop lock->wait_lock + * 3) Try to unlock the lock with cmpxchg + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + struct task_struct *owner = rt_mutex_owner(lock); + + clear_rt_mutex_waiters(lock); + raw_spin_unlock(&lock->wait_lock); + /* + * If a new waiter comes in between the unlock and the cmpxchg + * we have two situations: + * + * unlock(wait_lock); + * lock(wait_lock); + * cmpxchg(p, owner, 0) == owner + * mark_rt_mutex_waiters(lock); + * acquire(lock); + * or: + * + * unlock(wait_lock); + * lock(wait_lock); + * mark_rt_mutex_waiters(lock); + * + * cmpxchg(p, owner, 0) != owner + * enqueue_waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * wake waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * acquire(lock); + */ + return rt_mutex_cmpxchg(lock, owner, NULL); +} + #else # define rt_mutex_cmpxchg(l,c,n) (0) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) @@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); } + +/* + * Simple slow path only version: lock->owner is protected by lock->wait_lock. + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return true; +} #endif static inline int @@ -213,6 +265,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) } /* + * Called by sched_setscheduler() to check whether the priority change + * is overruled by a possible priority boosting. + */ +int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + if (!task_has_pi_waiters(task)) + return 0; + + return task_top_pi_waiter(task)->task->prio <= newprio; +} + +/* * Adjust the priority of a task, after its pi_waiters got modified. * * This can be both boosting and unboosting. task->pi_lock must be held. @@ -248,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task) */ int max_lock_depth = 1024; +static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +{ + return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; +} + /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. * - * @task: the task owning the mutex (owner) for which a chain walk is probably - * needed + * @task: the task owning the mutex (owner) for which a chain walk is + * probably needed * @deadlock_detect: do we have to carry out deadlock detection? - * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck - * things for a task that has just got its priority adjusted, and - * is waiting on a mutex) + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @next_lock: the mutex on which the owner of @orig_lock was blocked before + * we dropped its pi_lock. Is never dereferenced, only used for + * comparison to detect lock chain changes. * @orig_waiter: rt_mutex_waiter struct for the task that has just donated - * its priority to the mutex owner (can be NULL in the case - * depicted above or if the top waiter is gone away and we are - * actually deboosting the owner) - * @top_task: the current top waiter + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter * * Returns 0 or -EDEADLK. */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int deadlock_detect, struct rt_mutex *orig_lock, + struct rt_mutex *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { @@ -302,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } put_task_struct(task); - return deadlock_detect ? -EDEADLK : 0; + return -EDEADLK; } retry: /* @@ -327,13 +400,32 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; /* + * We dropped all locks after taking a refcount on @task, so + * the task might have moved on in the lock chain or even left + * the chain completely and blocks now on an unrelated lock or + * on @orig_lock. + * + * We stored the lock on which @task was blocked in @next_lock, + * so we can detect the chain change. + */ + if (next_lock != waiter->lock) + goto out_unlock_pi; + + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting * mode! */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; + if (top_waiter) { + if (!task_has_pi_waiters(task)) + goto out_unlock_pi; + /* + * If deadlock detection is off, we stop here if we + * are not the top pi waiter of the task. + */ + if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) + goto out_unlock_pi; + } /* * When deadlock detection is off then we check, if further @@ -349,11 +441,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto retry; } - /* Deadlock detection */ + /* + * Deadlock detection. If the lock is the same as the original + * lock which caused us to walk the lock chain or if the + * current lock is owned by the task which initiated the chain + * walk, we detected a deadlock. + */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); - ret = deadlock_detect ? -EDEADLK : 0; + ret = -EDEADLK; goto out_unlock_pi; } @@ -398,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } + /* + * Check whether the task which owns the current lock is pi + * blocked itself. If yes we store a pointer to the lock for + * the lock chain change detection above. After we dropped + * task->pi_lock next_lock cannot be dereferenced anymore. + */ + next_lock = task_blocked_on_lock(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); raw_spin_unlock(&lock->wait_lock); + /* + * We reached the end of the lock chain. Stop right here. No + * point to go back just to figure that out. + */ + if (!next_lock) + goto out_put_task; + if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -512,8 +624,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; + struct rt_mutex *next_lock; int chain_walk = 0, res; + unsigned long flags; + + /* + * Early deadlock detection. We really don't want the task to + * enqueue on itself just to untangle the mess later. It's not + * only an optimization. We drop the locks, so another waiter + * can come in before the chain walk detects the deadlock. So + * the other will detect the deadlock and return -EDEADLOCK, + * which is wrong, as the other waiter is not in a deadlock + * situation. + */ + if (owner == task) + return -EDEADLK; raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); @@ -533,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (!owner) return 0; + raw_spin_lock_irqsave(&owner->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { chain_walk = 1; + } + + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); - if (!chain_walk) + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + /* + * Even if full deadlock detection is on, if the owner is not + * blocked itself, we can avoid finding this out in the chain + * walk. + */ + if (!chain_walk || !next_lock) return 0; /* @@ -558,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - task); + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + next_lock, waiter, task); raw_spin_lock(&lock->wait_lock); @@ -569,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and wake it up. + * Remove the top waiter from the current tasks pi waiter list and + * wake it up. * * Called with lock->wait_lock held. */ @@ -590,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock) */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_set_owner(lock, NULL); + /* + * As we are waking up the top waiter, and the waiter stays + * queued on the lock until it gets the lock, this lock + * obviously has waiters. Just set the bit here and this has + * the added benefit of forcing all new tasks into the + * slow path making sure no task of lower priority than + * the top waiter can steal this lock. + */ + lock->owner = (void *) RT_MUTEX_HAS_WAITERS; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + /* + * It's safe to dereference waiter as it cannot go away as + * long as we hold lock->wait_lock. The waiter task needs to + * acquire it in order to dequeue the waiter. + */ wake_up_process(waiter->task); } @@ -608,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock, { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex *next_lock = NULL; unsigned long flags; - int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); rt_mutex_dequeue(lock, waiter); @@ -633,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) - chain_walk = 1; + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - if (!chain_walk) + if (!next_lock) return; /* gets dropped in rt_mutex_adjust_prio_chain()! */ @@ -647,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); raw_spin_lock(&lock->wait_lock); } @@ -660,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock, void rt_mutex_adjust_pi(struct task_struct *task) { struct rt_mutex_waiter *waiter; + struct rt_mutex *next_lock; unsigned long flags; raw_spin_lock_irqsave(&task->pi_lock, flags); @@ -670,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - + next_lock = waiter->lock; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); + + rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); } /** @@ -727,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_waiter *w) +{ + /* + * If the result is not -EDEADLOCK or the caller requested + * deadlock detection, nothing to do here. + */ + if (res != -EDEADLOCK || detect_deadlock) + return; + + /* + * Yell lowdly and stop the task right here. + */ + rt_mutex_print_deadlock(w); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } +} + /* * Slow path lock function: */ @@ -766,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(TASK_RUNNING); - if (unlikely(ret)) + if (unlikely(ret)) { remove_waiter(lock, &waiter); + rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); + } /* * try_to_take_rt_mutex() sets the waiter bit @@ -823,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock) rt_mutex_deadlock_account_unlock(current); - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); - return; + /* + * We must be careful here if the fast path is enabled. If we + * have no waiters queued we cannot set owner to NULL here + * because of: + * + * foo->lock->owner = NULL; + * rtmutex_lock(foo->lock); <- fast path + * free = atomic_dec_and_test(foo->refcnt); + * rtmutex_unlock(foo->lock); <- fast path + * if (free) + * kfree(foo); + * raw_spin_unlock(foo->lock->wait_lock); + * + * So for the fastpath enabled kernel: + * + * Nothing can set the waiters bit as long as we hold + * lock->wait_lock. So we do the following sequence: + * + * owner = rt_mutex_owner(lock); + * clear_rt_mutex_waiters(lock); + * raw_spin_unlock(&lock->wait_lock); + * if (cmpxchg(&lock->owner, owner, 0) == owner) + * return; + * goto retry; + * + * The fastpath disabled variant is simple as all access to + * lock->owner is serialized by lock->wait_lock: + * + * lock->owner = NULL; + * raw_spin_unlock(&lock->wait_lock); + */ + while (!rt_mutex_has_waiters(lock)) { + /* Drops lock->wait_lock ! */ + if (unlock_rt_mutex_safe(lock) == true) + return; + /* Relock the rtmutex and try again */ + raw_spin_lock(&lock->wait_lock); } + /* + * The wakeup next waiter path does not suffer from the above + * race. See the comments there. + */ wakeup_next_waiter(lock); raw_spin_unlock(&lock->wait_lock); @@ -1076,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); if (ret && !rt_mutex_owner(lock)) { /* diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421..f6a1f3c133b 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -24,3 +24,8 @@ #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + WARN(1, "rtmutex deadlock detected\n"); +} diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 9be8a914497..2c93571162c 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem) unsigned long flags; if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->activity != 0); + ret = (sem->count != 0); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } return ret; @@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->activity = 0; + sem->count = 0; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } @@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(next, struct rwsem_waiter, list); } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - sem->activity += woken; + sem->count += woken; out: return sem; @@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + if (sem->count >= 0 && list_empty(&sem->wait_list)) { /* granted */ - sem->activity++; + sem->count++; raw_spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + if (sem->count >= 0 && list_empty(&sem->wait_list)) { /* granted */ - sem->activity++; + sem->count++; ret = 1; } @@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) * itself into sleep and waiting for system woke it or someone * else in the head of the wait list up. */ - if (sem->activity == 0) + if (sem->count == 0) break; set_task_state(tsk, TASK_UNINTERRUPTIBLE); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ - sem->activity = -1; + sem->count = -1; list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity == 0) { + if (sem->count == 0) { /* got the lock */ - sem->activity = -1; + sem->count = -1; ret = 1; } @@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) + if (--sem->count == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - sem->activity = 0; + sem->count = 0; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 1); @@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - sem->activity = 1; + sem->count = 1; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 0); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 19c5fa95e0b..a2391ac135c 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -5,11 +5,66 @@ * * Writer lock-stealing by Alex Shi <alex.shi@intel.com> * and Michel Lespinasse <walken@google.com> + * + * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> + * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. */ #include <linux/rwsem.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/sched/rt.h> + +#include "mcs_spinlock.h" + +/* + * Guide to the rw_semaphore's count field for common values. + * (32-bit case illustrated, similar for 64-bit) + * + * 0x0000000X (1) X readers active or attempting lock, no writer waiting + * X = #active_readers + #readers attempting to lock + * (X*ACTIVE_BIAS) + * + * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or + * attempting to read lock or write lock. + * + * 0xffff000X (1) X readers active or attempting lock, with waiters for lock + * X = #active readers + # readers attempting lock + * (X*ACTIVE_BIAS + WAITING_BIAS) + * (2) 1 writer attempting lock, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * (3) 1 writer active, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * + * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock + * (WAITING_BIAS + ACTIVE_BIAS) + * (2) 1 writer active or attempting lock, no waiters for lock + * (ACTIVE_WRITE_BIAS) + * + * 0xffff0000 (1) There are writers or readers queued but none active + * or in the process of attempting lock. + * (WAITING_BIAS) + * Note: writer can attempt to steal lock for this count by adding + * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count + * + * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. + * (ACTIVE_WRITE_BIAS + WAITING_BIAS) + * + * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking + * the count becomes more than 0 for successful lock acquisition, + * i.e. the case where there are only readers or nobody has lock. + * (1st and 2nd case above). + * + * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and + * checking the count becomes ACTIVE_WRITE_BIAS for successful lock + * acquisition (i.e. nobody else has lock or attempts lock). If + * unsuccessful, in rwsem_down_write_failed, we'll check to see if there + * are only waiters but none active (5th case above), and attempt to + * steal the lock. + * + */ /* * Initialize an rwsem: @@ -27,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + sem->owner = NULL; + osq_lock_init(&sem->osq); +#endif } EXPORT_SYMBOL(__init_rwsem); @@ -141,8 +200,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) } /* - * wait for the read lock to be granted + * Wait for the read lock to be granted */ +__visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; @@ -187,63 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + if (!(count & RWSEM_ACTIVE_MASK)) { + /* try acquiring the write lock */ + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; + } + } + return false; +} + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* - * wait until we successfully acquire the write lock + * Try to acquire write lock before the writer has been put on wait queue. */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long old, count = ACCESS_ONCE(sem->count); + + while (true) { + if (!(count == 0 || count == RWSEM_WAITING_BIAS)) + return false; + + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + if (old == count) + return true; + + count = old; + } +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool on_cpu = false; + + if (need_resched()) + return false; + + rcu_read_lock(); + owner = ACCESS_ONCE(sem->owner); + if (owner) + on_cpu = owner->on_cpu; + rcu_read_unlock(); + + /* + * If sem->owner is not set, yet we have just recently entered the + * slowpath, then there is a possibility reader(s) may have the lock. + * To be safe, avoid spinning in these situations. + */ + return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, + struct task_struct *owner) +{ + if (sem->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * sem->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(sem, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() or when the + * owner changed, which is a sign for heavy contention. Return + * success only when sem->owner is NULL. + */ + return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + while (true) { + owner = ACCESS_ONCE(sem->owner); + if (owner && !rwsem_spin_on_owner(sem, owner)) + break; + + /* wait_lock will be acquired if write_lock is obtained */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock + */ +__visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count; + bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; - struct task_struct *tsk = current; - /* set up my own style of waitqueue */ - waiter.task = tsk; + /* undo write bias from down_write operation, stop active locking */ + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; + waiting = false; + list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + if (waiting) { + count = ACCESS_ONCE(sem->count); + + /* + * If there were already threads queued before us and there are + * no active writers, the lock must be read owned; so we try to + * wake any read locks that were queued ahead of us. + */ + if (count > RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + } else + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - + if (rwsem_try_write_lock(count, sem)) + break; raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ do { schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } + __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; return sem; } @@ -252,6 +470,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ +__visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; @@ -272,6 +491,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ +__visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdf..e2d3bc7f03b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,6 +12,27 @@ #include <linux/atomic.h> +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + /* * lock for reading */ @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_owner(sem); + } + return ret; } @@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_clear_owner(sem); __up_write(sem); } @@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ + rwsem_clear_owner(sem); __downgrade_write(sem); } @@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); |
