diff options
Diffstat (limited to 'kernel/locking')
27 files changed, 11613 insertions, 0 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
new file mode 100644
index 00000000000..8541bfdfd23
--- /dev/null
+++ b/kernel/locking/Makefile
@@ -0,0 +1,28 @@
+
+obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+endif
+
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
+obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_SMP) += lglock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
+obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
new file mode 100644
index 00000000000..86ae2aebf00
--- /dev/null
+++ b/kernel/locking/lglock.c
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+/*
+ * lg_lock_init - set up the lockdep map of @lg.
+ * @lg:   lglock to initialise
+ * @name: name handed to LOCKDEP_INIT_MAP() for this lock
+ *
+ * Only the lockdep map (lg->lock_dep_map, keyed by lg->lock_key) is
+ * touched here; the per-CPU spinlocks in lg->lock are not written.
+ */
+void lg_lock_init(struct lglock *lg, char *name)
+{
+	LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+/*
+ * lg_local_lock - take the calling CPU's spinlock of @lg.
+ *
+ * Preemption is disabled first and stays disabled until
+ * lg_local_unlock(); the acquisition is reported to lockdep through
+ * lock_acquire_shared() before the arch spinlock is taken.
+ */
+void lg_local_lock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+
+	preempt_disable();
+	lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+/*
+ * lg_local_unlock - counterpart of lg_local_lock().
+ *
+ * Reports the release to lockdep, drops the calling CPU's spinlock and
+ * re-enables preemption, in that order.
+ */
+void lg_local_unlock(struct lglock *lg)
+{
+	arch_spinlock_t *lock;
+
+	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = this_cpu_ptr(lg->lock);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+/*
+ * lg_local_lock_cpu - like lg_local_lock(), but takes the spinlock that
+ * belongs to @cpu (looked up with per_cpu_ptr()) instead of the calling
+ * CPU's one.
+ */
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+
+	preempt_disable();
+	lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+/*
+ * lg_local_unlock_cpu - counterpart of lg_local_lock_cpu(); @cpu must
+ * name the same CPU whose spinlock was taken.
+ */
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+	arch_spinlock_t *lock;
+
+	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	lock = per_cpu_ptr(lg->lock, cpu);
+	arch_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+/*
+ * lg_global_lock - take the spinlock of every possible CPU.
+ *
+ * Preemption is disabled, the acquisition is reported to lockdep through
+ * lock_acquire_exclusive(), then the per-CPU spinlocks are taken one by
+ * one in for_each_possible_cpu() iteration order.
+ */
+void lg_global_lock(struct lglock *lg)
+{
+	int i;
+
+	preempt_disable();
+	lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_lock(lock);
+	}
+}
+EXPORT_SYMBOL(lg_global_lock);
+/*
+ * lg_global_unlock - counterpart of lg_global_lock().
+ *
+ * Reports the release to lockdep, drops every possible CPU's spinlock
+ * (same iteration order as the lock side) and re-enables preemption.
+ */
+void lg_global_unlock(struct lglock *lg)
+{
+	int i;
+
+	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	for_each_possible_cpu(i) {
+		arch_spinlock_t *lock;
+		lock = per_cpu_ptr(lg->lock, i);
+		arch_spin_unlock(lock);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
new file mode 100644
index 00000000000..d24e4339b46
--- /dev/null
+++ b/kernel/locking/lockdep.c
@@ -0,0 +1,4258 @@
+/*
+ * kernel/lockdep.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * this code maps all the lock dependencies as they occur in a live kernel
+ * and will warn about the following classes of
locking bugs:
+ *
+ * - lock inversion scenarios
+ * - circular lock dependencies
+ * - hardirq/softirq safe/unsafe locking bugs
+ *
+ * Bugs are reported even if the current locking scenario does not cause
+ * any deadlock at this point.
+ *
+ * I.e. if anytime in the past two locks were taken in a different order,
+ * even if it happened for another task, even if those were different
+ * locks (but of the same class as this lock), this code will detect it.
+ *
+ * Thanks to Arjan van de Ven for coming up with the initial idea of
+ * mapping lock dependencies runtime.
+ */
+#define DISABLE_BRANCH_PROFILING
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/stacktrace.h>
+#include <linux/debug_locks.h>
+#include <linux/irqflags.h>
+#include <linux/utsname.h>
+#include <linux/hash.h>
+#include <linux/ftrace.h>
+#include <linux/stringify.h>
+#include <linux/bitops.h>
+#include <linux/gfp.h>
+#include <linux/kmemcheck.h>
+
+#include <asm/sections.h>
+
+#include "lockdep_internals.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+/*
+ * prove_locking: with CONFIG_PROVE_LOCKING this is an int that defaults
+ * to 1 and is exposed as a module parameter with mode 0644; without it
+ * the name is a macro for the constant 0, so tests on it fold away at
+ * compile time.
+ */
+#ifdef CONFIG_PROVE_LOCKING
+int prove_locking = 1;
+module_param(prove_locking, int, 0644);
+#else
+#define prove_locking 0
+#endif
+/*
+ * lock_stat: same scheme as prove_locking, selected by CONFIG_LOCK_STAT
+ * (int defaulting to 1 / module parameter, or the constant 0).
+ */
+#ifdef CONFIG_LOCK_STAT
+int lock_stat = 1;
+module_param(lock_stat, int, 0644);
+#else
+#define lock_stat 0
+#endif
+
+/*
+ * lockdep_lock: protects the lockdep graph, the hashes and the
+ *               class/list/hash allocators.
+ *
+ * This is one of the rare exceptions where it's justified
+ * to use a raw spinlock - we really dont want the spinlock
+ * code to recurse back into the lockdep code...
+ */
+static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+/*
+ * graph_lock - take lockdep_lock.
+ *
+ * Returns 1 with lockdep_lock held and current->lockdep_recursion
+ * incremented. Returns 0 with lockdep_lock already released again when
+ * debug_locks is clear, i.e. the caller must not touch the graph and must
+ * not call graph_unlock().
+ */
+static int graph_lock(void)
+{
+	arch_spin_lock(&lockdep_lock);
+	/*
+	 * Make sure that if another CPU detected a bug while
+	 * walking the graph we don't change it (while the other
+	 * CPU is busy printing out stuff with the graph lock
+	 * dropped already)
+	 */
+	if (!debug_locks) {
+		arch_spin_unlock(&lockdep_lock);
+		return 0;
+	}
+	/* prevent any recursions within lockdep from causing deadlocks */
+	current->lockdep_recursion++;
+	return 1;
+}
+/*
+ * graph_unlock - release lockdep_lock taken by a successful graph_lock().
+ *
+ * Returns 0 after decrementing current->lockdep_recursion and unlocking.
+ * If debug_locks is set but the lock is not locked, nothing is undone and
+ * the value of DEBUG_LOCKS_WARN_ON(1) is returned instead.
+ */
+static inline int graph_unlock(void)
+{
+	if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
+		/*
+		 * The lockdep graph lock isn't locked while we expect it to
+		 * be, we're confused now, bye!
+		 */
+		return DEBUG_LOCKS_WARN_ON(1);
+	}
+
+	current->lockdep_recursion--;
+	arch_spin_unlock(&lockdep_lock);
+	return 0;
+}
+
+/*
+ * Turn lock debugging off and return with 0 if it was off already,
+ * and also release the graph lock:
+ *
+ * Note that, unlike graph_unlock(), this does not decrement
+ * current->lockdep_recursion. The return value is whatever
+ * debug_locks_off() returned.
+ */
+static inline int debug_locks_off_graph_unlock(void)
+{
+	int ret = debug_locks_off();
+
+	arch_spin_unlock(&lockdep_lock);
+
+	return ret;
+}
+
+static int lockdep_initialized;
+
+unsigned long nr_list_entries;
+static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+
+/*
+ * All data structures here are protected by the global debug_lock.
+ *
+ * Mutex key structs only get allocated, once during bootup, and never
+ * get freed - this significantly simplifies the debugging code.
+ */
+unsigned long nr_lock_classes;
+static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+/*
+ * hlock_class - map a held lock to its entry in lock_classes[].
+ *
+ * hlock->class_idx is 1-based: 0 means "no class" (warn and return NULL),
+ * any other value selects lock_classes[class_idx - 1].
+ */
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+	if (!hlock->class_idx) {
+		/*
+		 * Someone passed in garbage, we give up.
+		 */
+		DEBUG_LOCKS_WARN_ON(1);
+		return NULL;
+	}
+	return lock_classes + hlock->class_idx - 1;
+}
+/*
+ * Lock statistics: one lock_class_stats slot per lock class and per CPU.
+ * A class is addressed by its index in lock_classes[], computed below as
+ * (class - lock_classes).
+ */
+#ifdef CONFIG_LOCK_STAT
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
+		      cpu_lock_stats);
+/*
+ * lockstat_clock - timestamp source for hold-time accounting; a plain
+ * wrapper around local_clock().
+ */
+static inline u64 lockstat_clock(void)
+{
+	return local_clock();
+}
+/*
+ * lock_point - find or claim the slot for @ip in @points.
+ *
+ * Scans at most LOCKSTAT_POINTS entries. A zero entry is treated as free
+ * and is set to @ip; an entry already equal to @ip is a hit. Returns the
+ * index of that slot, or LOCKSTAT_POINTS if the table is full and holds
+ * no match.
+ */
+static int lock_point(unsigned long points[], unsigned long ip)
+{
+	int i;
+
+	for (i = 0; i < LOCKSTAT_POINTS; i++) {
+		if (points[i] == 0) {
+			points[i] = ip;
+			break;
+		}
+		if (points[i] == ip)
+			break;
+	}
+
+	return i;
+}
+/*
+ * lock_time_inc - fold one sample @time into @lt.
+ *
+ * Updates max, min, total and the sample count. The first sample
+ * (lt->nr == 0) always sets min, whatever min held before.
+ */
+static void lock_time_inc(struct lock_time *lt, u64 time)
+{
+	if (time > lt->max)
+		lt->max = time;
+
+	if (time < lt->min || !lt->nr)
+		lt->min = time;
+
+	lt->total += time;
+	lt->nr++;
+}
+/*
+ * lock_time_add - merge the samples of @src into @dst.
+ *
+ * Mind the argument order: source first, destination second. Does nothing
+ * when @src holds no samples; an empty @dst takes min from @src.
+ */
+static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
+{
+	if (!src->nr)
+		return;
+
+	if (src->max > dst->max)
+		dst->max = src->max;
+
+	if (src->min < dst->min || !dst->nr)
+		dst->min = src->min;
+
+	dst->total += src->total;
+	dst->nr += src->nr;
+}
+/*
+ * lock_stats - return, by value, the statistics of @class summed over all
+ * possible CPUs.
+ *
+ * Counter arrays are added element-wise; the four wait/hold time records
+ * are merged with lock_time_add().
+ */
+struct lock_class_stats lock_stats(struct lock_class *class)
+{
+	struct lock_class_stats stats;
+	int cpu, i;
+
+	memset(&stats, 0, sizeof(struct lock_class_stats));
+	for_each_possible_cpu(cpu) {
+		struct lock_class_stats *pcs =
+			&per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
+
+		for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
+			stats.contention_point[i] += pcs->contention_point[i];
+
+		for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+			stats.contending_point[i] += pcs->contending_point[i];
+
+		lock_time_add(&pcs->read_waittime, &stats.read_waittime);
+		lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+
+		lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
+		lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+
+		for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
+			stats.bounces[i] += pcs->bounces[i];
+	}
+
+	return stats;
+}
+/*
+ * clear_lock_stats - zero the statistics of @class: its slot on every
+ * possible CPU plus the contention_point[] and contending_point[] arrays
+ * stored in the class itself.
+ */
+void clear_lock_stats(struct lock_class *class)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct lock_class_stats *cpu_stats =
+			&per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
+
+		memset(cpu_stats, 0, sizeof(struct lock_class_stats));
+	}
+	memset(class->contention_point, 0, sizeof(class->contention_point));
+	memset(class->contending_point, 0, sizeof(class->contending_point));
+}
+/*
+ * get_lock_stats - return the calling CPU's statistics slot for @class.
+ *
+ * Uses get_cpu_var(), so every call is paired with put_lock_stats() once
+ * the caller is done with the slot.
+ */
+static struct lock_class_stats *get_lock_stats(struct lock_class *class)
+{
+	return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
+}
+/*
+ * put_lock_stats - end the access started by get_lock_stats(). @stats is
+ * not used; the function only issues the matching put_cpu_var().
+ */
+static void put_lock_stats(struct lock_class_stats *stats)
+{
+	put_cpu_var(cpu_lock_stats);
+}
+/*
+ * lock_release_holdtime - account how long @hlock was held.
+ *
+ * No-op unless lock_stat is set. The hold time is "now" minus
+ * hlock->holdtime_stamp and is booked as read or write hold time of the
+ * lock's class on the calling CPU, depending on hlock->read.
+ */
+static void lock_release_holdtime(struct held_lock *hlock)
+{
+	struct lock_class_stats *stats;
+	u64 holdtime;
+
+	if (!lock_stat)
+		return;
+
+	holdtime = lockstat_clock() - hlock->holdtime_stamp;
+
+	stats = get_lock_stats(hlock_class(hlock));
+	if (hlock->read)
+		lock_time_inc(&stats->read_holdtime, holdtime);
+	else
+		lock_time_inc(&stats->write_holdtime, holdtime);
+	put_lock_stats(stats);
+}
+#else
+static inline void lock_release_holdtime(struct held_lock *hlock)
+{
+}
+#endif
+
+/*
+ * We keep a global list of all lock classes. The list only grows,
+ * never shrinks. The list is only accessed with the lockdep
+ * spinlock lock held.
+ */
+LIST_HEAD(all_lock_classes);
+
+/*
+ * The lockdep classes are in a hash-table as well, for fast lookup:
+ *
+ * classhash_table[] has CLASSHASH_SIZE buckets, each a list head.
+ * classhashentry(key) yields the bucket for @key: the key is cast to
+ * unsigned long and reduced to CLASSHASH_BITS bits with hash_long().
+ */
+#define CLASSHASH_BITS		(MAX_LOCKDEP_KEYS_BITS - 1)
+#define CLASSHASH_SIZE		(1UL << CLASSHASH_BITS)
+#define __classhashfn(key)	hash_long((unsigned long)key, CLASSHASH_BITS)
+#define classhashentry(key)	(classhash_table + __classhashfn((key)))
+
+static struct list_head classhash_table[CLASSHASH_SIZE];
+
+/*
+ * We put the lock dependency chains into a hash-table as well, to cache
+ * their existence:
+ *
+ * Same layout as the class hash: CHAINHASH_SIZE list-head buckets,
+ * chainhashentry(chain) picks the bucket through hash_long(). Unlike
+ * __classhashfn(), __chainhashfn() passes its argument on without a cast.
+ */
+#define CHAINHASH_BITS		(MAX_LOCKDEP_CHAINS_BITS-1)
+#define CHAINHASH_SIZE		(1UL << CHAINHASH_BITS)
+#define __chainhashfn(chain)	hash_long(chain, CHAINHASH_BITS)
+#define chainhashentry(chain)	(chainhash_table + __chainhashfn((chain)))
+
+static struct list_head chainhash_table[CHAINHASH_SIZE];
+
+/*
+ * The hash key of the lock dependency chains is a hash itself too:
+ * it's a hash of all locks taken up to that lock, including that lock.
+ * It's a 64-bit hash, because it's important for the keys to be
+ * unique.
+ */
+#define iterate_chain_key(key1, key2) \
+	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
+	(key2))
+/*
+ * lockdep_off - increment current->lockdep_recursion.
+ *
+ * Calls nest; every call is balanced by one lockdep_on().
+ * NOTE(review): presumably a non-zero recursion count makes the lockdep
+ * entry points bail out for this task - the check itself is not in this
+ * part of the file, confirm before relying on it.
+ */
+void lockdep_off(void)
+{
+	current->lockdep_recursion++;
+}
+EXPORT_SYMBOL(lockdep_off);
+/*
+ * lockdep_on - decrement current->lockdep_recursion, undoing one
+ * lockdep_off().
+ */
+void lockdep_on(void)
+{
+	current->lockdep_recursion--;
+}
+EXPORT_SYMBOL(lockdep_on);
+
+/*
+ * Debugging switches:
+ *
+ * All of these are compile-time constants. Setting VERBOSE to 1 also
+ * turns on the HARDIRQ/SOFTIRQ/RECLAIM variants; VERY_VERBOSE is
+ * independent of them.
+ */
+
+#define VERBOSE			0
+#define VERY_VERBOSE		0
+
+#if VERBOSE
+# define HARDIRQ_VERBOSE	1
+# define SOFTIRQ_VERBOSE	1
+# define RECLAIM_VERBOSE	1
+#else
+# define HARDIRQ_VERBOSE	0
+# define SOFTIRQ_VERBOSE	0
+# define RECLAIM_VERBOSE	0
+#endif
+
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+/*
+ * Quick filtering for interesting events:
+ *
+ * Returns non-zero for classes that should produce verbose output. As
+ * shipped the example matches are compiled out (#if 0) and every class
+ * is filtered (return 0). The function is only built when at least one
+ * of the *_VERBOSE switches above is non-zero.
+ */
+static int class_filter(struct lock_class *class)
+{
+#if 0
+	/* Example */
+	if (class->name_version == 1 &&
+			!strcmp(class->name, "lockname"))
+		return 1;
+	if (class->name_version == 1 &&
+			!strcmp(class->name, "&struct->lockfield"))
+		return 1;
+#endif
+	/* Filter everything else. 1 would be to allow everything else */
+	return 0;
+}
+#endif
+/*
+ * verbose - should events for @class be logged verbosely?
+ *
+ * With VERBOSE set this is class_filter(class); otherwise it is always 0
+ * and @class is not looked at.
+ */
+static int verbose(struct lock_class *class)
+{
+#if VERBOSE
+	return class_filter(class);
+#endif
+	return 0;
+}
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+unsigned long nr_stack_trace_entries;
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+/*
+ * print_lockdep_off - report that the validator is being switched off.
+ * @bug_msg: first line of the report, printed as-is
+ *
+ * Only prints (three KERN_DEBUG lines); it does not change debug_locks
+ * itself - callers do that before calling this.
+ */
+static void print_lockdep_off(const char *bug_msg)
+{
+	printk(KERN_DEBUG "%s\n", bug_msg);
+	printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+	printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+}
+/*
+ * save_trace - record the current stack into the shared stack_trace[] pool.
+ * @trace: descriptor to fill; on return its entries pointer refers to the
+ *         slice of stack_trace[] that was just written
+ *
+ * The new trace is appended right behind the entries already in use and
+ * nr_stack_trace_entries is advanced by its length, so the pool is never
+ * reused. The three innermost frames are skipped (trace->skip = 3).
+ *
+ * Returns 1 on success. Returns 0 once the pool is exhausted
+ * (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1); in that case
+ * debug_locks_off_graph_unlock() has been called, i.e. lock debugging is
+ * off and the graph lock has already been released.
+ */
+static int save_trace(struct stack_trace *trace)
+{
+	trace->nr_entries = 0;
+	trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+	trace->entries = stack_trace + nr_stack_trace_entries;
+
+	trace->skip = 3;
+
+	save_stack_trace(trace);
+
+	/*
+	 * Some daft arches put -1 at the end to indicate it's a full trace.
+	 *
+	 * <rant> this is buggy anyway, since it takes a whole extra entry so a
+	 * complete trace that maxes out the entries provided will be reported
+	 * as incomplete, friggin useless </rant>
+	 */
+	if (trace->nr_entries != 0 &&
+	    trace->entries[trace->nr_entries-1] == ULONG_MAX)
+		trace->nr_entries--;
+
+	trace->max_entries = trace->nr_entries;
+
+	nr_stack_trace_entries += trace->nr_entries;
+
+	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+		if (!debug_locks_off_graph_unlock())
+			return 0;
+
+		print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
+		dump_stack();
+
+		return 0;
+	}
+
+	return 1;
+}
+
+unsigned int nr_hardirq_chains;
+unsigned int nr_softirq_chains;
+unsigned int nr_process_chains;
+unsigned int max_lockdep_depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * We cannot printk in early bootup code. Not even early_printk()
+ * might work. So we mark any initialization errors and printk
+ * about it later on, in lockdep_info().
+ */ +static int lockdep_init_error; +static const char *lock_init_error; +static unsigned long lockdep_init_trace_data[20]; +static struct stack_trace lockdep_init_trace = { +	.max_entries = ARRAY_SIZE(lockdep_init_trace_data), +	.entries = lockdep_init_trace_data, +}; + +/* + * Various lockdep statistics: + */ +DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); +#endif + +/* + * Locking printouts: + */ + +#define __USAGE(__STATE)						\ +	[LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W",	\ +	[LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W",		\ +	[LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\ +	[LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R", + +static const char *usage_str[] = +{ +#define LOCKDEP_STATE(__STATE) __USAGE(__STATE) +#include "lockdep_states.h" +#undef LOCKDEP_STATE +	[LOCK_USED] = "INITIAL USE", +}; + +const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +{ +	return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); +} + +static inline unsigned long lock_flag(enum lock_usage_bit bit) +{ +	return 1UL << bit; +} + +static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) +{ +	char c = '.'; + +	if (class->usage_mask & lock_flag(bit + 2)) +		c = '+'; +	if (class->usage_mask & lock_flag(bit)) { +		c = '-'; +		if (class->usage_mask & lock_flag(bit + 2)) +			c = '?'; +	} + +	return c; +} + +void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) +{ +	int i = 0; + +#define LOCKDEP_STATE(__STATE) 						\ +	usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE);	\ +	usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); +#include "lockdep_states.h" +#undef LOCKDEP_STATE + +	usage[i] = '\0'; +} + +static void __print_lock_name(struct lock_class *class) +{ +	char str[KSYM_NAME_LEN]; +	const char *name; + +	name = class->name; +	if (!name) { +		name = __get_key_name(class->key, str); +		printk("%s", name); +	} else { +		
printk("%s", name); +		if (class->name_version > 1) +			printk("#%d", class->name_version); +		if (class->subclass) +			printk("/%d", class->subclass); +	} +} + +static void print_lock_name(struct lock_class *class) +{ +	char usage[LOCK_USAGE_CHARS]; + +	get_usage_chars(class, usage); + +	printk(" ("); +	__print_lock_name(class); +	printk("){%s}", usage); +} + +static void print_lockdep_cache(struct lockdep_map *lock) +{ +	const char *name; +	char str[KSYM_NAME_LEN]; + +	name = lock->name; +	if (!name) +		name = __get_key_name(lock->key->subkeys, str); + +	printk("%s", name); +} + +static void print_lock(struct held_lock *hlock) +{ +	print_lock_name(hlock_class(hlock)); +	printk(", at: "); +	print_ip_sym(hlock->acquire_ip); +} + +static void lockdep_print_held_locks(struct task_struct *curr) +{ +	int i, depth = curr->lockdep_depth; + +	if (!depth) { +		printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); +		return; +	} +	printk("%d lock%s held by %s/%d:\n", +		depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); + +	for (i = 0; i < depth; i++) { +		printk(" #%d: ", i); +		print_lock(curr->held_locks + i); +	} +} + +static void print_kernel_ident(void) +{ +	printk("%s %.*s %s\n", init_utsname()->release, +		(int)strcspn(init_utsname()->version, " "), +		init_utsname()->version, +		print_tainted()); +} + +static int very_verbose(struct lock_class *class) +{ +#if VERY_VERBOSE +	return class_filter(class); +#endif +	return 0; +} + +/* + * Is this the address of a static object: + */ +#ifdef __KERNEL__ +static int static_obj(void *obj) +{ +	unsigned long start = (unsigned long) &_stext, +		      end   = (unsigned long) &_end, +		      addr  = (unsigned long) obj; + +	/* +	 * static variable? +	 */ +	if ((addr >= start) && (addr < end)) +		return 1; + +	if (arch_is_kernel_data(addr)) +		return 1; + +	/* +	 * in-kernel percpu var? +	 */ +	if (is_kernel_percpu_address(addr)) +		return 1; + +	/* +	 * module static or percpu var? 
+	 */ +	return is_module_address(addr) || is_module_percpu_address(addr); +} +#endif + +/* + * To make lock name printouts unique, we calculate a unique + * class->name_version generation counter: + */ +static int count_matching_names(struct lock_class *new_class) +{ +	struct lock_class *class; +	int count = 0; + +	if (!new_class->name) +		return 0; + +	list_for_each_entry(class, &all_lock_classes, lock_entry) { +		if (new_class->key - new_class->subclass == class->key) +			return class->name_version; +		if (class->name && !strcmp(class->name, new_class->name)) +			count = max(count, class->name_version); +	} + +	return count + 1; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ +	struct lockdep_subclass_key *key; +	struct list_head *hash_head; +	struct lock_class *class; + +#ifdef CONFIG_DEBUG_LOCKDEP +	/* +	 * If the architecture calls into lockdep before initializing +	 * the hashes then we'll warn about it later. (we cannot printk +	 * right now) +	 */ +	if (unlikely(!lockdep_initialized)) { +		lockdep_init(); +		lockdep_init_error = 1; +		lock_init_error = lock->name; +		save_stack_trace(&lockdep_init_trace); +	} +#endif + +	if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { +		debug_locks_off(); +		printk(KERN_ERR +			"BUG: looking up invalid subclass: %u\n", subclass); +		printk(KERN_ERR +			"turning off the locking correctness validator.\n"); +		dump_stack(); +		return NULL; +	} + +	/* +	 * Static locks do not have their class-keys yet - for them the key +	 * is the lock object itself: +	 */ +	if (unlikely(!lock->key)) +		lock->key = (void *)lock; + +	/* +	 * NOTE: the class-key must be unique. 
For dynamic locks, a static +	 * lock_class_key variable is passed in through the mutex_init() +	 * (or spin_lock_init()) call - which acts as the key. For static +	 * locks we use the lock object itself as the key. +	 */ +	BUILD_BUG_ON(sizeof(struct lock_class_key) > +			sizeof(struct lockdep_map)); + +	key = lock->key->subkeys + subclass; + +	hash_head = classhashentry(key); + +	/* +	 * We can walk the hash lockfree, because the hash only +	 * grows, and we are careful when adding entries to the end: +	 */ +	list_for_each_entry(class, hash_head, hash_entry) { +		if (class->key == key) { +			/* +			 * Huh! same key, different name? Did someone trample +			 * on some memory? We're most confused. +			 */ +			WARN_ON_ONCE(class->name != lock->name); +			return class; +		} +	} + +	return NULL; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) +{ +	struct lockdep_subclass_key *key; +	struct list_head *hash_head; +	struct lock_class *class; +	unsigned long flags; + +	class = look_up_lock_class(lock, subclass); +	if (likely(class)) +		goto out_set_class_cache; + +	/* +	 * Debug-check: all keys must be persistent! 
+ 	 */ +	if (!static_obj(lock->key)) { +		debug_locks_off(); +		printk("INFO: trying to register non-static key.\n"); +		printk("the code is fine but needs lockdep annotation.\n"); +		printk("turning off the locking correctness validator.\n"); +		dump_stack(); + +		return NULL; +	} + +	key = lock->key->subkeys + subclass; +	hash_head = classhashentry(key); + +	raw_local_irq_save(flags); +	if (!graph_lock()) { +		raw_local_irq_restore(flags); +		return NULL; +	} +	/* +	 * We have to do the hash-walk again, to avoid races +	 * with another CPU: +	 */ +	list_for_each_entry(class, hash_head, hash_entry) +		if (class->key == key) +			goto out_unlock_set; +	/* +	 * Allocate a new key from the static array, and add it to +	 * the hash: +	 */ +	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { +		if (!debug_locks_off_graph_unlock()) { +			raw_local_irq_restore(flags); +			return NULL; +		} +		raw_local_irq_restore(flags); + +		print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); +		dump_stack(); +		return NULL; +	} +	class = lock_classes + nr_lock_classes++; +	debug_atomic_inc(nr_unused_locks); +	class->key = key; +	class->name = lock->name; +	class->subclass = subclass; +	INIT_LIST_HEAD(&class->lock_entry); +	INIT_LIST_HEAD(&class->locks_before); +	INIT_LIST_HEAD(&class->locks_after); +	class->name_version = count_matching_names(class); +	/* +	 * We use RCU's safe list-add method to make +	 * parallel walking of the hash-list safe: +	 */ +	list_add_tail_rcu(&class->hash_entry, hash_head); +	/* +	 * Add it to the global list of classes: +	 */ +	list_add_tail_rcu(&class->lock_entry, &all_lock_classes); + +	if (verbose(class)) { +		graph_unlock(); +		raw_local_irq_restore(flags); + +		printk("\nnew class %p: %s", class->key, class->name); +		if (class->name_version > 1) +			printk("#%d", class->name_version); +		printk("\n"); +		dump_stack(); + +		raw_local_irq_save(flags); +		if (!graph_lock()) { +			raw_local_irq_restore(flags); +			return NULL; +		} +	} +out_unlock_set: +	
graph_unlock(); +	raw_local_irq_restore(flags); + +out_set_class_cache: +	if (!subclass || force) +		lock->class_cache[0] = class; +	else if (subclass < NR_LOCKDEP_CACHING_CLASSES) +		lock->class_cache[subclass] = class; + +	/* +	 * Hash collision, did we smoke some? We found a class with a matching +	 * hash but the subclass -- which is hashed in -- didn't match. +	 */ +	if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) +		return NULL; + +	return class; +} + +#ifdef CONFIG_PROVE_LOCKING +/* + * Allocate a lockdep entry. (assumes the graph_lock held, returns + * with NULL on failure) + */ +static struct lock_list *alloc_list_entry(void) +{ +	if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { +		if (!debug_locks_off_graph_unlock()) +			return NULL; + +		print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); +		dump_stack(); +		return NULL; +	} +	return list_entries + nr_list_entries++; +} + +/* + * Add a new dependency to the head of the list: + */ +static int add_lock_to_list(struct lock_class *class, struct lock_class *this, +			    struct list_head *head, unsigned long ip, +			    int distance, struct stack_trace *trace) +{ +	struct lock_list *entry; +	/* +	 * Lock not present yet - get a new dependency struct and +	 * add it to the list: +	 */ +	entry = alloc_list_entry(); +	if (!entry) +		return 0; + +	entry->class = this; +	entry->distance = distance; +	entry->trace = *trace; +	/* +	 * Since we never remove from the dependency list, the list can +	 * be walked lockless by other CPUs, it's only allocation +	 * that must be protected by the spinlock. 
But this also means +	 * we must make new entries visible only once writes to the +	 * entry become visible - hence the RCU op: +	 */ +	list_add_tail_rcu(&entry->entry, head); + +	return 1; +} + +/* + * For good efficiency of modular, we use power of 2 + */ +#define MAX_CIRCULAR_QUEUE_SIZE		4096UL +#define CQ_MASK				(MAX_CIRCULAR_QUEUE_SIZE-1) + +/* + * The circular_queue and helpers is used to implement the + * breadth-first search(BFS)algorithem, by which we can build + * the shortest path from the next lock to be acquired to the + * previous held lock if there is a circular between them. + */ +struct circular_queue { +	unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; +	unsigned int  front, rear; +}; + +static struct circular_queue lock_cq; + +unsigned int max_bfs_queue_depth; + +static unsigned int lockdep_dependency_gen_id; + +static inline void __cq_init(struct circular_queue *cq) +{ +	cq->front = cq->rear = 0; +	lockdep_dependency_gen_id++; +} + +static inline int __cq_empty(struct circular_queue *cq) +{ +	return (cq->front == cq->rear); +} + +static inline int __cq_full(struct circular_queue *cq) +{ +	return ((cq->rear + 1) & CQ_MASK) == cq->front; +} + +static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +{ +	if (__cq_full(cq)) +		return -1; + +	cq->element[cq->rear] = elem; +	cq->rear = (cq->rear + 1) & CQ_MASK; +	return 0; +} + +static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +{ +	if (__cq_empty(cq)) +		return -1; + +	*elem = cq->element[cq->front]; +	cq->front = (cq->front + 1) & CQ_MASK; +	return 0; +} + +static inline unsigned int  __cq_get_elem_count(struct circular_queue *cq) +{ +	return (cq->rear - cq->front) & CQ_MASK; +} + +static inline void mark_lock_accessed(struct lock_list *lock, +					struct lock_list *parent) +{ +	unsigned long nr; + +	nr = lock - list_entries; +	WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ +	lock->parent = parent; +	lock->class->dep_gen_id = 
lockdep_dependency_gen_id; +} + +static inline unsigned long lock_accessed(struct lock_list *lock) +{ +	unsigned long nr; + +	nr = lock - list_entries; +	WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ +	return lock->class->dep_gen_id == lockdep_dependency_gen_id; +} + +static inline struct lock_list *get_lock_parent(struct lock_list *child) +{ +	return child->parent; +} + +static inline int get_lock_depth(struct lock_list *child) +{ +	int depth = 0; +	struct lock_list *parent; + +	while ((parent = get_lock_parent(child))) { +		child = parent; +		depth++; +	} +	return depth; +} + +static int __bfs(struct lock_list *source_entry, +		 void *data, +		 int (*match)(struct lock_list *entry, void *data), +		 struct lock_list **target_entry, +		 int forward) +{ +	struct lock_list *entry; +	struct list_head *head; +	struct circular_queue *cq = &lock_cq; +	int ret = 1; + +	if (match(source_entry, data)) { +		*target_entry = source_entry; +		ret = 0; +		goto exit; +	} + +	if (forward) +		head = &source_entry->class->locks_after; +	else +		head = &source_entry->class->locks_before; + +	if (list_empty(head)) +		goto exit; + +	__cq_init(cq); +	__cq_enqueue(cq, (unsigned long)source_entry); + +	while (!__cq_empty(cq)) { +		struct lock_list *lock; + +		__cq_dequeue(cq, (unsigned long *)&lock); + +		if (!lock->class) { +			ret = -2; +			goto exit; +		} + +		if (forward) +			head = &lock->class->locks_after; +		else +			head = &lock->class->locks_before; + +		list_for_each_entry(entry, head, entry) { +			if (!lock_accessed(entry)) { +				unsigned int cq_depth; +				mark_lock_accessed(entry, lock); +				if (match(entry, data)) { +					*target_entry = entry; +					ret = 0; +					goto exit; +				} + +				if (__cq_enqueue(cq, (unsigned long)entry)) { +					ret = -1; +					goto exit; +				} +				cq_depth = __cq_get_elem_count(cq); +				if (max_bfs_queue_depth < cq_depth) +					max_bfs_queue_depth = cq_depth; +			} +		} +	} +exit: +	return ret; +} + +static inline int 
__bfs_forwards(struct lock_list *src_entry, +			void *data, +			int (*match)(struct lock_list *entry, void *data), +			struct lock_list **target_entry) +{ +	return __bfs(src_entry, data, match, target_entry, 1); + +} + +static inline int __bfs_backwards(struct lock_list *src_entry, +			void *data, +			int (*match)(struct lock_list *entry, void *data), +			struct lock_list **target_entry) +{ +	return __bfs(src_entry, data, match, target_entry, 0); + +} + +/* + * Recursive, forwards-direction lock-dependency checking, used for + * both noncyclic checking and for hardirq-unsafe/softirq-unsafe + * checking. + */ + +/* + * Print a dependency chain entry (this is only done when a deadlock + * has been detected): + */ +static noinline int +print_circular_bug_entry(struct lock_list *target, int depth) +{ +	if (debug_locks_silent) +		return 0; +	printk("\n-> #%u", depth); +	print_lock_name(target->class); +	printk(":\n"); +	print_stack_trace(&target->trace, 6); + +	return 0; +} + +static void +print_circular_lock_scenario(struct held_lock *src, +			     struct held_lock *tgt, +			     struct lock_list *prt) +{ +	struct lock_class *source = hlock_class(src); +	struct lock_class *target = hlock_class(tgt); +	struct lock_class *parent = prt->class; + +	/* +	 * A direct locking problem where unsafe_class lock is taken +	 * directly by safe_class lock, then all we need to show +	 * is the deadlock scenario, as it is obvious that the +	 * unsafe lock is taken under the safe lock. +	 * +	 * But if there is a chain instead, where the safe lock takes +	 * an intermediate lock (middle_class) where this lock is +	 * not the same as the safe lock, then the lock chain is +	 * used to describe the problem. Otherwise we would need +	 * to show a different CPU case for each link in the chain +	 * from the safe_class lock to the unsafe_class lock. 
+	 */ +	if (parent != source) { +		printk("Chain exists of:\n  "); +		__print_lock_name(source); +		printk(" --> "); +		__print_lock_name(parent); +		printk(" --> "); +		__print_lock_name(target); +		printk("\n\n"); +	} + +	printk(" Possible unsafe locking scenario:\n\n"); +	printk("       CPU0                    CPU1\n"); +	printk("       ----                    ----\n"); +	printk("  lock("); +	__print_lock_name(target); +	printk(");\n"); +	printk("                               lock("); +	__print_lock_name(parent); +	printk(");\n"); +	printk("                               lock("); +	__print_lock_name(target); +	printk(");\n"); +	printk("  lock("); +	__print_lock_name(source); +	printk(");\n"); +	printk("\n *** DEADLOCK ***\n\n"); +} + +/* + * When a circular dependency is detected, print the + * header first: + */ +static noinline int +print_circular_bug_header(struct lock_list *entry, unsigned int depth, +			struct held_lock *check_src, +			struct held_lock *check_tgt) +{ +	struct task_struct *curr = current; + +	if (debug_locks_silent) +		return 0; + +	printk("\n"); +	printk("======================================================\n"); +	printk("[ INFO: possible circular locking dependency detected ]\n"); +	print_kernel_ident(); +	printk("-------------------------------------------------------\n"); +	printk("%s/%d is trying to acquire lock:\n", +		curr->comm, task_pid_nr(curr)); +	print_lock(check_src); +	printk("\nbut task is already holding lock:\n"); +	print_lock(check_tgt); +	printk("\nwhich lock already depends on the new lock.\n\n"); +	printk("\nthe existing dependency chain (in reverse order) is:\n"); + +	print_circular_bug_entry(entry, depth); + +	return 0; +} + +static inline int class_equal(struct lock_list *entry, void *data) +{ +	return entry->class == data; +} + +static noinline int print_circular_bug(struct lock_list *this, +				struct lock_list *target, +				struct held_lock *check_src, +				struct held_lock *check_tgt) +{ +	struct task_struct 
*curr = current; +	struct lock_list *parent; +	struct lock_list *first_parent; +	int depth; + +	if (!debug_locks_off_graph_unlock() || debug_locks_silent) +		return 0; + +	if (!save_trace(&this->trace)) +		return 0; + +	depth = get_lock_depth(target); + +	print_circular_bug_header(target, depth, check_src, check_tgt); + +	parent = get_lock_parent(target); +	first_parent = parent; + +	while (parent) { +		print_circular_bug_entry(parent, --depth); +		parent = get_lock_parent(parent); +	} + +	printk("\nother info that might help us debug this:\n\n"); +	print_circular_lock_scenario(check_src, check_tgt, +				     first_parent); + +	lockdep_print_held_locks(curr); + +	printk("\nstack backtrace:\n"); +	dump_stack(); + +	return 0; +} + +static noinline int print_bfs_bug(int ret) +{ +	if (!debug_locks_off_graph_unlock()) +		return 0; + +	/* +	 * Breadth-first-search failed, graph got corrupted? +	 */ +	WARN(1, "lockdep bfs error:%d\n", ret); + +	return 0; +} + +static int noop_count(struct lock_list *entry, void *data) +{ +	(*(unsigned long *)data)++; +	return 0; +} + +static unsigned long __lockdep_count_forward_deps(struct lock_list *this) +{ +	unsigned long  count = 0; +	struct lock_list *uninitialized_var(target_entry); + +	__bfs_forwards(this, (void *)&count, noop_count, &target_entry); + +	return count; +} +unsigned long lockdep_count_forward_deps(struct lock_class *class) +{ +	unsigned long ret, flags; +	struct lock_list this; + +	this.parent = NULL; +	this.class = class; + +	local_irq_save(flags); +	arch_spin_lock(&lockdep_lock); +	ret = __lockdep_count_forward_deps(&this); +	arch_spin_unlock(&lockdep_lock); +	local_irq_restore(flags); + +	return ret; +} + +static unsigned long __lockdep_count_backward_deps(struct lock_list *this) +{ +	unsigned long  count = 0; +	struct lock_list *uninitialized_var(target_entry); + +	__bfs_backwards(this, (void *)&count, noop_count, &target_entry); + +	return count; +} + +unsigned long lockdep_count_backward_deps(struct lock_class 
*class) +{ +	unsigned long ret, flags; +	struct lock_list this; + +	this.parent = NULL; +	this.class = class; + +	local_irq_save(flags); +	arch_spin_lock(&lockdep_lock); +	ret = __lockdep_count_backward_deps(&this); +	arch_spin_unlock(&lockdep_lock); +	local_irq_restore(flags); + +	return ret; +} + +/* + * Prove that the dependency graph starting at <entry> can not + * lead to <target>. Print an error and return 0 if it does. + */ +static noinline int +check_noncircular(struct lock_list *root, struct lock_class *target, +		struct lock_list **target_entry) +{ +	int result; + +	debug_atomic_inc(nr_cyclic_checks); + +	result = __bfs_forwards(root, target, class_equal, target_entry); + +	return result; +} + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +/* + * Forwards and backwards subgraph searching, for the purposes of + * proving that two subgraphs can be connected by a new dependency + * without creating any illegal irq-safe -> irq-unsafe lock dependency. + */ + +static inline int usage_match(struct lock_list *entry, void *bit) +{ +	return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); +} + + + +/* + * Find a node in the forwards-direction dependency sub-graph starting + * at @root->class that matches @bit. + * + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. + * + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. + */ +static int +find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +			struct lock_list **target_entry) +{ +	int result; + +	debug_atomic_inc(nr_find_usage_forwards_checks); + +	result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + +	return result; +} + +/* + * Find a node in the backwards-direction dependency sub-graph starting + * at @root->class that matches @bit. + * + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. 
+ * + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. + */ +static int +find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +			struct lock_list **target_entry) +{ +	int result; + +	debug_atomic_inc(nr_find_usage_backwards_checks); + +	result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + +	return result; +} + +static void print_lock_class_header(struct lock_class *class, int depth) +{ +	int bit; + +	printk("%*s->", depth, ""); +	print_lock_name(class); +	printk(" ops: %lu", class->ops); +	printk(" {\n"); + +	for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { +		if (class->usage_mask & (1 << bit)) { +			int len = depth; + +			len += printk("%*s   %s", depth, "", usage_str[bit]); +			len += printk(" at:\n"); +			print_stack_trace(class->usage_traces + bit, len); +		} +	} +	printk("%*s }\n", depth, ""); + +	printk("%*s ... key      at: ",depth,""); +	print_ip_sym((unsigned long)class->key); +} + +/* + * printk the shortest lock dependencies from @start to @end in reverse order: + */ +static void __used +print_shortest_lock_dependencies(struct lock_list *leaf, +				struct lock_list *root) +{ +	struct lock_list *entry = leaf; +	int depth; + +	/*compute depth from generated tree by BFS*/ +	depth = get_lock_depth(leaf); + +	do { +		print_lock_class_header(entry->class, depth); +		printk("%*s ... 
acquired at:\n", depth, ""); +		print_stack_trace(&entry->trace, 2); +		printk("\n"); + +		if (depth == 0 && (entry != root)) { +			printk("lockdep:%s bad path found in chain graph\n", __func__); +			break; +		} + +		entry = get_lock_parent(entry); +		depth--; +	} while (entry && (depth >= 0)); + +	return; +} + +static void +print_irq_lock_scenario(struct lock_list *safe_entry, +			struct lock_list *unsafe_entry, +			struct lock_class *prev_class, +			struct lock_class *next_class) +{ +	struct lock_class *safe_class = safe_entry->class; +	struct lock_class *unsafe_class = unsafe_entry->class; +	struct lock_class *middle_class = prev_class; + +	if (middle_class == safe_class) +		middle_class = next_class; + +	/* +	 * A direct locking problem where unsafe_class lock is taken +	 * directly by safe_class lock, then all we need to show +	 * is the deadlock scenario, as it is obvious that the +	 * unsafe lock is taken under the safe lock. +	 * +	 * But if there is a chain instead, where the safe lock takes +	 * an intermediate lock (middle_class) where this lock is +	 * not the same as the safe lock, then the lock chain is +	 * used to describe the problem. Otherwise we would need +	 * to show a different CPU case for each link in the chain +	 * from the safe_class lock to the unsafe_class lock. 
+	 */ +	if (middle_class != unsafe_class) { +		printk("Chain exists of:\n  "); +		__print_lock_name(safe_class); +		printk(" --> "); +		__print_lock_name(middle_class); +		printk(" --> "); +		__print_lock_name(unsafe_class); +		printk("\n\n"); +	} + +	printk(" Possible interrupt unsafe locking scenario:\n\n"); +	printk("       CPU0                    CPU1\n"); +	printk("       ----                    ----\n"); +	printk("  lock("); +	__print_lock_name(unsafe_class); +	printk(");\n"); +	printk("                               local_irq_disable();\n"); +	printk("                               lock("); +	__print_lock_name(safe_class); +	printk(");\n"); +	printk("                               lock("); +	__print_lock_name(middle_class); +	printk(");\n"); +	printk("  <Interrupt>\n"); +	printk("    lock("); +	__print_lock_name(safe_class); +	printk(");\n"); +	printk("\n *** DEADLOCK ***\n\n"); +} + +static int +print_bad_irq_dependency(struct task_struct *curr, +			 struct lock_list *prev_root, +			 struct lock_list *next_root, +			 struct lock_list *backwards_entry, +			 struct lock_list *forwards_entry, +			 struct held_lock *prev, +			 struct held_lock *next, +			 enum lock_usage_bit bit1, +			 enum lock_usage_bit bit2, +			 const char *irqclass) +{ +	if (!debug_locks_off_graph_unlock() || debug_locks_silent) +		return 0; + +	printk("\n"); +	printk("======================================================\n"); +	printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", +		irqclass, irqclass); +	print_kernel_ident(); +	printk("------------------------------------------------------\n"); +	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", +		curr->comm, task_pid_nr(curr), +		curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, +		curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, +		curr->hardirqs_enabled, +		curr->softirqs_enabled); +	print_lock(next); + +	printk("\nand this task is already holding:\n"); +	print_lock(prev); +	
printk("which would create a new lock dependency:\n"); +	print_lock_name(hlock_class(prev)); +	printk(" ->"); +	print_lock_name(hlock_class(next)); +	printk("\n"); + +	printk("\nbut this new dependency connects a %s-irq-safe lock:\n", +		irqclass); +	print_lock_name(backwards_entry->class); +	printk("\n... which became %s-irq-safe at:\n", irqclass); + +	print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + +	printk("\nto a %s-irq-unsafe lock:\n", irqclass); +	print_lock_name(forwards_entry->class); +	printk("\n... which became %s-irq-unsafe at:\n", irqclass); +	printk("..."); + +	print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + +	printk("\nother info that might help us debug this:\n\n"); +	print_irq_lock_scenario(backwards_entry, forwards_entry, +				hlock_class(prev), hlock_class(next)); + +	lockdep_print_held_locks(curr); + +	printk("\nthe dependencies between %s-irq-safe lock", irqclass); +	printk(" and the holding lock:\n"); +	if (!save_trace(&prev_root->trace)) +		return 0; +	print_shortest_lock_dependencies(backwards_entry, prev_root); + +	printk("\nthe dependencies between the lock to be acquired"); +	printk(" and %s-irq-unsafe lock:\n", irqclass); +	if (!save_trace(&next_root->trace)) +		return 0; +	print_shortest_lock_dependencies(forwards_entry, next_root); + +	printk("\nstack backtrace:\n"); +	dump_stack(); + +	return 0; +} + +static int +check_usage(struct task_struct *curr, struct held_lock *prev, +	    struct held_lock *next, enum lock_usage_bit bit_backwards, +	    enum lock_usage_bit bit_forwards, const char *irqclass) +{ +	int ret; +	struct lock_list this, that; +	struct lock_list *uninitialized_var(target_entry); +	struct lock_list *uninitialized_var(target_entry1); + +	this.parent = NULL; + +	this.class = hlock_class(prev); +	ret = find_usage_backwards(&this, bit_backwards, &target_entry); +	if (ret < 0) +		return print_bfs_bug(ret); +	if (ret == 1) +		return ret; + +	that.parent = NULL; +	that.class = 
hlock_class(next); +	ret = find_usage_forwards(&that, bit_forwards, &target_entry1); +	if (ret < 0) +		return print_bfs_bug(ret); +	if (ret == 1) +		return ret; + +	return print_bad_irq_dependency(curr, &this, &that, +			target_entry, target_entry1, +			prev, next, +			bit_backwards, bit_forwards, irqclass); +} + +static const char *state_names[] = { +#define LOCKDEP_STATE(__STATE) \ +	__stringify(__STATE), +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static const char *state_rnames[] = { +#define LOCKDEP_STATE(__STATE) \ +	__stringify(__STATE)"-READ", +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline const char *state_name(enum lock_usage_bit bit) +{ +	return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; +} + +static int exclusive_bit(int new_bit) +{ +	/* +	 * USED_IN +	 * USED_IN_READ +	 * ENABLED +	 * ENABLED_READ +	 * +	 * bit 0 - write/read +	 * bit 1 - used_in/enabled +	 * bit 2+  state +	 */ + +	int state = new_bit & ~3; +	int dir = new_bit & 2; + +	/* +	 * keep state, bit flip the direction and strip read. 
+	 */ +	return state | (dir ^ 2); +} + +static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, +			   struct held_lock *next, enum lock_usage_bit bit) +{ +	/* +	 * Prove that the new dependency does not connect a hardirq-safe +	 * lock with a hardirq-unsafe lock - to achieve this we search +	 * the backwards-subgraph starting at <prev>, and the +	 * forwards-subgraph starting at <next>: +	 */ +	if (!check_usage(curr, prev, next, bit, +			   exclusive_bit(bit), state_name(bit))) +		return 0; + +	bit++; /* _READ */ + +	/* +	 * Prove that the new dependency does not connect a hardirq-safe-read +	 * lock with a hardirq-unsafe lock - to achieve this we search +	 * the backwards-subgraph starting at <prev>, and the +	 * forwards-subgraph starting at <next>: +	 */ +	if (!check_usage(curr, prev, next, bit, +			   exclusive_bit(bit), state_name(bit))) +		return 0; + +	return 1; +} + +static int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, +		struct held_lock *next) +{ +#define LOCKDEP_STATE(__STATE)						\ +	if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE))	\ +		return 0; +#include "lockdep_states.h" +#undef LOCKDEP_STATE + +	return 1; +} + +static void inc_chains(void) +{ +	if (current->hardirq_context) +		nr_hardirq_chains++; +	else { +		if (current->softirq_context) +			nr_softirq_chains++; +		else +			nr_process_chains++; +	} +} + +#else + +static inline int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, +		struct held_lock *next) +{ +	return 1; +} + +static inline void inc_chains(void) +{ +	nr_process_chains++; +} + +#endif + +static void +print_deadlock_scenario(struct held_lock *nxt, +			     struct held_lock *prv) +{ +	struct lock_class *next = hlock_class(nxt); +	struct lock_class *prev = hlock_class(prv); + +	printk(" Possible unsafe locking scenario:\n\n"); +	printk("       CPU0\n"); +	printk("       ----\n"); +	printk("  lock("); +	__print_lock_name(prev); +	printk(");\n"); +	
printk("  lock("); +	__print_lock_name(next); +	printk(");\n"); +	printk("\n *** DEADLOCK ***\n\n"); +	printk(" May be due to missing lock nesting notation\n\n"); +} + +static int +print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, +		   struct held_lock *next) +{ +	if (!debug_locks_off_graph_unlock() || debug_locks_silent) +		return 0; + +	printk("\n"); +	printk("=============================================\n"); +	printk("[ INFO: possible recursive locking detected ]\n"); +	print_kernel_ident(); +	printk("---------------------------------------------\n"); +	printk("%s/%d is trying to acquire lock:\n", +		curr->comm, task_pid_nr(curr)); +	print_lock(next); +	printk("\nbut task is already holding lock:\n"); +	print_lock(prev); + +	printk("\nother info that might help us debug this:\n"); +	print_deadlock_scenario(next, prev); +	lockdep_print_held_locks(curr); + +	printk("\nstack backtrace:\n"); +	dump_stack(); + +	return 0; +} + +/* + * Check whether we are holding such a class already. + * + * (Note that this has to be done separately, because the graph cannot + * detect such classes of deadlocks.) + * + * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + */ +static int +check_deadlock(struct task_struct *curr, struct held_lock *next, +	       struct lockdep_map *next_instance, int read) +{ +	struct held_lock *prev; +	struct held_lock *nest = NULL; +	int i; + +	for (i = 0; i < curr->lockdep_depth; i++) { +		prev = curr->held_locks + i; + +		if (prev->instance == next->nest_lock) +			nest = prev; + +		if (hlock_class(prev) != hlock_class(next)) +			continue; + +		/* +		 * Allow read-after-read recursion of the same +		 * lock class (i.e. read_lock(lock)+read_lock(lock)): +		 */ +		if ((read == 2) && prev->read) +			return 2; + +		/* +		 * We're holding the nest_lock, which serializes this lock's +		 * nesting behaviour. 
+		 */ +		if (nest) +			return 2; + +		return print_deadlock_bug(curr, prev, next); +	} +	return 1; +} + +/* + * There was a chain-cache miss, and we are about to add a new dependency + * to a previous lock. We recursively validate the following rules: + * + *  - would the adding of the <prev> -> <next> dependency create a + *    circular dependency in the graph? [== circular deadlock] + * + *  - does the new prev->next dependency connect any hardirq-safe lock + *    (in the full backwards-subgraph starting at <prev>) with any + *    hardirq-unsafe lock (in the full forwards-subgraph starting at + *    <next>)? [== illegal lock inversion with hardirq contexts] + * + *  - does the new prev->next dependency connect any softirq-safe lock + *    (in the full backwards-subgraph starting at <prev>) with any + *    softirq-unsafe lock (in the full forwards-subgraph starting at + *    <next>)? [== illegal lock inversion with softirq contexts] + * + * any of these scenarios could lead to a deadlock. + * + * Then if all the validations pass, we add the forwards and backwards + * dependency. + */ +static int +check_prev_add(struct task_struct *curr, struct held_lock *prev, +	       struct held_lock *next, int distance, int trylock_loop) +{ +	struct lock_list *entry; +	int ret; +	struct lock_list this; +	struct lock_list *uninitialized_var(target_entry); +	/* +	 * Static variable, serialized by the graph_lock(). +	 * +	 * We use this static variable to save the stack trace in case +	 * we call into this function multiple times due to encountering +	 * trylocks in the held lock stack. +	 */ +	static struct stack_trace trace; + +	/* +	 * Prove that the new <prev> -> <next> dependency would not +	 * create a circular dependency in the graph. (We do this by +	 * forward-recursing into the graph starting at <next>, and +	 * checking whether we can reach <prev>.) 
+	 * +	 * We are using global variables to control the recursion, to +	 * keep the stackframe size of the recursive functions low: +	 */ +	this.class = hlock_class(next); +	this.parent = NULL; +	ret = check_noncircular(&this, hlock_class(prev), &target_entry); +	if (unlikely(!ret)) +		return print_circular_bug(&this, target_entry, next, prev); +	else if (unlikely(ret < 0)) +		return print_bfs_bug(ret); + +	if (!check_prev_add_irq(curr, prev, next)) +		return 0; + +	/* +	 * For recursive read-locks we do all the dependency checks, +	 * but we dont store read-triggered dependencies (only +	 * write-triggered dependencies). This ensures that only the +	 * write-side dependencies matter, and that if for example a +	 * write-lock never takes any other locks, then the reads are +	 * equivalent to a NOP. +	 */ +	if (next->read == 2 || prev->read == 2) +		return 1; +	/* +	 * Is the <prev> -> <next> dependency already present? +	 * +	 * (this may occur even though this is a new chain: consider +	 *  e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 +	 *  chains - the second one will be new, but L1 already has +	 *  L2 added to its dependency list, due to the first chain.) 
+	 */ +	list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { +		if (entry->class == hlock_class(next)) { +			if (distance == 1) +				entry->distance = 1; +			return 2; +		} +	} + +	if (!trylock_loop && !save_trace(&trace)) +		return 0; + +	/* +	 * Ok, all validations passed, add the new lock +	 * to the previous lock's dependency list: +	 */ +	ret = add_lock_to_list(hlock_class(prev), hlock_class(next), +			       &hlock_class(prev)->locks_after, +			       next->acquire_ip, distance, &trace); + +	if (!ret) +		return 0; + +	ret = add_lock_to_list(hlock_class(next), hlock_class(prev), +			       &hlock_class(next)->locks_before, +			       next->acquire_ip, distance, &trace); +	if (!ret) +		return 0; + +	/* +	 * Debugging printouts: +	 */ +	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { +		graph_unlock(); +		printk("\n new dependency: "); +		print_lock_name(hlock_class(prev)); +		printk(" => "); +		print_lock_name(hlock_class(next)); +		printk("\n"); +		dump_stack(); +		return graph_lock(); +	} +	return 1; +} + +/* + * Add the dependency to all directly-previous locks that are 'relevant'. + * The ones that are relevant are (in increasing distance from curr): + * all consecutive trylock entries and the final non-trylock entry - or + * the end of this context's lock-chain - whichever comes first. + */ +static int +check_prevs_add(struct task_struct *curr, struct held_lock *next) +{ +	int depth = curr->lockdep_depth; +	int trylock_loop = 0; +	struct held_lock *hlock; + +	/* +	 * Debugging checks. 
+	 * +	 * Depth must not be zero for a non-head lock: +	 */ +	if (!depth) +		goto out_bug; +	/* +	 * At least two relevant locks must exist for this +	 * to be a head: +	 */ +	if (curr->held_locks[depth].irq_context != +			curr->held_locks[depth-1].irq_context) +		goto out_bug; + +	for (;;) { +		int distance = curr->lockdep_depth - depth + 1; +		hlock = curr->held_locks + depth - 1; +		/* +		 * Only non-recursive-read entries get new dependencies +		 * added: +		 */ +		if (hlock->read != 2 && hlock->check) { +			if (!check_prev_add(curr, hlock, next, +						distance, trylock_loop)) +				return 0; +			/* +			 * Stop after the first non-trylock entry, +			 * as non-trylock entries have added their +			 * own direct dependencies already, so this +			 * lock is connected to them indirectly: +			 */ +			if (!hlock->trylock) +				break; +		} +		depth--; +		/* +		 * End of lock-stack? +		 */ +		if (!depth) +			break; +		/* +		 * Stop the search if we cross into another context: +		 */ +		if (curr->held_locks[depth].irq_context != +				curr->held_locks[depth-1].irq_context) +			break; +		trylock_loop = 1; +	} +	return 1; +out_bug: +	if (!debug_locks_off_graph_unlock()) +		return 0; + +	/* +	 * Clearly we all shouldn't be here, but since we made it we +	 * can reliable say we messed up our state. See the above two +	 * gotos for reasons why we could possibly end up here. +	 */ +	WARN_ON(1); + +	return 0; +} + +unsigned long nr_lock_chains; +struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +int nr_chain_hlocks; +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; + +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) +{ +	return lock_classes + chain_hlocks[chain->base + i]; +} + +/* + * Look up a dependency chain. If the key is not present yet then + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) 
+ */ +static inline int lookup_chain_cache(struct task_struct *curr, +				     struct held_lock *hlock, +				     u64 chain_key) +{ +	struct lock_class *class = hlock_class(hlock); +	struct list_head *hash_head = chainhashentry(chain_key); +	struct lock_chain *chain; +	struct held_lock *hlock_curr; +	int i, j; + +	/* +	 * We might need to take the graph lock, ensure we've got IRQs +	 * disabled to make this an IRQ-safe lock.. for recursion reasons +	 * lockdep won't complain about its own locking errors. +	 */ +	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) +		return 0; +	/* +	 * We can walk it lock-free, because entries only get added +	 * to the hash: +	 */ +	list_for_each_entry(chain, hash_head, entry) { +		if (chain->chain_key == chain_key) { +cache_hit: +			debug_atomic_inc(chain_lookup_hits); +			if (very_verbose(class)) +				printk("\nhash chain already cached, key: " +					"%016Lx tail class: [%p] %s\n", +					(unsigned long long)chain_key, +					class->key, class->name); +			return 0; +		} +	} +	if (very_verbose(class)) +		printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", +			(unsigned long long)chain_key, class->key, class->name); +	/* +	 * Allocate a new chain entry from the static array, and add +	 * it to the hash: +	 */ +	if (!graph_lock()) +		return 0; +	/* +	 * We have to walk the chain again locked - to avoid duplicates: +	 */ +	list_for_each_entry(chain, hash_head, entry) { +		if (chain->chain_key == chain_key) { +			graph_unlock(); +			goto cache_hit; +		} +	} +	if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { +		if (!debug_locks_off_graph_unlock()) +			return 0; + +		print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); +		dump_stack(); +		return 0; +	} +	chain = lock_chains + nr_lock_chains++; +	chain->chain_key = chain_key; +	chain->irq_context = hlock->irq_context; +	/* Find the first held_lock of current chain */ +	for (i = curr->lockdep_depth - 1; i >= 0; i--) { +		hlock_curr = curr->held_locks + i; +		if 
(hlock_curr->irq_context != hlock->irq_context) +			break; +	} +	i++; +	chain->depth = curr->lockdep_depth + 1 - i; +	if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { +		chain->base = nr_chain_hlocks; +		nr_chain_hlocks += chain->depth; +		for (j = 0; j < chain->depth - 1; j++, i++) { +			int lock_id = curr->held_locks[i].class_idx - 1; +			chain_hlocks[chain->base + j] = lock_id; +		} +		chain_hlocks[chain->base + j] = class - lock_classes; +	} +	list_add_tail_rcu(&chain->entry, hash_head); +	debug_atomic_inc(chain_lookup_misses); +	inc_chains(); + +	return 1; +} + +static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, +		struct held_lock *hlock, int chain_head, u64 chain_key) +{ +	/* +	 * Trylock needs to maintain the stack of held locks, but it +	 * does not add new dependencies, because trylock can be done +	 * in any order. +	 * +	 * We look up the chain_key and do the O(N^2) check and update of +	 * the dependencies only if this is a new dependency chain. +	 * (If lookup_chain_cache() returns with 1 it acquires +	 * graph_lock for us) +	 */ +	if (!hlock->trylock && hlock->check && +	    lookup_chain_cache(curr, hlock, chain_key)) { +		/* +		 * Check whether last held lock: +		 * +		 * - is irq-safe, if this lock is irq-unsafe +		 * - is softirq-safe, if this lock is hardirq-unsafe +		 * +		 * And check whether the new lock's dependency graph +		 * could lead back to the previous lock. +		 * +		 * any of these scenarios could lead to a deadlock. 
If +		 * All validations +		 */ +		int ret = check_deadlock(curr, hlock, lock, hlock->read); + +		if (!ret) +			return 0; +		/* +		 * Mark recursive read, as we jump over it when +		 * building dependencies (just like we jump over +		 * trylock entries): +		 */ +		if (ret == 2) +			hlock->read = 2; +		/* +		 * Add dependency only if this lock is not the head +		 * of the chain, and if it's not a secondary read-lock: +		 */ +		if (!chain_head && ret != 2) +			if (!check_prevs_add(curr, hlock)) +				return 0; +		graph_unlock(); +	} else +		/* after lookup_chain_cache(): */ +		if (unlikely(!debug_locks)) +			return 0; + +	return 1; +} +#else +static inline int validate_chain(struct task_struct *curr, +	       	struct lockdep_map *lock, struct held_lock *hlock, +		int chain_head, u64 chain_key) +{ +	return 1; +} +#endif + +/* + * We are building curr_chain_key incrementally, so double-check + * it from scratch, to make sure that it's done correctly: + */ +static void check_chain_key(struct task_struct *curr) +{ +#ifdef CONFIG_DEBUG_LOCKDEP +	struct held_lock *hlock, *prev_hlock = NULL; +	unsigned int i, id; +	u64 chain_key = 0; + +	for (i = 0; i < curr->lockdep_depth; i++) { +		hlock = curr->held_locks + i; +		if (chain_key != hlock->prev_chain_key) { +			debug_locks_off(); +			/* +			 * We got mighty confused, our chain keys don't match +			 * with what we expect, someone trample on our task state? +			 */ +			WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", +				curr->lockdep_depth, i, +				(unsigned long long)chain_key, +				(unsigned long long)hlock->prev_chain_key); +			return; +		} +		id = hlock->class_idx - 1; +		/* +		 * Whoops ran out of static storage again? 
+		 */ +		if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) +			return; + +		if (prev_hlock && (prev_hlock->irq_context != +							hlock->irq_context)) +			chain_key = 0; +		chain_key = iterate_chain_key(chain_key, id); +		prev_hlock = hlock; +	} +	if (chain_key != curr->curr_chain_key) { +		debug_locks_off(); +		/* +		 * More smoking hash instead of calculating it, damn see these +		 * numbers float.. I bet that a pink elephant stepped on my memory. +		 */ +		WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", +			curr->lockdep_depth, i, +			(unsigned long long)chain_key, +			(unsigned long long)curr->curr_chain_key); +	} +#endif +} + +static void +print_usage_bug_scenario(struct held_lock *lock) +{ +	struct lock_class *class = hlock_class(lock); + +	printk(" Possible unsafe locking scenario:\n\n"); +	printk("       CPU0\n"); +	printk("       ----\n"); +	printk("  lock("); +	__print_lock_name(class); +	printk(");\n"); +	printk("  <Interrupt>\n"); +	printk("    lock("); +	__print_lock_name(class); +	printk(");\n"); +	printk("\n *** DEADLOCK ***\n\n"); +} + +static int +print_usage_bug(struct task_struct *curr, struct held_lock *this, +		enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +{ +	if (!debug_locks_off_graph_unlock() || debug_locks_silent) +		return 0; + +	printk("\n"); +	printk("=================================\n"); +	printk("[ INFO: inconsistent lock state ]\n"); +	print_kernel_ident(); +	printk("---------------------------------\n"); + +	printk("inconsistent {%s} -> {%s} usage.\n", +		usage_str[prev_bit], usage_str[new_bit]); + +	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", +		curr->comm, task_pid_nr(curr), +		trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, +		trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, +		trace_hardirqs_enabled(curr), +		trace_softirqs_enabled(curr)); +	print_lock(this); + +	printk("{%s} state was registered at:\n", usage_str[prev_bit]); +	
print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); + +	print_irqtrace_events(curr); +	printk("\nother info that might help us debug this:\n"); +	print_usage_bug_scenario(this); + +	lockdep_print_held_locks(curr); + +	printk("\nstack backtrace:\n"); +	dump_stack(); + +	return 0; +} + +/* + * Print out an error if an invalid bit is set: + */ +static inline int +valid_state(struct task_struct *curr, struct held_lock *this, +	    enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +{ +	if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) +		return print_usage_bug(curr, this, bad_bit, new_bit); +	return 1; +} + +static int mark_lock(struct task_struct *curr, struct held_lock *this, +		     enum lock_usage_bit new_bit); + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +/* + * print irq inversion bug: + */ +static int +print_irq_inversion_bug(struct task_struct *curr, +			struct lock_list *root, struct lock_list *other, +			struct held_lock *this, int forwards, +			const char *irqclass) +{ +	struct lock_list *entry = other; +	struct lock_list *middle = NULL; +	int depth; + +	if (!debug_locks_off_graph_unlock() || debug_locks_silent) +		return 0; + +	printk("\n"); +	printk("=========================================================\n"); +	printk("[ INFO: possible irq lock inversion dependency detected ]\n"); +	print_kernel_ident(); +	printk("---------------------------------------------------------\n"); +	printk("%s/%d just changed the state of lock:\n", +		curr->comm, task_pid_nr(curr)); +	print_lock(this); +	if (forwards) +		printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); +	else +		printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); +	print_lock_name(other->class); +	printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + +	printk("\nother info that might help us debug this:\n"); + +	/* Find a middle lock (if one exists) */ +	
depth = get_lock_depth(other); +	do { +		if (depth == 0 && (entry != root)) { +			printk("lockdep:%s bad path found in chain graph\n", __func__); +			break; +		} +		middle = entry; +		entry = get_lock_parent(entry); +		depth--; +	} while (entry && entry != root && (depth >= 0)); +	if (forwards) +		print_irq_lock_scenario(root, other, +			middle ? middle->class : root->class, other->class); +	else +		print_irq_lock_scenario(other, root, +			middle ? middle->class : other->class, root->class); + +	lockdep_print_held_locks(curr); + +	printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); +	if (!save_trace(&root->trace)) +		return 0; +	print_shortest_lock_dependencies(other, root); + +	printk("\nstack backtrace:\n"); +	dump_stack(); + +	return 0; +} + +/* + * Prove that in the forwards-direction subgraph starting at <this> + * there is no lock matching <mask>: + */ +static int +check_usage_forwards(struct task_struct *curr, struct held_lock *this, +		     enum lock_usage_bit bit, const char *irqclass) +{ +	int ret; +	struct lock_list root; +	struct lock_list *uninitialized_var(target_entry); + +	root.parent = NULL; +	root.class = hlock_class(this); +	ret = find_usage_forwards(&root, bit, &target_entry); +	if (ret < 0) +		return print_bfs_bug(ret); +	if (ret == 1) +		return ret; + +	return print_irq_inversion_bug(curr, &root, target_entry, +					this, 1, irqclass); +} + +/* + * Prove that in the backwards-direction subgraph starting at <this> + * there is no lock matching <mask>: + */ +static int +check_usage_backwards(struct task_struct *curr, struct held_lock *this, +		      enum lock_usage_bit bit, const char *irqclass) +{ +	int ret; +	struct lock_list root; +	struct lock_list *uninitialized_var(target_entry); + +	root.parent = NULL; +	root.class = hlock_class(this); +	ret = find_usage_backwards(&root, bit, &target_entry); +	if (ret < 0) +		return print_bfs_bug(ret); +	if (ret == 1) +		return ret; + +	return print_irq_inversion_bug(curr, &root, 
target_entry,
+					this, 0, irqclass);
+}
+/*
+ * Print @curr's irq event counter, followed by the event number and the
+ * address (through print_ip_sym()) of the last hardirq enable, hardirq
+ * disable, softirq enable and softirq disable recorded for that task.
+ */
+void print_irqtrace_events(struct task_struct *curr)
+{
+	printk("irq event stamp: %u\n", curr->irq_events);
+	printk("hardirqs last  enabled at (%u): ", curr->hardirq_enable_event);
+	print_ip_sym(curr->hardirq_enable_ip);
+	printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
+	print_ip_sym(curr->hardirq_disable_ip);
+	printk("softirqs last  enabled at (%u): ", curr->softirq_enable_event);
+	print_ip_sym(curr->softirq_enable_ip);
+	printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
+	print_ip_sym(curr->softirq_disable_ip);
+}
+/*
+ * Per-state verbosity hooks. Each one returns class_filter(@class) when
+ * the matching *_VERBOSE macro is non-zero at compile time and 0
+ * otherwise.
+ */
+static int HARDIRQ_verbose(struct lock_class *class)
+{
+#if HARDIRQ_VERBOSE
+	return class_filter(class);
+#endif
+	return 0;
+}
+
+static int SOFTIRQ_verbose(struct lock_class *class)
+{
+#if SOFTIRQ_VERBOSE
+	return class_filter(class);
+#endif
+	return 0;
+}
+
+static int RECLAIM_FS_verbose(struct lock_class *class)
+{
+#if RECLAIM_VERBOSE
+	return class_filter(class);
+#endif
+	return 0;
+}
+
+#define STRICT_READ_CHECKS	1
+/*
+ * Table of the <STATE>_verbose() hooks, one slot per LOCKDEP_STATE()
+ * entry of lockdep_states.h, in the order that file lists them.
+ */
+static int (*state_verbose_f[])(struct lock_class *class) = {
+#define LOCKDEP_STATE(__STATE) \
+	__STATE##_verbose,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+/*
+ * Call the verbosity hook of the state that usage bit @bit belongs to.
+ */
+static inline int state_verbose(enum lock_usage_bit bit,
+				struct lock_class *class)
+{
+	return state_verbose_f[bit >> 2](class);	/* four usage bits per state */
+}
+
+typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
+			     enum lock_usage_bit bit, const char *name);
+/*
+ * Validate the irq-state usage bit @new_bit that mark_lock() is setting
+ * on @this. Returns 0 when a conflict was found, 2 when state_verbose()
+ * asks for the new state to be printed (mark_lock() does the printing),
+ * and 1 otherwise.
+ */
+static int
+mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+		enum lock_usage_bit new_bit)
+{
+	int excl_bit = exclusive_bit(new_bit);
+	int read = new_bit & 1;	/* bit 0: the _READ flavour of the usage bit */
+	int dir = new_bit & 2;	/* bit 1: set for ENABLED, clear for USED_IN */
+
+	/*
+	 * mark USED_IN has to look forwards -- to ensure no dependency
+	 * has ENABLED state, which would allow recursion deadlocks.
+	 *
+	 * mark ENABLED has to look backwards -- to ensure no dependee
+	 * has USED_IN state, which, again, would allow  recursion deadlocks. 
+	 */
+	check_usage_f usage = dir ?
+		check_usage_backwards : check_usage_forwards;
+
+	/*
+	 * Validate that this particular lock does not have conflicting
+	 * usage states.
+	 */
+	if (!valid_state(curr, this, new_bit, excl_bit))
+		return 0;
+
+	/*
+	 * Validate that the lock dependencies don't have conflicting usage
+	 * states.
+	 */
+	if ((!read || !dir || STRICT_READ_CHECKS) &&
+			!usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+		return 0;
+
+	/*
+	 * Check for read in write conflicts
+	 */
+	if (!read) {
+		if (!valid_state(curr, this, new_bit, excl_bit + 1))
+			return 0;
+
+		if (STRICT_READ_CHECKS &&
+			!usage(curr, this, excl_bit + 1,
+				state_name(new_bit + 1)))
+			return 0;
+	}
+
+	if (state_verbose(new_bit, hlock_class(this)))
+		return 2;	/* ok, but the caller should print the new state */
+
+	return 1;
+}
+/*
+ * One enumerator per LOCKDEP_STATE() entry of lockdep_states.h; used by
+ * mark_held_locks() to compute the usage bit of a state.
+ */
+enum mark_type {
+#define LOCKDEP_STATE(__STATE)	__STATE,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+/*
+ * Mark all held locks with a usage bit:
+ *
+ * For every entry on @curr's held-lock stack the ENABLED usage bit of
+ * state @mark (its READ variant for read acquisitions) is set through
+ * mark_lock(). Entries with ->check == 0 are skipped. Returns 0 as soon
+ * as mark_lock() fails, 1 otherwise.
+ */
+static int
+mark_held_locks(struct task_struct *curr, enum mark_type mark)
+{
+	enum lock_usage_bit usage_bit;
+	struct held_lock *hlock;
+	int i;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		hlock = curr->held_locks + i;
+
+		usage_bit = 2 + (mark << 2); /* ENABLED */
+		if (hlock->read)
+			usage_bit += 1; /* READ */
+
+		BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+
+		if (!hlock->check)
+			continue;
+
+		if (!mark_lock(curr, hlock, usage_bit))
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Hardirqs will be enabled:
+ *
+ * Flags hardirqs as enabled for the current task and marks all held
+ * locks accordingly. Only if the marking succeeds are @ip and a fresh
+ * event number recorded as the last hardirq-enable site.
+ */
+static void __trace_hardirqs_on_caller(unsigned long ip)
+{
+	struct task_struct *curr = current;
+
+	/* we'll do an OFF -> ON transition: */
+	curr->hardirqs_enabled = 1;
+
+	/*
+	 * We are going to turn hardirqs on, so set the
+	 * usage bit for all held locks:
+	 */
+	if (!mark_held_locks(curr, HARDIRQ))
+		return;
+	/*
+	 * If we have softirqs enabled, then set the usage
+	 * bit for all held locks. 
(disabled hardirqs prevented
+	 * this bit from being set before)
+	 */
+	if (curr->softirqs_enabled)
+		if (!mark_held_locks(curr, SOFTIRQ))
+			return;
+
+	curr->hardirq_enable_ip = ip;
+	curr->hardirq_enable_event = ++curr->irq_events;
+	debug_atomic_inc(hardirqs_on_events);
+}
+/*
+ * Annotation hook for "hardirqs are about to be enabled", called with
+ * the address @ip to record. After time_hardirqs_on() it bails out when
+ * lock debugging is off, when we are already inside lockdep, or when
+ * the tracked state already says "enabled" (only counted as redundant).
+ * Otherwise the context is sanity-checked and
+ * __trace_hardirqs_on_caller() is run with lockdep_recursion set.
+ */
+__visible void trace_hardirqs_on_caller(unsigned long ip)
+{
+	time_hardirqs_on(CALLER_ADDR0, ip);
+
+	if (unlikely(!debug_locks || current->lockdep_recursion))
+		return;
+
+	if (unlikely(current->hardirqs_enabled)) {
+		/*
+		 * Neither irq nor preemption are disabled here
+		 * so this is racy by nature but losing one hit
+		 * in a stat is not a big deal.
+		 */
+		__debug_atomic_inc(redundant_hardirqs_on);
+		return;
+	}
+
+	/*
+	 * We're enabling irqs and according to our state above irqs weren't
+	 * already enabled, yet we find the hardware thinks they are in fact
+	 * enabled.. someone messed up their IRQ state tracing.
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return;
+
+	/*
+	 * See the fine text that goes along with this variable definition.
+	 */
+	if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
+		return;
+
+	/*
+	 * Can't allow enabling interrupts while in an interrupt handler,
+	 * that's general bad form and such. Recursion, limited stack etc.. 
+	 */
+	if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+		return;
+
+	current->lockdep_recursion = 1;
+	__trace_hardirqs_on_caller(ip);
+	current->lockdep_recursion = 0;
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+/*
+ * Same as trace_hardirqs_on_caller(), with CALLER_ADDR0 passed as @ip.
+ */
+void trace_hardirqs_on(void)
+{
+	trace_hardirqs_on_caller(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+/*
+ * Hardirqs were disabled:
+ *
+ * On an ON -> OFF transition the tracked state is cleared and @ip plus a
+ * fresh event number are recorded as the last hardirq-disable site; a
+ * call while already "off" is only counted as redundant. Nothing beyond
+ * time_hardirqs_off() happens when lock debugging is off or when we are
+ * already inside lockdep.
+ */
+__visible void trace_hardirqs_off_caller(unsigned long ip)
+{
+	struct task_struct *curr = current;
+
+	time_hardirqs_off(CALLER_ADDR0, ip);
+
+	if (unlikely(!debug_locks || current->lockdep_recursion))
+		return;
+
+	/*
+	 * So we're supposed to get called after you mask local IRQs, but for
+	 * some reason the hardware doesn't quite think you did a proper job.
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return;
+
+	if (curr->hardirqs_enabled) {
+		/*
+		 * We have done an ON -> OFF transition:
+		 */
+		curr->hardirqs_enabled = 0;
+		curr->hardirq_disable_ip = ip;
+		curr->hardirq_disable_event = ++curr->irq_events;
+		debug_atomic_inc(hardirqs_off_events);
+	} else
+		debug_atomic_inc(redundant_hardirqs_off);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+/*
+ * Same as trace_hardirqs_off_caller(), with CALLER_ADDR0 passed as @ip.
+ */
+void trace_hardirqs_off(void)
+{
+	trace_hardirqs_off_caller(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+/*
+ * Softirqs will be enabled:
+ *
+ * Records @ip and a fresh event number as the last softirq-enable site
+ * and, if hardirqs are tracked as enabled too, marks all held locks with
+ * the SOFTIRQ state. A call while softirqs are already tracked as
+ * enabled is only counted as redundant.
+ */
+void trace_softirqs_on(unsigned long ip)
+{
+	struct task_struct *curr = current;
+
+	if (unlikely(!debug_locks || current->lockdep_recursion))
+		return;
+
+	/*
+	 * We fancy IRQs being disabled here, see softirq.c, avoids
+	 * funny state and nesting things. 
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return;
+
+	if (curr->softirqs_enabled) {
+		debug_atomic_inc(redundant_softirqs_on);
+		return;
+	}
+
+	current->lockdep_recursion = 1;
+	/*
+	 * We'll do an OFF -> ON transition:
+	 */
+	curr->softirqs_enabled = 1;
+	curr->softirq_enable_ip = ip;
+	curr->softirq_enable_event = ++curr->irq_events;
+	debug_atomic_inc(softirqs_on_events);
+	/*
+	 * We are going to turn softirqs on, so set the
+	 * usage bit for all held locks, if hardirqs are
+	 * enabled too:
+	 */
+	if (curr->hardirqs_enabled)
+		mark_held_locks(curr, SOFTIRQ);
+	current->lockdep_recursion = 0;
+}
+
+/*
+ * Softirqs were disabled:
+ *
+ * On an ON -> OFF transition the tracked state is cleared and @ip plus a
+ * fresh event number are recorded as the last softirq-disable site; a
+ * call while already "off" is only counted as redundant.
+ */
+void trace_softirqs_off(unsigned long ip)
+{
+	struct task_struct *curr = current;
+
+	if (unlikely(!debug_locks || current->lockdep_recursion))
+		return;
+
+	/*
+	 * We fancy IRQs being disabled here, see softirq.c
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return;
+
+	if (curr->softirqs_enabled) {
+		/*
+		 * We have done an ON -> OFF transition:
+		 */
+		curr->softirqs_enabled = 0;
+		curr->softirq_disable_ip = ip;
+		curr->softirq_disable_event = ++curr->irq_events;
+		debug_atomic_inc(softirqs_off_events);
+		/*
+		 * Whoops, we wanted softirqs off, so why aren't they?
+		 */
+		DEBUG_LOCKS_WARN_ON(!softirq_count());
+	} else
+		debug_atomic_inc(redundant_softirqs_off);
+}
+/*
+ * Mark all held locks with the RECLAIM_FS state for an allocation with
+ * @gfp_mask, unless one of the early-outs below applies. @flags is the
+ * irq state saved by the caller and is only used for the sanity check.
+ */
+static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
+{
+	struct task_struct *curr = current;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	/* no reclaim without waiting on it */
+	if (!(gfp_mask & __GFP_WAIT))
+		return;
+
+	/* this guy won't enter reclaim */
+	if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+		return;
+
+	/* We're only interested __GFP_FS allocations for now */
+	if (!(gfp_mask & __GFP_FS))
+		return;
+
+	/*
+	 * Oi! Can't be having __GFP_FS allocations with IRQs disabled. 
+	 */
+	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
+		return;
+
+	mark_held_locks(curr, RECLAIM_FS);
+}
+
+static void check_flags(unsigned long flags);
+/*
+ * Allocation-site annotation: runs __lockdep_trace_alloc() for
+ * @gfp_mask with local irqs saved/disabled and lockdep_recursion set.
+ * Does nothing when called from inside lockdep.
+ */
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	__lockdep_trace_alloc(gfp_mask, flags);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+/*
+ * Set the context-dependent usage bits on a lock that is being acquired:
+ * USED_IN_* for the irq contexts @curr is in (non-trylock only),
+ * ENABLED_* for the irq types that are enabled, and USED_IN_RECLAIM_FS
+ * when the task's reclaim state has __GFP_FS set. The _READ variants are
+ * used for read acquisitions. Returns 0 as soon as one mark_lock() call
+ * fails, 1 otherwise.
+ */
+static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+{
+	/*
+	 * If non-trylock use in a hardirq or softirq context, then
+	 * mark the lock as used in these contexts:
+	 */
+	if (!hlock->trylock) {
+		if (hlock->read) {
+			if (curr->hardirq_context)
+				if (!mark_lock(curr, hlock,
+						LOCK_USED_IN_HARDIRQ_READ))
+					return 0;
+			if (curr->softirq_context)
+				if (!mark_lock(curr, hlock,
+						LOCK_USED_IN_SOFTIRQ_READ))
+					return 0;
+		} else {
+			if (curr->hardirq_context)
+				if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
+					return 0;
+			if (curr->softirq_context)
+				if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
+					return 0;
+		}
+	}
+	if (!hlock->hardirqs_off) {
+		if (hlock->read) {
+			if (!mark_lock(curr, hlock,
+					LOCK_ENABLED_HARDIRQ_READ))
+				return 0;
+			if (curr->softirqs_enabled)
+				if (!mark_lock(curr, hlock,
+						LOCK_ENABLED_SOFTIRQ_READ))
+					return 0;
+		} else {
+			if (!mark_lock(curr, hlock,
+					LOCK_ENABLED_HARDIRQ))
+				return 0;
+			if (curr->softirqs_enabled)
+				if (!mark_lock(curr, hlock,
+						LOCK_ENABLED_SOFTIRQ))
+					return 0;
+		}
+	}
+
+	/*
+	 * We reuse the irq context infrastructure more broadly as a general
+	 * context checking code. This tests GFP_FS recursion (a lock taken
+	 * during reclaim for a GFP_FS allocation is held over a GFP_FS
+	 * allocation). 
+	 */
+	if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
+		if (hlock->read) {
+			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
+					return 0;
+		} else {
+			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
+					return 0;
+		}
+	}
+
+	return 1;
+}
+/*
+ * Store the irq context @curr is in into @hlock->irq_context and
+ * return 1 if it differs from the context of the lock currently on top
+ * of the held-lock stack. Returns 0 when the contexts match or when no
+ * lock is held yet.
+ */
+static int separate_irq_context(struct task_struct *curr,
+		struct held_lock *hlock)
+{
+	unsigned int depth = curr->lockdep_depth;
+
+	/*
+	 * Keep track of points where we cross into an interrupt context:
+	 */
+	hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
+				curr->softirq_context;
+	if (depth) {
+		struct held_lock *prev_hlock;
+
+		prev_hlock = curr->held_locks + depth-1;
+		/*
+		 * If we cross into another context, reset the
+		 * hash key (this also prevents the checking and the
+		 * adding of the dependency to 'prev'):
+		 */
+		if (prev_hlock->irq_context != hlock->irq_context)
+			return 1;
+	}
+	return 0;
+}
+
+#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+/*
+ * Stubs used when the irq-state checks above are compiled out.
+ */
+static inline
+int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+		enum lock_usage_bit new_bit)
+{
+	WARN_ON(1); /* Impossible innit? 
when we don't have TRACE_IRQFLAG */
+	return 1;
+}
+
+static inline int mark_irqflags(struct task_struct *curr,
+		struct held_lock *hlock)
+{
+	return 1;	/* nothing to mark, always succeeds */
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+		struct held_lock *hlock)
+{
+	return 0;	/* never reports a context change */
+}
+
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+}
+
+#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
+/*
+ * Mark a lock with a usage bit, and validate the state transition:
+ *
+ * Returns 1 right away if @new_bit is already set in the class of
+ * @this. Otherwise the bit is set under the graph lock, a stack trace is
+ * saved for it and, for the irq-state bits, mark_lock_irq() validates
+ * the transition. Returns 0 on any failure; otherwise returns the
+ * mark_lock_irq() result (1 or 2), or 1 for LOCK_USED. A result of 2
+ * additionally prints the new state after the graph lock was dropped.
+ *
+ * NOTE(review): the early "return 0" paths after graph_lock() do not
+ * call graph_unlock() here; presumably the failing helpers release it
+ * themselves - confirm against save_trace() and the print_*() helpers.
+ */
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+			     enum lock_usage_bit new_bit)
+{
+	unsigned int new_mask = 1 << new_bit, ret = 1;
+
+	/*
+	 * If already set then do not dirty the cacheline,
+	 * nor do any checks:
+	 */
+	if (likely(hlock_class(this)->usage_mask & new_mask))
+		return 1;
+
+	if (!graph_lock())
+		return 0;
+	/*
+	 * Make sure we didn't race:
+	 */
+	if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
+		graph_unlock();
+		return 1;
+	}
+
+	hlock_class(this)->usage_mask |= new_mask;
+
+	if (!save_trace(hlock_class(this)->usage_traces + new_bit))
+		return 0;
+
+	switch (new_bit) {
+#define LOCKDEP_STATE(__STATE)			\
+	case LOCK_USED_IN_##__STATE:		\
+	case LOCK_USED_IN_##__STATE##_READ:	\
+	case LOCK_ENABLED_##__STATE:		\
+	case LOCK_ENABLED_##__STATE##_READ:
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+		ret = mark_lock_irq(curr, this, new_bit);
+		if (!ret)
+			return 0;
+		break;
+	case LOCK_USED:
+		debug_atomic_dec(nr_unused_locks);
+		break;
+	default:
+		if (!debug_locks_off_graph_unlock())
+			return 0;
+		WARN_ON(1);
+		return 0;
+	}
+
+	graph_unlock();
+
+	/*
+	 * We must printk outside of the graph_lock:
+	 */
+	if (ret == 2) {
+		printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
+		print_lock(this);
+		print_irqtrace_events(curr);
+		dump_stack();
+	}
+
+	return ret;
+}
+
+/*
+ * Initialize a lock instance's lock-class mapping info:
+ *
+ * Clears the class cache and sets ->name and ->key of @lock. A NULL
+ * @name, a NULL @key or a @key rejected by static_obj() triggers a
+ * warning and an early return. With lock debugging active and a
+ * non-zero @subclass the class is also registered right away.
+ */
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+		      
struct lock_class_key *key, int subclass) +{ +	int i; + +	kmemcheck_mark_initialized(lock, sizeof(*lock)); + +	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) +		lock->class_cache[i] = NULL; + +#ifdef CONFIG_LOCK_STAT +	lock->cpu = raw_smp_processor_id(); +#endif + +	/* +	 * Can't be having no nameless bastards around this place! +	 */ +	if (DEBUG_LOCKS_WARN_ON(!name)) { +		lock->name = "NULL"; +		return; +	} + +	lock->name = name; + +	/* +	 * No key, no joy, we need to hash something. +	 */ +	if (DEBUG_LOCKS_WARN_ON(!key)) +		return; +	/* +	 * Sanity check, the lock-class key must be persistent: +	 */ +	if (!static_obj(key)) { +		printk("BUG: key %p not in .data!\n", key); +		/* +		 * What it says above ^^^^^, I suggest you read it. +		 */ +		DEBUG_LOCKS_WARN_ON(1); +		return; +	} +	lock->key = key; + +	if (unlikely(!debug_locks)) +		return; + +	if (subclass) +		register_lock_class(lock, subclass, 1); +} +EXPORT_SYMBOL_GPL(lockdep_init_map); + +struct lock_class_key __lockdep_no_validate__; +EXPORT_SYMBOL_GPL(__lockdep_no_validate__); + +static int +print_lock_nested_lock_not_held(struct task_struct *curr, +				struct held_lock *hlock, +				unsigned long ip) +{ +	if (!debug_locks_off()) +		return 0; +	if (debug_locks_silent) +		return 0; + +	printk("\n"); +	printk("==================================\n"); +	printk("[ BUG: Nested lock was not taken ]\n"); +	print_kernel_ident(); +	printk("----------------------------------\n"); + +	printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); +	print_lock(hlock); + +	printk("\nbut this task is not holding:\n"); +	printk("%s\n", hlock->nest_lock->name); + +	printk("\nstack backtrace:\n"); +	dump_stack(); + +	printk("\nother info that might help us debug this:\n"); +	lockdep_print_held_locks(curr); + +	printk("\nstack backtrace:\n"); +	dump_stack(); + +	return 0; +} + +static int __lock_is_held(struct lockdep_map *lock); + +/* + * This gets called for every mutex_lock*()/spin_lock*() operation. 
+ * We maintain the dependency maps and validate the locking attempt:
+ *
+ * Returns 1 when the acquisition was recorded on the held-lock stack of
+ * the current task (or folded into the reference count of the top
+ * entry), and 0 otherwise - including when lock debugging is off or
+ * gets switched off along the way.
+ */
+static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+			  int trylock, int read, int check, int hardirqs_off,
+			  struct lockdep_map *nest_lock, unsigned long ip,
+			  int references)
+{
+	struct task_struct *curr = current;
+	struct lock_class *class = NULL;
+	struct held_lock *hlock;
+	unsigned int depth, id;
+	int chain_head = 0;
+	int class_idx;
+	u64 chain_key;
+
+	if (unlikely(!debug_locks))
+		return 0;
+
+	/*
+	 * Lockdep should run with IRQs disabled, otherwise we could
+	 * get an interrupt which would want to take locks, which would
+	 * end up in lockdep and have you got a head-ache already?
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return 0;
+
+	if (!prove_locking || lock->key == &__lockdep_no_validate__)
+		check = 0;	/* record the lock, but skip the validation */
+
+	if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+		class = lock->class_cache[subclass];
+	/*
+	 * Not cached?
+	 */
+	if (unlikely(!class)) {
+		class = register_lock_class(lock, subclass, 0);
+		if (!class)
+			return 0;
+	}
+	atomic_inc((atomic_t *)&class->ops);
+	if (very_verbose(class)) {
+		printk("\nacquire class [%p] %s", class->key, class->name);
+		if (class->name_version > 1)
+			printk("#%d", class->name_version);
+		printk("\n");
+		dump_stack();
+	}
+
+	/*
+	 * Add the lock to the list of currently held locks.
+	 * (we dont increase the depth just yet, up until the
+	 * dependency checks are done)
+	 */
+	depth = curr->lockdep_depth;
+	/*
+	 * Ran out of static storage for our per-task lock stack again have we? 
+	 */
+	if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
+		return 0;
+
+	class_idx = class - lock_classes + 1;	/* index into lock_classes[], offset by one */
+
+	if (depth) {
+		hlock = curr->held_locks + depth - 1;
+		/*
+		 * Same class as the top entry and a nest_lock was given:
+		 * only bump that entry's reference count, no new entry.
+		 */
+		if (hlock->class_idx == class_idx && nest_lock) {
+			if (hlock->references)
+				hlock->references++;
+			else
+				hlock->references = 2;
+
+			return 1;
+		}
+	}
+
+	hlock = curr->held_locks + depth;
+	/*
+	 * Plain impossible, we just registered it and checked it weren't no
+	 * NULL like.. I bet this mushroom I ate was good!
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!class))
+		return 0;
+	hlock->class_idx = class_idx;
+	hlock->acquire_ip = ip;
+	hlock->instance = lock;
+	hlock->nest_lock = nest_lock;
+	hlock->trylock = trylock;
+	hlock->read = read;
+	hlock->check = check;
+	hlock->hardirqs_off = !!hardirqs_off;
+	hlock->references = references;
+#ifdef CONFIG_LOCK_STAT
+	hlock->waittime_stamp = 0;
+	hlock->holdtime_stamp = lockstat_clock();
+#endif
+
+	if (check && !mark_irqflags(curr, hlock))
+		return 0;
+
+	/* mark it as used: */
+	if (!mark_lock(curr, hlock, LOCK_USED))
+		return 0;
+
+	/*
+	 * Calculate the chain hash: it's the combined hash of all the
+	 * lock keys along the dependency chain. We save the hash value
+	 * at every step so that we can get the current hash easily
+	 * after unlock. The chain hash is then used to cache dependency
+	 * results.
+	 *
+	 * The 'key ID' is what is the most compact key value to drive
+	 * the hash, not class->key.
+	 */
+	id = class - lock_classes;
+	/*
+	 * Whoops, we did it again.. ran straight out of our static allocation.
+	 */
+	if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+		return 0;
+
+	chain_key = curr->curr_chain_key;
+	if (!depth) {
+		/*
+		 * How can we have a chain hash when we ain't got no keys?! 
+		 */
+		if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+			return 0;
+		chain_head = 1;
+	}
+
+	hlock->prev_chain_key = chain_key;	/* restored into curr_chain_key on release */
+	if (separate_irq_context(curr, hlock)) {
+		chain_key = 0;
+		chain_head = 1;
+	}
+	chain_key = iterate_chain_key(chain_key, id);
+
+	if (nest_lock && !__lock_is_held(nest_lock))
+		return print_lock_nested_lock_not_held(curr, hlock, ip);
+
+	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+		return 0;
+
+	curr->curr_chain_key = chain_key;
+	curr->lockdep_depth++;	/* only now does the new entry count as held */
+	check_chain_key(curr);
+#ifdef CONFIG_DEBUG_LOCKDEP
+	if (unlikely(!debug_locks))
+		return 0;
+#endif
+	if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
+		debug_locks_off();
+		print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+		printk(KERN_DEBUG "depth: %i  max: %lu!\n",
+		       curr->lockdep_depth, MAX_LOCK_DEPTH);
+
+		lockdep_print_held_locks(current);
+		debug_show_all_locks();
+		dump_stack();
+
+		return 0;
+	}
+
+	if (unlikely(curr->lockdep_depth > max_lockdep_depth))
+		max_lockdep_depth = curr->lockdep_depth;
+
+	return 1;
+}
+/*
+ * Report an unlock of @lock at @ip that has no matching held lock.
+ * Turns lock debugging off first; nothing is printed if that fails or
+ * if debug_locks_silent is set. Always returns 0.
+ */
+static int
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+			   unsigned long ip)
+{
+	if (!debug_locks_off())
+		return 0;
+	if (debug_locks_silent)
+		return 0;
+
+	printk("\n");
+	printk("=====================================\n");
+	printk("[ BUG: bad unlock balance detected! 
]\n");
+	print_kernel_ident();
+	printk("-------------------------------------\n");
+	printk("%s/%d is trying to release lock (",
+		curr->comm, task_pid_nr(curr));
+	print_lockdep_cache(lock);
+	printk(") at:\n");
+	print_ip_sym(ip);
+	printk("but there are no more locks to release!\n");
+	printk("\nother info that might help us debug this:\n");
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+
+/*
+ * Common debugging checks for both nested and non-nested unlock:
+ *
+ * Returns 1 if the unlock may proceed. Returns 0 when lock debugging is
+ * off, when irqs are not disabled (with a warning), or when @curr holds
+ * no locks at all - the latter is reported as an unlock imbalance.
+ */
+static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
+			unsigned long ip)
+{
+	if (unlikely(!debug_locks))
+		return 0;
+	/*
+	 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return 0;
+
+	if (curr->lockdep_depth <= 0)
+		return print_unlock_imbalance_bug(curr, lock, ip);
+
+	return 1;
+}
+/*
+ * Does the held-lock entry @hlock stand for @lock? Returns 1 when it
+ * records this very instance, or when it is a reference-counted entry
+ * whose class index matches subclass 0 of @lock. Returns 0 otherwise.
+ */
+static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+{
+	if (hlock->instance == lock)
+		return 1;
+
+	if (hlock->references) {
+		struct lock_class *class = lock->class_cache[0];
+
+		if (!class)
+			class = look_up_lock_class(lock, 0);
+
+		/*
+		 * If look_up_lock_class() failed to find a class, we're trying
+		 * to test if we hold a lock that has never yet been acquired.
+		 * Clearly if the lock hasn't been acquired _ever_, we're not
+		 * holding it either, so report failure.
+		 */
+		if (!class)
+			return 0;
+
+		/*
+		 * References, but not a lock we're actually ref-counting?
+		 * State got messed up, follow the sites that change ->references
+		 * and try to make sense of it. 
+		 */ +		if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) +			return 0; + +		if (hlock->class_idx == class - lock_classes + 1) +			return 1; +	} + +	return 0; +} + +static int +__lock_set_class(struct lockdep_map *lock, const char *name, +		 struct lock_class_key *key, unsigned int subclass, +		 unsigned long ip) +{ +	struct task_struct *curr = current; +	struct held_lock *hlock, *prev_hlock; +	struct lock_class *class; +	unsigned int depth; +	int i; + +	depth = curr->lockdep_depth; +	/* +	 * This function is about (re)setting the class of a held lock, +	 * yet we're not actually holding any locks. Naughty user! +	 */ +	if (DEBUG_LOCKS_WARN_ON(!depth)) +		return 0; + +	prev_hlock = NULL; +	for (i = depth-1; i >= 0; i--) { +		hlock = curr->held_locks + i; +		/* +		 * We must not cross into another context: +		 */ +		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) +			break; +		if (match_held_lock(hlock, lock)) +			goto found_it; +		prev_hlock = hlock; +	} +	return print_unlock_imbalance_bug(curr, lock, ip); + +found_it: +	lockdep_init_map(lock, name, key, 0); +	class = register_lock_class(lock, subclass, 0); +	hlock->class_idx = class - lock_classes + 1; + +	curr->lockdep_depth = i; +	curr->curr_chain_key = hlock->prev_chain_key; + +	for (; i < depth; i++) { +		hlock = curr->held_locks + i; +		if (!__lock_acquire(hlock->instance, +			hlock_class(hlock)->subclass, hlock->trylock, +				hlock->read, hlock->check, hlock->hardirqs_off, +				hlock->nest_lock, hlock->acquire_ip, +				hlock->references)) +			return 0; +	} + +	/* +	 * I took it apart and put it back together again, except now I have +	 * these 'spare' parts.. where shall I put them. +	 */ +	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) +		return 0; +	return 1; +} + +/* + * Remove the lock to the list of currently held locks in a + * potentially non-nested (out of order) manner. 
This is a
+ * relatively rare operation, as all the unlock APIs default
+ * to nested mode (which uses lock_release_nested()):
+ *
+ * Returns 1 when the entry was removed (or only lost one of several
+ * references) and the stack is consistent again. Returns 0 if no locks
+ * are held, if @lock is not found (reported as an unlock imbalance), if
+ * re-acquiring the entries above it fails, or if the final depth check
+ * fails.
+ */
+static int
+lock_release_non_nested(struct task_struct *curr,
+			struct lockdep_map *lock, unsigned long ip)
+{
+	struct held_lock *hlock, *prev_hlock;
+	unsigned int depth;
+	int i;
+
+	/*
+	 * Check whether the lock exists in the current stack
+	 * of held locks:
+	 */
+	depth = curr->lockdep_depth;
+	/*
+	 * So we're all set to release this lock.. wait what lock? We don't
+	 * own any locks, you've been drinking again?
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (match_held_lock(hlock, lock))
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	return print_unlock_imbalance_bug(curr, lock, ip);
+
+found_it:
+	if (hlock->instance == lock)
+		lock_release_holdtime(hlock);
+
+	if (hlock->references) {
+		hlock->references--;
+		if (hlock->references) {
+			/*
+			 * We had, and after removing one, still have
+			 * references, the current lock stack is still
+			 * valid. We're done!
+			 */
+			return 1;
+		}
+	}
+
+	/*
+	 * We have the right lock to unlock, 'hlock' points to it.
+	 * Now we remove it from the stack, and add back the other
+	 * entries (if any), recalculating the hash along the way:
+	 */
+
+	curr->lockdep_depth = i;
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	for (i++; i < depth; i++) {
+		hlock = curr->held_locks + i;
+		if (!__lock_acquire(hlock->instance,
+			hlock_class(hlock)->subclass, hlock->trylock,
+				hlock->read, hlock->check, hlock->hardirqs_off,
+				hlock->nest_lock, hlock->acquire_ip,
+				hlock->references))
+			return 0;
+	}
+
+	/*
+	 * We had N bottles of beer on the wall, we drank one, but now
+	 * there's not N-1 bottles of beer left on the wall... 
+	 */
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
+		return 0;
+	return 1;
+}
+
+/*
+ * Remove the lock from the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ *
+ * If the top entry is not @lock, or is reference-counted, the work is
+ * handed to lock_release_non_nested() and its result is returned.
+ * Otherwise returns 1 after popping the entry, or 0 if the chain-key
+ * sanity check fails.
+ */
+static int lock_release_nested(struct task_struct *curr,
+			       struct lockdep_map *lock, unsigned long ip)
+{
+	struct held_lock *hlock;
+	unsigned int depth;
+
+	/*
+	 * Pop off the top of the lock stack:
+	 */
+	depth = curr->lockdep_depth - 1;
+	hlock = curr->held_locks + depth;
+
+	/*
+	 * Is the unlock non-nested:
+	 */
+	if (hlock->instance != lock || hlock->references)
+		return lock_release_non_nested(curr, lock, ip);
+	curr->lockdep_depth--;
+
+	/*
+	 * No more locks, but somehow we've got hash left over, who left it?
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
+		return 0;
+
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	lock_release_holdtime(hlock);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+	hlock->prev_chain_key = 0;
+	hlock->class_idx = 0;
+	hlock->acquire_ip = 0;
+	hlock->irq_context = 0;
+#endif
+	return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. 
the current top of the lock-stack is unlocked)
+ *
+ * Note that this helper itself serves both kinds of unlock: after
+ * check_unlock() it calls lock_release_nested() when @nested is set and
+ * lock_release_non_nested() otherwise. check_chain_key() is run only
+ * when the release succeeded.
+ */
+static void
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+	struct task_struct *curr = current;
+
+	if (!check_unlock(curr, lock, ip))
+		return;
+
+	if (nested) {
+		if (!lock_release_nested(curr, lock, ip))
+			return;
+	} else {
+		if (!lock_release_non_nested(curr, lock, ip))
+			return;
+	}
+
+	check_chain_key(curr);
+}
+/*
+ * Returns 1 if any entry on the current task's held-lock stack matches
+ * @lock according to match_held_lock(), 0 otherwise.
+ */
+static int __lock_is_held(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Check whether we follow the irq-flags state precisely:
+ *
+ * Compares the saved irq @flags and the softirq/hardirq counts with the
+ * state tracked in the current task and warns on a mismatch. Compiles
+ * to an empty function unless PROVE_LOCKING, DEBUG_LOCKDEP and
+ * TRACE_IRQFLAGS are all configured.
+ */
+static void check_flags(unsigned long flags)
+{
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
+    defined(CONFIG_TRACE_IRQFLAGS)
+	if (!debug_locks)
+		return;
+
+	if (irqs_disabled_flags(flags)) {
+		if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+			printk("possible reason: unannotated irqs-off.\n");
+		}
+	} else {
+		if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+			printk("possible reason: unannotated irqs-on.\n");
+		}
+	}
+
+	/*
+	 * We dont accurately track softirq state in e.g.
+	 * hardirq contexts (such as on 4KSTACKS), so only
+	 * check if not in hardirq contexts:
+	 */
+	if (!hardirq_count()) {
+		if (softirq_count()) {
+			/* like the above, but with softirqs */
+			DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
+		} else {
+			/* lick the above, does it taste good? 
*/
+			DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+		}
+	}
+
+	if (!debug_locks)
+		print_irqtrace_events(current);	/* a check above just turned debugging off */
+#endif
+}
+/*
+ * Exported wrapper around __lock_set_class(): runs it with local irqs
+ * saved/disabled and lockdep_recursion set, and verifies the chain key
+ * afterwards if it succeeded. Does nothing when called from inside
+ * lockdep.
+ */
+void lock_set_class(struct lockdep_map *lock, const char *name,
+		    struct lock_class_key *key, unsigned int subclass,
+		    unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	current->lockdep_recursion = 1;
+	check_flags(flags);
+	if (__lock_set_class(lock, name, key, subclass, ip))
+		check_chain_key(current);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_set_class);
+
+/*
+ * We are not always called with irqs disabled - do that here,
+ * and also avoid lockdep recursion:
+ *
+ * The irq state found on entry is handed to __lock_acquire() as its
+ * hardirqs_off argument, and 0 is passed for references. The result of
+ * __lock_acquire() is not propagated to the caller.
+ */
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+			  int trylock, int read, int check,
+			  struct lockdep_map *nest_lock, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+	__lock_acquire(lock, subclass, trylock, read, check,
+		       irqs_disabled_flags(flags), nest_lock, ip, 0);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquire);
+/*
+ * Exported wrapper around __lock_release(), using the same irq-save and
+ * recursion protection as lock_acquire().
+ */
+void lock_release(struct lockdep_map *lock, int nested,
+			  unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	trace_lock_release(lock, ip);
+	__lock_release(lock, nested, ip);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_release);
+/*
+ * Exported wrapper around __lock_is_held(). Returns its result, or 1
+ * without looking when called from inside lockdep.
+ */
+int lock_is_held(struct lockdep_map *lock)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	if (unlikely(current->lockdep_recursion))
+		return 1; /* avoid false negative lockdep_assert_held() */
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	ret = __lock_is_held(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lock_is_held);
+/*
+ * Store @gfp_mask as the current task's reclaim state; mark_irqflags()
+ * tests its __GFP_FS bit when a lock is acquired.
+ */
+void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
+{
+	current->lockdep_reclaim_gfp = gfp_mask;
+}
+/*
+ * Reset the current task's reclaim state to 0.
+ */
+void lockdep_clear_current_reclaim_state(void)
+{
+	current->lockdep_reclaim_gfp = 0;
+}
+/*
+ * Lock statistics: the helpers from here up to the matching #endif are
+ * only built with CONFIG_LOCK_STAT.
+ *
+ * print_lock_contention_bug() reports a contention/acquired event on
+ * @lock at @ip for which no matching held lock was found. It turns lock
+ * debugging off first; nothing is printed if that fails or if
+ * debug_locks_silent is set. Always returns 0.
+ */
+#ifdef CONFIG_LOCK_STAT
+static int
+print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
+			   unsigned long ip)
+{
+	if (!debug_locks_off())
+		return 0;
+	if (debug_locks_silent)
+		return 0;
+
+	printk("\n");
+	printk("=================================\n");
+	printk("[ BUG: bad contention detected! ]\n");
+	print_kernel_ident();
+	printk("---------------------------------\n");
+	printk("%s/%d is trying to contend lock (",
+		curr->comm, task_pid_nr(curr));
+	print_lockdep_cache(lock);
+	printk(") at:\n");
+	print_ip_sym(ip);
+	printk("but there are no locks held!\n");
+	printk("\nother info that might help us debug this:\n");
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+/*
+ * Account a contention event on @lock at @ip: stamp the wait start time
+ * on the matching held-lock entry and bump the contention-point,
+ * contending-point and bounce statistics of its class. Entries that
+ * match only by class (->instance != @lock) are not accounted.
+ */
+static void
+__lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class_stats *stats;
+	unsigned int depth;
+	int i, contention_point, contending_point;
+
+	depth = curr->lockdep_depth;
+	/*
+	 * Whee, we contended on this lock, except it seems we're not
+	 * actually trying to acquire anything much at all.. 
+	 */ +	if (DEBUG_LOCKS_WARN_ON(!depth)) +		return; + +	prev_hlock = NULL; +	for (i = depth-1; i >= 0; i--) { +		hlock = curr->held_locks + i; +		/* +		 * We must not cross into another context: +		 */ +		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) +			break; +		if (match_held_lock(hlock, lock)) +			goto found_it; +		prev_hlock = hlock; +	} +	print_lock_contention_bug(curr, lock, ip); +	return; + +found_it: +	if (hlock->instance != lock) +		return; + +	hlock->waittime_stamp = lockstat_clock(); + +	contention_point = lock_point(hlock_class(hlock)->contention_point, ip); +	contending_point = lock_point(hlock_class(hlock)->contending_point, +				      lock->ip); + +	stats = get_lock_stats(hlock_class(hlock)); +	if (contention_point < LOCKSTAT_POINTS) +		stats->contention_point[contention_point]++; +	if (contending_point < LOCKSTAT_POINTS) +		stats->contending_point[contending_point]++; +	if (lock->cpu != smp_processor_id()) +		stats->bounces[bounce_contended + !!hlock->read]++; +	put_lock_stats(stats); +} + +static void +__lock_acquired(struct lockdep_map *lock, unsigned long ip) +{ +	struct task_struct *curr = current; +	struct held_lock *hlock, *prev_hlock; +	struct lock_class_stats *stats; +	unsigned int depth; +	u64 now, waittime = 0; +	int i, cpu; + +	depth = curr->lockdep_depth; +	/* +	 * Yay, we acquired ownership of this lock we didn't try to +	 * acquire, how the heck did that happen? 
+	 */ +	if (DEBUG_LOCKS_WARN_ON(!depth)) +		return; + +	prev_hlock = NULL; +	for (i = depth-1; i >= 0; i--) { +		hlock = curr->held_locks + i; +		/* +		 * We must not cross into another context: +		 */ +		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) +			break; +		if (match_held_lock(hlock, lock)) +			goto found_it; +		prev_hlock = hlock; +	} +	print_lock_contention_bug(curr, lock, _RET_IP_); +	return; + +found_it: +	if (hlock->instance != lock) +		return; + +	cpu = smp_processor_id(); +	if (hlock->waittime_stamp) { +		now = lockstat_clock(); +		waittime = now - hlock->waittime_stamp; +		hlock->holdtime_stamp = now; +	} + +	trace_lock_acquired(lock, ip); + +	stats = get_lock_stats(hlock_class(hlock)); +	if (waittime) { +		if (hlock->read) +			lock_time_inc(&stats->read_waittime, waittime); +		else +			lock_time_inc(&stats->write_waittime, waittime); +	} +	if (lock->cpu != cpu) +		stats->bounces[bounce_acquired + !!hlock->read]++; +	put_lock_stats(stats); + +	lock->cpu = cpu; +	lock->ip = ip; +} + +void lock_contended(struct lockdep_map *lock, unsigned long ip) +{ +	unsigned long flags; + +	if (unlikely(!lock_stat)) +		return; + +	if (unlikely(current->lockdep_recursion)) +		return; + +	raw_local_irq_save(flags); +	check_flags(flags); +	current->lockdep_recursion = 1; +	trace_lock_contended(lock, ip); +	__lock_contended(lock, ip); +	current->lockdep_recursion = 0; +	raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_contended); + +void lock_acquired(struct lockdep_map *lock, unsigned long ip) +{ +	unsigned long flags; + +	if (unlikely(!lock_stat)) +		return; + +	if (unlikely(current->lockdep_recursion)) +		return; + +	raw_local_irq_save(flags); +	check_flags(flags); +	current->lockdep_recursion = 1; +	__lock_acquired(lock, ip); +	current->lockdep_recursion = 0; +	raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_acquired); +#endif + +/* + * Used by the testsuite, sanitize the validator state + * after a simulated failure: + */ + +void 
lockdep_reset(void) +{ +	unsigned long flags; +	int i; + +	raw_local_irq_save(flags); +	current->curr_chain_key = 0; +	current->lockdep_depth = 0; +	current->lockdep_recursion = 0; +	memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); +	nr_hardirq_chains = 0; +	nr_softirq_chains = 0; +	nr_process_chains = 0; +	debug_locks = 1; +	for (i = 0; i < CHAINHASH_SIZE; i++) +		INIT_LIST_HEAD(chainhash_table + i); +	raw_local_irq_restore(flags); +} + +static void zap_class(struct lock_class *class) +{ +	int i; + +	/* +	 * Remove all dependencies this lock is +	 * involved in: +	 */ +	for (i = 0; i < nr_list_entries; i++) { +		if (list_entries[i].class == class) +			list_del_rcu(&list_entries[i].entry); +	} +	/* +	 * Unhash the class and remove it from the all_lock_classes list: +	 */ +	list_del_rcu(&class->hash_entry); +	list_del_rcu(&class->lock_entry); + +	class->key = NULL; +} + +static inline int within(const void *addr, void *start, unsigned long size) +{ +	return addr >= start && addr < start + size; +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ +	struct lock_class *class, *next; +	struct list_head *head; +	unsigned long flags; +	int i; +	int locked; + +	raw_local_irq_save(flags); +	locked = graph_lock(); + +	/* +	 * Unhash all classes that were created by this module: +	 */ +	for (i = 0; i < CLASSHASH_SIZE; i++) { +		head = classhash_table + i; +		if (list_empty(head)) +			continue; +		list_for_each_entry_safe(class, next, head, hash_entry) { +			if (within(class->key, start, size)) +				zap_class(class); +			else if (within(class->name, start, size)) +				zap_class(class); +		} +	} + +	if (locked) +		graph_unlock(); +	raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ +	struct lock_class *class, *next; +	struct list_head *head; +	unsigned long flags; +	int i, j; +	int locked; + +	raw_local_irq_save(flags); + +	/* +	 * Remove all classes this lock might have: +	 */ +	for (j = 0; j < 
MAX_LOCKDEP_SUBCLASSES; j++) { +		/* +		 * If the class exists we look it up and zap it: +		 */ +		class = look_up_lock_class(lock, j); +		if (class) +			zap_class(class); +	} +	/* +	 * Debug check: in the end all mapped classes should +	 * be gone. +	 */ +	locked = graph_lock(); +	for (i = 0; i < CLASSHASH_SIZE; i++) { +		head = classhash_table + i; +		if (list_empty(head)) +			continue; +		list_for_each_entry_safe(class, next, head, hash_entry) { +			int match = 0; + +			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) +				match |= class == lock->class_cache[j]; + +			if (unlikely(match)) { +				if (debug_locks_off_graph_unlock()) { +					/* +					 * We all just reset everything, how did it match? +					 */ +					WARN_ON(1); +				} +				goto out_restore; +			} +		} +	} +	if (locked) +		graph_unlock(); + +out_restore: +	raw_local_irq_restore(flags); +} + +void lockdep_init(void) +{ +	int i; + +	/* +	 * Some architectures have their own start_kernel() +	 * code which calls lockdep_init(), while we also +	 * call lockdep_init() from the start_kernel() itself, +	 * and we want to initialize the hashes only once: +	 */ +	if (lockdep_initialized) +		return; + +	for (i = 0; i < CLASSHASH_SIZE; i++) +		INIT_LIST_HEAD(classhash_table + i); + +	for (i = 0; i < CHAINHASH_SIZE; i++) +		INIT_LIST_HEAD(chainhash_table + i); + +	lockdep_initialized = 1; +} + +void __init lockdep_info(void) +{ +	printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); + +	printk("... MAX_LOCKDEP_SUBCLASSES:  %lu\n", MAX_LOCKDEP_SUBCLASSES); +	printk("... MAX_LOCK_DEPTH:          %lu\n", MAX_LOCK_DEPTH); +	printk("... MAX_LOCKDEP_KEYS:        %lu\n", MAX_LOCKDEP_KEYS); +	printk("... CLASSHASH_SIZE:          %lu\n", CLASSHASH_SIZE); +	printk("... MAX_LOCKDEP_ENTRIES:     %lu\n", MAX_LOCKDEP_ENTRIES); +	printk("... MAX_LOCKDEP_CHAINS:      %lu\n", MAX_LOCKDEP_CHAINS); +	printk("... 
CHAINHASH_SIZE:          %lu\n", CHAINHASH_SIZE); + +	printk(" memory used by lock dependency info: %lu kB\n", +		(sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + +		sizeof(struct list_head) * CLASSHASH_SIZE + +		sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + +		sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + +		sizeof(struct list_head) * CHAINHASH_SIZE +#ifdef CONFIG_PROVE_LOCKING +		+ sizeof(struct circular_queue) +#endif +		) / 1024 +		); + +	printk(" per task-struct memory footprint: %lu bytes\n", +		sizeof(struct held_lock) * MAX_LOCK_DEPTH); + +#ifdef CONFIG_DEBUG_LOCKDEP +	if (lockdep_init_error) { +		printk("WARNING: lockdep init error! lock-%s was acquired" +			"before lockdep_init\n", lock_init_error); +		printk("Call stack leading to lockdep invocation was:\n"); +		print_stack_trace(&lockdep_init_trace, 0); +	} +#endif +} + +static void +print_freed_lock_bug(struct task_struct *curr, const void *mem_from, +		     const void *mem_to, struct held_lock *hlock) +{ +	if (!debug_locks_off()) +		return; +	if (debug_locks_silent) +		return; + +	printk("\n"); +	printk("=========================\n"); +	printk("[ BUG: held lock freed! 
]\n"); +	print_kernel_ident(); +	printk("-------------------------\n"); +	printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", +		curr->comm, task_pid_nr(curr), mem_from, mem_to-1); +	print_lock(hlock); +	lockdep_print_held_locks(curr); + +	printk("\nstack backtrace:\n"); +	dump_stack(); +} + +static inline int not_in_range(const void* mem_from, unsigned long mem_len, +				const void* lock_from, unsigned long lock_len) +{ +	return lock_from + lock_len <= mem_from || +		mem_from + mem_len <= lock_from; +} + +/* + * Called when kernel memory is freed (or unmapped), or if a lock + * is destroyed or reinitialized - this code checks whether there is + * any held lock in the memory range of <from> to <to>: + */ +void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) +{ +	struct task_struct *curr = current; +	struct held_lock *hlock; +	unsigned long flags; +	int i; + +	if (unlikely(!debug_locks)) +		return; + +	local_irq_save(flags); +	for (i = 0; i < curr->lockdep_depth; i++) { +		hlock = curr->held_locks + i; + +		if (not_in_range(mem_from, mem_len, hlock->instance, +					sizeof(*hlock->instance))) +			continue; + +		print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); +		break; +	} +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); + +static void print_held_locks_bug(void) +{ +	if (!debug_locks_off()) +		return; +	if (debug_locks_silent) +		return; + +	printk("\n"); +	printk("=====================================\n"); +	printk("[ BUG: %s/%d still has locks held! 
]\n", +	       current->comm, task_pid_nr(current)); +	print_kernel_ident(); +	printk("-------------------------------------\n"); +	lockdep_print_held_locks(current); +	printk("\nstack backtrace:\n"); +	dump_stack(); +} + +void debug_check_no_locks_held(void) +{ +	if (unlikely(current->lockdep_depth > 0)) +		print_held_locks_bug(); +} +EXPORT_SYMBOL_GPL(debug_check_no_locks_held); + +#ifdef __KERNEL__ +void debug_show_all_locks(void) +{ +	struct task_struct *g, *p; +	int count = 10; +	int unlock = 1; + +	if (unlikely(!debug_locks)) { +		printk("INFO: lockdep is turned off.\n"); +		return; +	} +	printk("\nShowing all locks held in the system:\n"); + +	/* +	 * Here we try to get the tasklist_lock as hard as possible, +	 * if not successful after 2 seconds we ignore it (but keep +	 * trying). This is to enable a debug printout even if a +	 * tasklist_lock-holding task deadlocks or crashes. +	 */ +retry: +	if (!read_trylock(&tasklist_lock)) { +		if (count == 10) +			printk("hm, tasklist_lock locked, retrying... "); +		if (count) { +			count--; +			printk(" #%d", 10-count); +			mdelay(200); +			goto retry; +		} +		printk(" ignoring it.\n"); +		unlock = 0; +	} else { +		if (count != 10) +			printk(KERN_CONT " locked it.\n"); +	} + +	do_each_thread(g, p) { +		/* +		 * It's not reliable to print a task's held locks +		 * if it's not sleeping (or if it's not the current +		 * task): +		 */ +		if (p->state == TASK_RUNNING && p != current) +			continue; +		if (p->lockdep_depth) +			lockdep_print_held_locks(p); +		if (!unlock) +			if (read_trylock(&tasklist_lock)) +				unlock = 1; +	} while_each_thread(g, p); + +	printk("\n"); +	printk("=============================================\n\n"); + +	if (unlock) +		read_unlock(&tasklist_lock); +} +EXPORT_SYMBOL_GPL(debug_show_all_locks); +#endif + +/* + * Careful: only use this function if you are sure that + * the task cannot run in parallel! 
+ */ +void debug_show_held_locks(struct task_struct *task) +{ +	if (unlikely(!debug_locks)) { +		printk("INFO: lockdep is turned off.\n"); +		return; +	} +	lockdep_print_held_locks(task); +} +EXPORT_SYMBOL_GPL(debug_show_held_locks); + +asmlinkage __visible void lockdep_sys_exit(void) +{ +	struct task_struct *curr = current; + +	if (unlikely(curr->lockdep_depth)) { +		if (!debug_locks_off()) +			return; +		printk("\n"); +		printk("================================================\n"); +		printk("[ BUG: lock held when returning to user space! ]\n"); +		print_kernel_ident(); +		printk("------------------------------------------------\n"); +		printk("%s/%d is leaving the kernel with locks still held!\n", +				curr->comm, curr->pid); +		lockdep_print_held_locks(curr); +	} +} + +void lockdep_rcu_suspicious(const char *file, const int line, const char *s) +{ +	struct task_struct *curr = current; + +#ifndef CONFIG_PROVE_RCU_REPEATEDLY +	if (!debug_locks_off()) +		return; +#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ +	/* Note: the following can be executed concurrently, so be careful. */ +	printk("\n"); +	printk("===============================\n"); +	printk("[ INFO: suspicious RCU usage. ]\n"); +	print_kernel_ident(); +	printk("-------------------------------\n"); +	printk("%s:%d %s!\n", file, line, s); +	printk("\nother info that might help us debug this:\n\n"); +	printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", +	       !rcu_lockdep_current_cpu_online() +			? "RCU used illegally from offline CPU!\n" +			: !rcu_is_watching() +				? "RCU used illegally from idle CPU!\n" +				: "", +	       rcu_scheduler_active, debug_locks); + +	/* +	 * If a CPU is in the RCU-free window in idle (ie: in the section +	 * between rcu_idle_enter() and rcu_idle_exit(), then RCU +	 * considers that CPU to be in an "extended quiescent state", +	 * which means that RCU will be completely ignoring that CPU. 
+	 * Therefore, rcu_read_lock() and friends have absolutely no +	 * effect on a CPU running in that state. In other words, even if +	 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well +	 * delete data structures out from under it.  RCU really has no +	 * choice here: we need to keep an RCU-free window in idle where +	 * the CPU may possibly enter into low power mode. This way we can +	 * notice an extended quiescent state to other CPUs that started a grace +	 * period. Otherwise we would delay any grace period as long as we run +	 * in the idle task. +	 * +	 * So complain bitterly if someone does call rcu_read_lock(), +	 * rcu_read_lock_bh() and so on from extended quiescent states. +	 */ +	if (!rcu_is_watching()) +		printk("RCU used illegally from extended quiescent state!\n"); + +	lockdep_print_held_locks(curr); +	printk("\nstack backtrace:\n"); +	dump_stack(); +} +EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h new file mode 100644 index 00000000000..51c4b24b632 --- /dev/null +++ b/kernel/locking/lockdep_internals.h @@ -0,0 +1,170 @@ +/* + * kernel/lockdep_internals.h + * + * Runtime locking correctness validator + * + * lockdep subsystem internal functions and variables. 
+ */ + +/* + * Lock-class usage-state bits: + */ +enum lock_usage_bit { +#define LOCKDEP_STATE(__STATE)		\ +	LOCK_USED_IN_##__STATE,		\ +	LOCK_USED_IN_##__STATE##_READ,	\ +	LOCK_ENABLED_##__STATE,		\ +	LOCK_ENABLED_##__STATE##_READ, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +	LOCK_USED, +	LOCK_USAGE_STATES +}; + +/* + * Usage-state bitmasks: + */ +#define __LOCKF(__STATE)	LOCKF_##__STATE = (1 << LOCK_##__STATE), + +enum { +#define LOCKDEP_STATE(__STATE)						\ +	__LOCKF(USED_IN_##__STATE)					\ +	__LOCKF(USED_IN_##__STATE##_READ)				\ +	__LOCKF(ENABLED_##__STATE)					\ +	__LOCKF(ENABLED_##__STATE##_READ) +#include "lockdep_states.h" +#undef LOCKDEP_STATE +	__LOCKF(USED) +}; + +#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) +#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) + +#define LOCKF_ENABLED_IRQ_READ \ +		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) +#define LOCKF_USED_IN_IRQ_READ \ +		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) + +/* + * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies + * we track. + * + * We use the per-lock dependency maps in two ways: we grow it by adding + * every to-be-taken lock to all currently held lock's own dependency + * table (if it's not there yet), and we check it for lock order + * conflicts and deadlocks. + */ +#define MAX_LOCKDEP_ENTRIES	32768UL + +#define MAX_LOCKDEP_CHAINS_BITS	16 +#define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the hash_lock. 
+ */ +#define MAX_STACK_TRACE_ENTRIES	524288UL + +extern struct list_head all_lock_classes; +extern struct lock_chain lock_chains[]; + +#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) + +extern void get_usage_chars(struct lock_class *class, +			    char usage[LOCK_USAGE_CHARS]); + +extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); + +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); + +extern unsigned long nr_lock_classes; +extern unsigned long nr_list_entries; +extern unsigned long nr_lock_chains; +extern int nr_chain_hlocks; +extern unsigned long nr_stack_trace_entries; + +extern unsigned int nr_hardirq_chains; +extern unsigned int nr_softirq_chains; +extern unsigned int nr_process_chains; +extern unsigned int max_lockdep_depth; +extern unsigned int max_recursion_depth; + +extern unsigned int max_bfs_queue_depth; + +#ifdef CONFIG_PROVE_LOCKING +extern unsigned long lockdep_count_forward_deps(struct lock_class *); +extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#else +static inline unsigned long +lockdep_count_forward_deps(struct lock_class *class) +{ +	return 0; +} +static inline unsigned long +lockdep_count_backward_deps(struct lock_class *class) +{ +	return 0; +} +#endif + +#ifdef CONFIG_DEBUG_LOCKDEP + +#include <asm/local.h> +/* + * Various lockdep statistics. + * We want them per cpu as they are often accessed in fast path + * and we want to avoid too much cache bouncing. 
+ */ +struct lockdep_stats { +	int	chain_lookup_hits; +	int	chain_lookup_misses; +	int	hardirqs_on_events; +	int	hardirqs_off_events; +	int	redundant_hardirqs_on; +	int	redundant_hardirqs_off; +	int	softirqs_on_events; +	int	softirqs_off_events; +	int	redundant_softirqs_on; +	int	redundant_softirqs_off; +	int	nr_unused_locks; +	int	nr_cyclic_checks; +	int	nr_cyclic_check_recursions; +	int	nr_find_usage_forwards_checks; +	int	nr_find_usage_forwards_recursions; +	int	nr_find_usage_backwards_checks; +	int	nr_find_usage_backwards_recursions; +}; + +DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); + +#define __debug_atomic_inc(ptr)					\ +	this_cpu_inc(lockdep_stats.ptr); + +#define debug_atomic_inc(ptr)			{		\ +	WARN_ON_ONCE(!irqs_disabled());				\ +	__this_cpu_inc(lockdep_stats.ptr);			\ +} + +#define debug_atomic_dec(ptr)			{		\ +	WARN_ON_ONCE(!irqs_disabled());				\ +	__this_cpu_dec(lockdep_stats.ptr);			\ +} + +#define debug_atomic_read(ptr)		({				\ +	struct lockdep_stats *__cpu_lockdep_stats;			\ +	unsigned long long __total = 0;					\ +	int __cpu;							\ +	for_each_possible_cpu(__cpu) {					\ +		__cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu);	\ +		__total += __cpu_lockdep_stats->ptr;			\ +	}								\ +	__total;							\ +}) +#else +# define __debug_atomic_inc(ptr)	do { } while (0) +# define debug_atomic_inc(ptr)		do { } while (0) +# define debug_atomic_dec(ptr)		do { } while (0) +# define debug_atomic_read(ptr)		0 +#endif diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c new file mode 100644 index 00000000000..ef43ac4bafb --- /dev/null +++ b/kernel/locking/lockdep_proc.c @@ -0,0 +1,683 @@ +/* + * kernel/lockdep_proc.c + * + * Runtime locking correctness validator + * + * Started by Ingo Molnar: + * + *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Code for /proc/lockdep and /proc/lockdep_stats: + * + */ +#include 
<linux/export.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/debug_locks.h> +#include <linux/vmalloc.h> +#include <linux/sort.h> +#include <asm/uaccess.h> +#include <asm/div64.h> + +#include "lockdep_internals.h" + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) +{ +	return seq_list_next(v, &all_lock_classes, pos); +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ +	return seq_list_start_head(&all_lock_classes, *pos); +} + +static void l_stop(struct seq_file *m, void *v) +{ +} + +static void print_name(struct seq_file *m, struct lock_class *class) +{ +	char str[KSYM_NAME_LEN]; +	const char *name = class->name; + +	if (!name) { +		name = __get_key_name(class->key, str); +		seq_printf(m, "%s", name); +	} else{ +		seq_printf(m, "%s", name); +		if (class->name_version > 1) +			seq_printf(m, "#%d", class->name_version); +		if (class->subclass) +			seq_printf(m, "/%d", class->subclass); +	} +} + +static int l_show(struct seq_file *m, void *v) +{ +	struct lock_class *class = list_entry(v, struct lock_class, lock_entry); +	struct lock_list *entry; +	char usage[LOCK_USAGE_CHARS]; + +	if (v == &all_lock_classes) { +		seq_printf(m, "all lock classes:\n"); +		return 0; +	} + +	seq_printf(m, "%p", class->key); +#ifdef CONFIG_DEBUG_LOCKDEP +	seq_printf(m, " OPS:%8ld", class->ops); +#endif +#ifdef CONFIG_PROVE_LOCKING +	seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); +	seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); +#endif + +	get_usage_chars(class, usage); +	seq_printf(m, " %s", usage); + +	seq_printf(m, ": "); +	print_name(m, class); +	seq_puts(m, "\n"); + +	list_for_each_entry(entry, &class->locks_after, entry) { +		if (entry->distance == 1) { +			seq_printf(m, " -> [%p] ", entry->class->key); +			print_name(m, entry->class); +			seq_puts(m, "\n"); +		} +	} +	seq_puts(m, "\n"); + +	return 0; +} + +static const struct seq_operations lockdep_ops = { +	.start	= 
l_start, +	.next	= l_next, +	.stop	= l_stop, +	.show	= l_show, +}; + +static int lockdep_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &lockdep_ops); +} + +static const struct file_operations proc_lockdep_operations = { +	.open		= lockdep_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release, +}; + +#ifdef CONFIG_PROVE_LOCKING +static void *lc_start(struct seq_file *m, loff_t *pos) +{ +	if (*pos == 0) +		return SEQ_START_TOKEN; + +	if (*pos - 1 < nr_lock_chains) +		return lock_chains + (*pos - 1); + +	return NULL; +} + +static void *lc_next(struct seq_file *m, void *v, loff_t *pos) +{ +	(*pos)++; +	return lc_start(m, pos); +} + +static void lc_stop(struct seq_file *m, void *v) +{ +} + +static int lc_show(struct seq_file *m, void *v) +{ +	struct lock_chain *chain = v; +	struct lock_class *class; +	int i; + +	if (v == SEQ_START_TOKEN) { +		seq_printf(m, "all lock chains:\n"); +		return 0; +	} + +	seq_printf(m, "irq_context: %d\n", chain->irq_context); + +	for (i = 0; i < chain->depth; i++) { +		class = lock_chain_get_class(chain, i); +		if (!class->key) +			continue; + +		seq_printf(m, "[%p] ", class->key); +		print_name(m, class); +		seq_puts(m, "\n"); +	} +	seq_puts(m, "\n"); + +	return 0; +} + +static const struct seq_operations lockdep_chains_ops = { +	.start	= lc_start, +	.next	= lc_next, +	.stop	= lc_stop, +	.show	= lc_show, +}; + +static int lockdep_chains_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &lockdep_chains_ops); +} + +static const struct file_operations proc_lockdep_chains_operations = { +	.open		= lockdep_chains_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release, +}; +#endif /* CONFIG_PROVE_LOCKING */ + +static void lockdep_stats_debug_show(struct seq_file *m) +{ +#ifdef CONFIG_DEBUG_LOCKDEP +	unsigned long long hi1 = debug_atomic_read(hardirqs_on_events), +			   hi2 = debug_atomic_read(hardirqs_off_events), +			   hr1 = 
debug_atomic_read(redundant_hardirqs_on), +			   hr2 = debug_atomic_read(redundant_hardirqs_off), +			   si1 = debug_atomic_read(softirqs_on_events), +			   si2 = debug_atomic_read(softirqs_off_events), +			   sr1 = debug_atomic_read(redundant_softirqs_on), +			   sr2 = debug_atomic_read(redundant_softirqs_off); + +	seq_printf(m, " chain lookup misses:           %11llu\n", +		debug_atomic_read(chain_lookup_misses)); +	seq_printf(m, " chain lookup hits:             %11llu\n", +		debug_atomic_read(chain_lookup_hits)); +	seq_printf(m, " cyclic checks:                 %11llu\n", +		debug_atomic_read(nr_cyclic_checks)); +	seq_printf(m, " find-mask forwards checks:     %11llu\n", +		debug_atomic_read(nr_find_usage_forwards_checks)); +	seq_printf(m, " find-mask backwards checks:    %11llu\n", +		debug_atomic_read(nr_find_usage_backwards_checks)); + +	seq_printf(m, " hardirq on events:             %11llu\n", hi1); +	seq_printf(m, " hardirq off events:            %11llu\n", hi2); +	seq_printf(m, " redundant hardirq ons:         %11llu\n", hr1); +	seq_printf(m, " redundant hardirq offs:        %11llu\n", hr2); +	seq_printf(m, " softirq on events:             %11llu\n", si1); +	seq_printf(m, " softirq off events:            %11llu\n", si2); +	seq_printf(m, " redundant softirq ons:         %11llu\n", sr1); +	seq_printf(m, " redundant softirq offs:        %11llu\n", sr2); +#endif +} + +static int lockdep_stats_show(struct seq_file *m, void *v) +{ +	struct lock_class *class; +	unsigned long nr_unused = 0, nr_uncategorized = 0, +		      nr_irq_safe = 0, nr_irq_unsafe = 0, +		      nr_softirq_safe = 0, nr_softirq_unsafe = 0, +		      nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, +		      nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, +		      nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, +		      nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, +		      sum_forward_deps = 0; + +	list_for_each_entry(class, &all_lock_classes, lock_entry) { + +		if (class->usage_mask == 0) 
+			nr_unused++; +		if (class->usage_mask == LOCKF_USED) +			nr_uncategorized++; +		if (class->usage_mask & LOCKF_USED_IN_IRQ) +			nr_irq_safe++; +		if (class->usage_mask & LOCKF_ENABLED_IRQ) +			nr_irq_unsafe++; +		if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) +			nr_softirq_safe++; +		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) +			nr_softirq_unsafe++; +		if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) +			nr_hardirq_safe++; +		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) +			nr_hardirq_unsafe++; +		if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) +			nr_irq_read_safe++; +		if (class->usage_mask & LOCKF_ENABLED_IRQ_READ) +			nr_irq_read_unsafe++; +		if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) +			nr_softirq_read_safe++; +		if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) +			nr_softirq_read_unsafe++; +		if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) +			nr_hardirq_read_safe++; +		if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) +			nr_hardirq_read_unsafe++; + +#ifdef CONFIG_PROVE_LOCKING +		sum_forward_deps += lockdep_count_forward_deps(class); +#endif +	} +#ifdef CONFIG_DEBUG_LOCKDEP +	DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); +#endif +	seq_printf(m, " lock-classes:                  %11lu [max: %lu]\n", +			nr_lock_classes, MAX_LOCKDEP_KEYS); +	seq_printf(m, " direct dependencies:           %11lu [max: %lu]\n", +			nr_list_entries, MAX_LOCKDEP_ENTRIES); +	seq_printf(m, " indirect dependencies:         %11lu\n", +			sum_forward_deps); + +	/* +	 * Total number of dependencies: +	 * +	 * All irq-safe locks may nest inside irq-unsafe locks, +	 * plus all the other known dependencies: +	 */ +	seq_printf(m, " all direct dependencies:       %11lu\n", +			nr_irq_unsafe * nr_irq_safe + +			nr_hardirq_unsafe * nr_hardirq_safe + +			nr_list_entries); + +#ifdef CONFIG_PROVE_LOCKING +	seq_printf(m, " dependency chains:             %11lu [max: %lu]\n", +			nr_lock_chains, MAX_LOCKDEP_CHAINS); +	seq_printf(m, " dependency 
chain hlocks:       %11d [max: %lu]\n", +			nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +	seq_printf(m, " in-hardirq chains:             %11u\n", +			nr_hardirq_chains); +	seq_printf(m, " in-softirq chains:             %11u\n", +			nr_softirq_chains); +#endif +	seq_printf(m, " in-process chains:             %11u\n", +			nr_process_chains); +	seq_printf(m, " stack-trace entries:           %11lu [max: %lu]\n", +			nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); +	seq_printf(m, " combined max dependencies:     %11u\n", +			(nr_hardirq_chains + 1) * +			(nr_softirq_chains + 1) * +			(nr_process_chains + 1) +	); +	seq_printf(m, " hardirq-safe locks:            %11lu\n", +			nr_hardirq_safe); +	seq_printf(m, " hardirq-unsafe locks:          %11lu\n", +			nr_hardirq_unsafe); +	seq_printf(m, " softirq-safe locks:            %11lu\n", +			nr_softirq_safe); +	seq_printf(m, " softirq-unsafe locks:          %11lu\n", +			nr_softirq_unsafe); +	seq_printf(m, " irq-safe locks:                %11lu\n", +			nr_irq_safe); +	seq_printf(m, " irq-unsafe locks:              %11lu\n", +			nr_irq_unsafe); + +	seq_printf(m, " hardirq-read-safe locks:       %11lu\n", +			nr_hardirq_read_safe); +	seq_printf(m, " hardirq-read-unsafe locks:     %11lu\n", +			nr_hardirq_read_unsafe); +	seq_printf(m, " softirq-read-safe locks:       %11lu\n", +			nr_softirq_read_safe); +	seq_printf(m, " softirq-read-unsafe locks:     %11lu\n", +			nr_softirq_read_unsafe); +	seq_printf(m, " irq-read-safe locks:           %11lu\n", +			nr_irq_read_safe); +	seq_printf(m, " irq-read-unsafe locks:         %11lu\n", +			nr_irq_read_unsafe); + +	seq_printf(m, " uncategorized locks:           %11lu\n", +			nr_uncategorized); +	seq_printf(m, " unused locks:                  %11lu\n", +			nr_unused); +	seq_printf(m, " max locking depth:             %11u\n", +			max_lockdep_depth); +#ifdef CONFIG_PROVE_LOCKING +	seq_printf(m, " max bfs queue depth:           %11u\n", +			
max_bfs_queue_depth);
+#endif
+	lockdep_stats_debug_show(m);
+	seq_printf(m, " debug_locks:                   %11u\n",
+			debug_locks);
+
+	return 0;
+}
+
+static int lockdep_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, lockdep_stats_show, NULL);
+}
+
+static const struct file_operations proc_lockdep_stats_operations = {
+	.open		= lockdep_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_LOCK_STAT
+
+/* One lock class together with a copy of its statistics. */
+struct lock_stat_data {
+	struct lock_class *class;
+	struct lock_class_stats stats;
+};
+
+/* Iteration state: stats[] is valid up to, but excluding, iter_end. */
+struct lock_stat_seq {
+	struct lock_stat_data *iter_end;
+	struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
+};
+
+/*
+ * Sort on absolute number of contentions, most contended first.
+ *
+ * The counts are unsigned long, so they are compared explicitly:
+ * returning their difference as an int would truncate it and could
+ * flip the sign or report unequal counts as equal.
+ */
+static int lock_stat_cmp(const void *l, const void *r)
+{
+	const struct lock_stat_data *dl = l, *dr = r;
+	unsigned long nl, nr;
+
+	nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
+	nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
+
+	if (nr > nl)
+		return 1;
+	if (nr < nl)
+		return -1;
+	return 0;
+}
+
+/* Emit @offset spaces, then @length copies of @c, then a newline. */
+static void seq_line(struct seq_file *m, char c, int offset, int length)
+{
+	int i;
+
+	for (i = 0; i < offset; i++)
+		seq_puts(m, " ");
+	for (i = 0; i < length; i++)
+		seq_printf(m, "%c", c);
+	seq_puts(m, "\n");
+}
+
+/*
+ * Format @nr divided by 1000 with two decimal places into @buf,
+ * rounding to the nearest hundredth.
+ */
+static void snprint_time(char *buf, size_t bufsiz, s64 nr)
+{
+	s64 div;
+	s32 rem;
+
+	nr += 5; /* for display rounding */
+	div = div_s64_rem(nr, 1000, &rem);
+	snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
+}
+
+/* Print @time, formatted by snprint_time(), right-aligned in 14 columns. */
+static void seq_time(struct seq_file *m, s64 time)
+{
+	char num[15];
+
+	snprint_time(num, sizeof(num), time);
+	seq_printf(m, " %14s", num);
+}
+
+static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
+{
+	seq_printf(m, "%14lu", lt->nr);
+	seq_time(m, lt->min);
+	seq_time(m, lt->max);
+	seq_time(m, lt->total);
+	seq_time(m, lt->nr ? 
div_s64(lt->total, lt->nr) : 0); +} + +static void seq_stats(struct seq_file *m, struct lock_stat_data *data) +{ +	char name[39]; +	struct lock_class *class; +	struct lock_class_stats *stats; +	int i, namelen; + +	class = data->class; +	stats = &data->stats; + +	namelen = 38; +	if (class->name_version > 1) +		namelen -= 2; /* XXX truncates versions > 9 */ +	if (class->subclass) +		namelen -= 2; + +	if (!class->name) { +		char str[KSYM_NAME_LEN]; +		const char *key_name; + +		key_name = __get_key_name(class->key, str); +		snprintf(name, namelen, "%s", key_name); +	} else { +		snprintf(name, namelen, "%s", class->name); +	} +	namelen = strlen(name); +	if (class->name_version > 1) { +		snprintf(name+namelen, 3, "#%d", class->name_version); +		namelen += 2; +	} +	if (class->subclass) { +		snprintf(name+namelen, 3, "/%d", class->subclass); +		namelen += 2; +	} + +	if (stats->write_holdtime.nr) { +		if (stats->read_holdtime.nr) +			seq_printf(m, "%38s-W:", name); +		else +			seq_printf(m, "%40s:", name); + +		seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); +		seq_lock_time(m, &stats->write_waittime); +		seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); +		seq_lock_time(m, &stats->write_holdtime); +		seq_puts(m, "\n"); +	} + +	if (stats->read_holdtime.nr) { +		seq_printf(m, "%38s-R:", name); +		seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); +		seq_lock_time(m, &stats->read_waittime); +		seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); +		seq_lock_time(m, &stats->read_holdtime); +		seq_puts(m, "\n"); +	} + +	if (stats->read_waittime.nr + stats->write_waittime.nr == 0) +		return; + +	if (stats->read_holdtime.nr) +		namelen += 2; + +	for (i = 0; i < LOCKSTAT_POINTS; i++) { +		char ip[32]; + +		if (class->contention_point[i] == 0) +			break; + +		if (!i) +			seq_line(m, '-', 40-namelen, namelen); + +		snprintf(ip, sizeof(ip), "[<%p>]", +				(void *)class->contention_point[i]); +		seq_printf(m, "%40s %14lu %29s 
%pS\n", +			   name, stats->contention_point[i], +			   ip, (void *)class->contention_point[i]); +	} +	for (i = 0; i < LOCKSTAT_POINTS; i++) { +		char ip[32]; + +		if (class->contending_point[i] == 0) +			break; + +		if (!i) +			seq_line(m, '-', 40-namelen, namelen); + +		snprintf(ip, sizeof(ip), "[<%p>]", +				(void *)class->contending_point[i]); +		seq_printf(m, "%40s %14lu %29s %pS\n", +			   name, stats->contending_point[i], +			   ip, (void *)class->contending_point[i]); +	} +	if (i) { +		seq_puts(m, "\n"); +		seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1)); +		seq_puts(m, "\n"); +	} +} + +static void seq_header(struct seq_file *m) +{ +	seq_puts(m, "lock_stat version 0.4\n"); + +	if (unlikely(!debug_locks)) +		seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); + +	seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); +	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s " +			"%14s %14s\n", +			"class name", +			"con-bounces", +			"contentions", +			"waittime-min", +			"waittime-max", +			"waittime-total", +			"waittime-avg", +			"acq-bounces", +			"acquisitions", +			"holdtime-min", +			"holdtime-max", +			"holdtime-total", +			"holdtime-avg"); +	seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); +	seq_printf(m, "\n"); +} + +static void *ls_start(struct seq_file *m, loff_t *pos) +{ +	struct lock_stat_seq *data = m->private; +	struct lock_stat_data *iter; + +	if (*pos == 0) +		return SEQ_START_TOKEN; + +	iter = data->stats + (*pos - 1); +	if (iter >= data->iter_end) +		iter = NULL; + +	return iter; +} + +static void *ls_next(struct seq_file *m, void *v, loff_t *pos) +{ +	(*pos)++; +	return ls_start(m, pos); +} + +static void ls_stop(struct seq_file *m, void *v) +{ +} + +static int ls_show(struct seq_file *m, void *v) +{ +	if (v == SEQ_START_TOKEN) +		seq_header(m); +	else +		seq_stats(m, v); + +	return 0; +} + +static const struct seq_operations lockstat_ops = { +	.start	= ls_start, +	.next	= ls_next, +	.stop	= ls_stop, 
+	.show	= ls_show, +}; + +static int lock_stat_open(struct inode *inode, struct file *file) +{ +	int res; +	struct lock_class *class; +	struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); + +	if (!data) +		return -ENOMEM; + +	res = seq_open(file, &lockstat_ops); +	if (!res) { +		struct lock_stat_data *iter = data->stats; +		struct seq_file *m = file->private_data; + +		list_for_each_entry(class, &all_lock_classes, lock_entry) { +			iter->class = class; +			iter->stats = lock_stats(class); +			iter++; +		} +		data->iter_end = iter; + +		sort(data->stats, data->iter_end - data->stats, +				sizeof(struct lock_stat_data), +				lock_stat_cmp, NULL); + +		m->private = data; +	} else +		vfree(data); + +	return res; +} + +static ssize_t lock_stat_write(struct file *file, const char __user *buf, +			       size_t count, loff_t *ppos) +{ +	struct lock_class *class; +	char c; + +	if (count) { +		if (get_user(c, buf)) +			return -EFAULT; + +		if (c != '0') +			return count; + +		list_for_each_entry(class, &all_lock_classes, lock_entry) +			clear_lock_stats(class); +	} +	return count; +} + +static int lock_stat_release(struct inode *inode, struct file *file) +{ +	struct seq_file *seq = file->private_data; + +	vfree(seq->private); +	return seq_release(inode, file); +} + +static const struct file_operations proc_lock_stat_operations = { +	.open		= lock_stat_open, +	.write		= lock_stat_write, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= lock_stat_release, +}; +#endif /* CONFIG_LOCK_STAT */ + +static int __init lockdep_proc_init(void) +{ +	proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); +#ifdef CONFIG_PROVE_LOCKING +	proc_create("lockdep_chains", S_IRUSR, NULL, +		    &proc_lockdep_chains_operations); +#endif +	proc_create("lockdep_stats", S_IRUSR, NULL, +		    &proc_lockdep_stats_operations); + +#ifdef CONFIG_LOCK_STAT +	proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, +		    &proc_lock_stat_operations); +#endif + +	return 0; +} + 
+__initcall(lockdep_proc_init); + diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h new file mode 100644 index 00000000000..995b0cc2b84 --- /dev/null +++ b/kernel/locking/lockdep_states.h @@ -0,0 +1,9 @@ +/* + * Lockdep states, + * + * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever + * you add one, or come up with a nice dynamic solution. + */ +LOCKDEP_STATE(HARDIRQ) +LOCKDEP_STATE(SOFTIRQ) +LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c new file mode 100644 index 00000000000..0955b885d0d --- /dev/null +++ b/kernel/locking/locktorture.c @@ -0,0 +1,454 @@ +/* + * Module-based torture test facility for locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney <paulmck@us.ibm.com> + *	Based on kernel/rcu/torture.c. 
+ */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/trace_clock.h> +#include <asm/byteorder.h> +#include <linux/torture.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); + +torture_param(int, nwriters_stress, -1, +	     "Number of write-locking stress-test threads"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, +	     "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, +	     "Number of jiffies between shuffles, 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); +torture_param(int, stat_interval, 60, +	     "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); +torture_param(bool, verbose, true, +	     "Enable verbose debugging printk()s"); + +static char *torture_type = "spin_lock"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, +		 "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); + +static atomic_t n_lock_torture_errors; + +static struct task_struct *stats_task; +static struct task_struct **writer_tasks; + +static int nrealwriters_stress; +static bool lock_is_write_held; + +struct lock_writer_stress_stats { +	long n_write_lock_fail; +	long n_write_lock_acquired; +}; +static struct lock_writer_stress_stats *lwsa; + 
+#if defined(MODULE) +#define LOCKTORTURE_RUNNABLE_INIT 1 +#else +#define LOCKTORTURE_RUNNABLE_INIT 0 +#endif +int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(locktorture_runnable, int, 0444); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); + +/* Forward reference. */ +static void lock_torture_cleanup(void); + +/* + * Operations vector for selecting different types of tests. + */ +struct lock_torture_ops { +	void (*init)(void); +	int (*writelock)(void); +	void (*write_delay)(struct torture_random_state *trsp); +	void (*writeunlock)(void); +	unsigned long flags; +	const char *name; +}; + +static struct lock_torture_ops *cur_ops; + +/* + * Definitions for lock torture testing. + */ + +static int torture_lock_busted_write_lock(void) +{ +	return 0;  /* BUGGY, do not use in real life!!! */ +} + +static void torture_lock_busted_write_delay(struct torture_random_state *trsp) +{ +	const unsigned long longdelay_us = 100; + +	/* We want a long delay occasionally to force massive contention.  */ +	if (!(torture_random(trsp) % +	      (nrealwriters_stress * 2000 * longdelay_us))) +		mdelay(longdelay_us); +#ifdef CONFIG_PREEMPT +	if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) +		preempt_schedule();  /* Allow test to be preempted. */ +#endif +} + +static void torture_lock_busted_write_unlock(void) +{ +	  /* BUGGY, do not use in real life!!! 
*/ +} + +static struct lock_torture_ops lock_busted_ops = { +	.writelock	= torture_lock_busted_write_lock, +	.write_delay	= torture_lock_busted_write_delay, +	.writeunlock	= torture_lock_busted_write_unlock, +	.name		= "lock_busted" +}; + +static DEFINE_SPINLOCK(torture_spinlock); + +static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +{ +	spin_lock(&torture_spinlock); +	return 0; +} + +static void torture_spin_lock_write_delay(struct torture_random_state *trsp) +{ +	const unsigned long shortdelay_us = 2; +	const unsigned long longdelay_us = 100; + +	/* We want a short delay mostly to emulate likely code, and +	 * we want a long delay occasionally to force massive contention. +	 */ +	if (!(torture_random(trsp) % +	      (nrealwriters_stress * 2000 * longdelay_us))) +		mdelay(longdelay_us); +	if (!(torture_random(trsp) % +	      (nrealwriters_stress * 2 * shortdelay_us))) +		udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT +	if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) +		preempt_schedule();  /* Allow test to be preempted. 
*/ +#endif +} + +static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +{ +	spin_unlock(&torture_spinlock); +} + +static struct lock_torture_ops spin_lock_ops = { +	.writelock	= torture_spin_lock_write_lock, +	.write_delay	= torture_spin_lock_write_delay, +	.writeunlock	= torture_spin_lock_write_unlock, +	.name		= "spin_lock" +}; + +static int torture_spin_lock_write_lock_irq(void) +__acquires(torture_spinlock_irq) +{ +	unsigned long flags; + +	spin_lock_irqsave(&torture_spinlock, flags); +	cur_ops->flags = flags; +	return 0; +} + +static void torture_lock_spin_write_unlock_irq(void) +__releases(torture_spinlock) +{ +	spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); +} + +static struct lock_torture_ops spin_lock_irq_ops = { +	.writelock	= torture_spin_lock_write_lock_irq, +	.write_delay	= torture_spin_lock_write_delay, +	.writeunlock	= torture_lock_spin_write_unlock_irq, +	.name		= "spin_lock_irq" +}; + +/* + * Lock torture writer kthread.  Repeatedly acquires and releases + * the lock, checking for duplicate acquisitions. + */ +static int lock_torture_writer(void *arg) +{ +	struct lock_writer_stress_stats *lwsp = arg; +	static DEFINE_TORTURE_RANDOM(rand); + +	VERBOSE_TOROUT_STRING("lock_torture_writer task started"); +	set_user_nice(current, MAX_NICE); + +	do { +		if ((torture_random(&rand) & 0xfffff) == 0) +			schedule_timeout_uninterruptible(1); +		cur_ops->writelock(); +		if (WARN_ON_ONCE(lock_is_write_held)) +			lwsp->n_write_lock_fail++; +		lock_is_write_held = 1; +		lwsp->n_write_lock_acquired++; +		cur_ops->write_delay(&rand); +		lock_is_write_held = 0; +		cur_ops->writeunlock(); +		stutter_wait("lock_torture_writer"); +	} while (!torture_must_stop()); +	torture_kthread_stopping("lock_torture_writer"); +	return 0; +} + +/* + * Create an lock-torture-statistics message in the specified buffer. 
+ */ +static void lock_torture_printk(char *page) +{ +	bool fail = 0; +	int i; +	long max = 0; +	long min = lwsa[0].n_write_lock_acquired; +	long long sum = 0; + +	for (i = 0; i < nrealwriters_stress; i++) { +		if (lwsa[i].n_write_lock_fail) +			fail = true; +		sum += lwsa[i].n_write_lock_acquired; +		if (max < lwsa[i].n_write_lock_fail) +			max = lwsa[i].n_write_lock_fail; +		if (min > lwsa[i].n_write_lock_fail) +			min = lwsa[i].n_write_lock_fail; +	} +	page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); +	page += sprintf(page, +			"Writes:  Total: %lld  Max/Min: %ld/%ld %s  Fail: %d %s\n", +			sum, max, min, max / 2 > min ? "???" : "", +			fail, fail ? "!!!" : ""); +	if (fail) +		atomic_inc(&n_lock_torture_errors); +} + +/* + * Print torture statistics.  Caller must ensure that there is only one + * call to this function at a given time!!!  This is normally accomplished + * by relying on the module system to only have one copy of the module + * loaded, and then by giving the lock_torture_stats kthread full control + * (or the init/cleanup functions when lock_torture_stats thread is not + * running). + */ +static void lock_torture_stats_print(void) +{ +	int size = nrealwriters_stress * 200 + 8192; +	char *buf; + +	buf = kmalloc(size, GFP_KERNEL); +	if (!buf) { +		pr_err("lock_torture_stats_print: Out of memory, need: %d", +		       size); +		return; +	} +	lock_torture_printk(buf); +	pr_alert("%s", buf); +	kfree(buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. 
+ */ +static int lock_torture_stats(void *arg) +{ +	VERBOSE_TOROUT_STRING("lock_torture_stats task started"); +	do { +		schedule_timeout_interruptible(stat_interval * HZ); +		lock_torture_stats_print(); +		torture_shutdown_absorb("lock_torture_stats"); +	} while (!torture_must_stop()); +	torture_kthread_stopping("lock_torture_stats"); +	return 0; +} + +static inline void +lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, +				const char *tag) +{ +	pr_alert("%s" TORTURE_FLAG +		 "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", +		 torture_type, tag, nrealwriters_stress, stat_interval, verbose, +		 shuffle_interval, stutter, shutdown_secs, +		 onoff_interval, onoff_holdoff); +} + +static void lock_torture_cleanup(void) +{ +	int i; + +	if (torture_cleanup()) +		return; + +	if (writer_tasks) { +		for (i = 0; i < nrealwriters_stress; i++) +			torture_stop_kthread(lock_torture_writer, +					     writer_tasks[i]); +		kfree(writer_tasks); +		writer_tasks = NULL; +	} + +	torture_stop_kthread(lock_torture_stats, stats_task); +	lock_torture_stats_print();  /* -After- the stats thread is stopped! */ + +	if (atomic_read(&n_lock_torture_errors)) +		lock_torture_print_module_parms(cur_ops, +						"End of test: FAILURE"); +	else if (torture_onoff_failures()) +		lock_torture_print_module_parms(cur_ops, +						"End of test: LOCK_HOTPLUG"); +	else +		lock_torture_print_module_parms(cur_ops, +						"End of test: SUCCESS"); +} + +static int __init lock_torture_init(void) +{ +	int i; +	int firsterr = 0; +	static struct lock_torture_ops *torture_ops[] = { +		&lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, +	}; + +	if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) +		return -EBUSY; + +	/* Process args and tell the world that the torturer is on the job. 
*/ +	for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { +		cur_ops = torture_ops[i]; +		if (strcmp(torture_type, cur_ops->name) == 0) +			break; +	} +	if (i == ARRAY_SIZE(torture_ops)) { +		pr_alert("lock-torture: invalid torture type: \"%s\"\n", +			 torture_type); +		pr_alert("lock-torture types:"); +		for (i = 0; i < ARRAY_SIZE(torture_ops); i++) +			pr_alert(" %s", torture_ops[i]->name); +		pr_alert("\n"); +		torture_init_end(); +		return -EINVAL; +	} +	if (cur_ops->init) +		cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + +	if (nwriters_stress >= 0) +		nrealwriters_stress = nwriters_stress; +	else +		nrealwriters_stress = 2 * num_online_cpus(); +	lock_torture_print_module_parms(cur_ops, "Start of test"); + +	/* Initialize the statistics so that each run gets its own numbers. */ + +	lock_is_write_held = 0; +	lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); +	if (lwsa == NULL) { +		VERBOSE_TOROUT_STRING("lwsa: Out of memory"); +		firsterr = -ENOMEM; +		goto unwind; +	} +	for (i = 0; i < nrealwriters_stress; i++) { +		lwsa[i].n_write_lock_fail = 0; +		lwsa[i].n_write_lock_acquired = 0; +	} + +	/* Start up the kthreads. 
*/ + +	if (onoff_interval > 0) { +		firsterr = torture_onoff_init(onoff_holdoff * HZ, +					      onoff_interval * HZ); +		if (firsterr) +			goto unwind; +	} +	if (shuffle_interval > 0) { +		firsterr = torture_shuffle_init(shuffle_interval); +		if (firsterr) +			goto unwind; +	} +	if (shutdown_secs > 0) { +		firsterr = torture_shutdown_init(shutdown_secs, +						 lock_torture_cleanup); +		if (firsterr) +			goto unwind; +	} +	if (stutter > 0) { +		firsterr = torture_stutter_init(stutter); +		if (firsterr) +			goto unwind; +	} + +	writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), +			       GFP_KERNEL); +	if (writer_tasks == NULL) { +		VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); +		firsterr = -ENOMEM; +		goto unwind; +	} +	for (i = 0; i < nrealwriters_stress; i++) { +		firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], +						  writer_tasks[i]); +		if (firsterr) +			goto unwind; +	} +	if (stat_interval > 0) { +		firsterr = torture_create_kthread(lock_torture_stats, NULL, +						  stats_task); +		if (firsterr) +			goto unwind; +	} +	torture_init_end(); +	return 0; + +unwind: +	torture_init_end(); +	lock_torture_cleanup(); +	return firsterr; +} + +module_init(lock_torture_init); +module_exit(lock_torture_cleanup); diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c new file mode 100644 index 00000000000..be9ee1559fc --- /dev/null +++ b/kernel/locking/mcs_spinlock.c @@ -0,0 +1,210 @@ + +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include "mcs_spinlock.h" + +#ifdef CONFIG_SMP + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. 
 */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

/*
 * We use the value 0 to represent "no CPU", thus the encoded value
 * will be the CPU number incremented by 1.
 */
static inline int encode_cpu(int cpu_nr)
{
	return cpu_nr + 1;
}

/* Map an encoded CPU value back to that CPU's per-cpu osq_node. */
static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
{
	int cpu_nr = encoded_cpu_val - 1;

	return per_cpu_ptr(&osq_node, cpu_nr);
}

/*
 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
 * Can return NULL in case we were the last queued and we updated @lock instead.
 *
 * @prev is the node to fall back to when @node is the tail: the tail is
 * reset to @prev's encoded CPU, or to OSQ_UNLOCKED_VAL when @prev is NULL.
 */
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
	      struct optimistic_spin_node *node,
	      struct optimistic_spin_node *prev)
{
	struct optimistic_spin_node *next = NULL;
	int curr = encode_cpu(smp_processor_id());
	int old;

	/*
	 * If there is a prev node in queue, then the 'old' value will be
	 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
	 * we're currently last in queue, then the queue will then become empty.
	 */
	old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;

	for (;;) {
		if (atomic_read(&lock->tail) == curr &&
		    atomic_cmpxchg(&lock->tail, curr, old) == curr) {
			/*
			 * We were the last queued, we moved @lock back. @prev
			 * will now observe @lock and will complete its
			 * unlock()/unqueue().
			 */
			break;
		}

		/*
		 * We must xchg() the @node->next value, because if we were to
		 * leave it in, a concurrent unlock()/unqueue() from
		 * @node->next might complete Step-A and think its @prev is
		 * still valid.
		 *
		 * If the concurrent unlock()/unqueue() wins the race, we'll
		 * wait for either @lock to point to us, through its Step-B, or
		 * wait for a new @node->next from its Step-C.
		 */
		if (node->next) {
			next = xchg(&node->next, NULL);
			if (next)
				break;
		}

		arch_mutex_cpu_relax();
	}

	return next;
}

/*
 * Queue this CPU's osq_node on @lock and spin until it is handed the lock.
 *
 * Returns true when the queue was empty or a predecessor set node->locked.
 * Returns false when need_resched() became true while spinning and the node
 * was unlinked from the queue again (steps A-C below).
 */
bool osq_lock(struct optimistic_spin_queue *lock)
{
	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
	struct optimistic_spin_node *prev, *next;
	int curr = encode_cpu(smp_processor_id());
	int old;

	node->locked = 0;
	node->next = NULL;
	node->cpu = curr;

	/* Become the new tail; an unlocked old tail means we own the lock. */
	old = atomic_xchg(&lock->tail, curr);
	if (old == OSQ_UNLOCKED_VAL)
		return true;

	prev = decode_cpu(old);
	node->prev = prev;
	ACCESS_ONCE(prev->next) = node;

	/*
	 * Normally @prev is untouchable after the above store; because at that
	 * moment unlock can proceed and wipe the node element from stack.
	 *
	 * However, since our nodes are static per-cpu storage, we're
	 * guaranteed their existence -- this allows us to apply
	 * cmpxchg in an attempt to undo our queueing.
	 */

	while (!smp_load_acquire(&node->locked)) {
		/*
		 * If we need to reschedule bail... so we can block.
		 */
		if (need_resched())
			goto unqueue;

		arch_mutex_cpu_relax();
	}
	return true;

unqueue:
	/*
	 * Step - A  -- stabilize @prev
	 *
	 * Undo our @prev->next assignment; this will make @prev's
	 * unlock()/unqueue() wait for a next pointer since @lock points to us
	 * (or later).
	 */

	for (;;) {
		if (prev->next == node &&
		    cmpxchg(&prev->next, node, NULL) == node)
			break;

		/*
		 * We can only fail the cmpxchg() racing against an unlock(),
		 * in which case we should observe @node->locked becoming
		 * true.
		 */
		if (smp_load_acquire(&node->locked))
			return true;

		arch_mutex_cpu_relax();

		/*
		 * Or we race against a concurrent unqueue()'s step-B, in which
		 * case its step-C will write us a new @node->prev pointer.
		 */
		prev = ACCESS_ONCE(node->prev);
	}

	/*
	 * Step - B -- stabilize @next
	 *
	 * Similar to unlock(), wait for @node->next or move @lock from @node
	 * back to @prev.
	 */

	next = osq_wait_next(lock, node, prev);
	if (!next)
		return false;

	/*
	 * Step - C -- unlink
	 *
	 * @prev is stable because its still waiting for a new @prev->next
	 * pointer, @next is stable because our @node->next pointer is NULL and
	 * it will wait in Step-A.
	 */

	ACCESS_ONCE(next->prev) = prev;
	ACCESS_ONCE(prev->next) = next;

	return false;
}

/*
 * Release @lock: reset the tail if this CPU is the only queued node,
 * otherwise hand the lock to the successor by setting its ->locked.
 */
void osq_unlock(struct optimistic_spin_queue *lock)
{
	struct optimistic_spin_node *node, *next;
	int curr = encode_cpu(smp_processor_id());

	/*
	 * Fast path for the uncontended case.
	 */
	if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
		return;

	/*
	 * Second most likely case.
	 */
	node = this_cpu_ptr(&osq_node);
	next = xchg(&node->next, NULL);
	if (next) {
		ACCESS_ONCE(next->locked) = 1;
		return;
	}

	/* A successor is queueing or unqueueing; wait for it to settle. */
	next = osq_wait_next(lock, node, NULL);
	if (next)
		ACCESS_ONCE(next->locked) = 1;
}

#endif

diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h new file mode 100644 index 00000000000..74356dc0ce2 --- /dev/null +++ b/kernel/locking/mcs_spinlock.h @@ -0,0 +1,130 @@
/*
 * MCS lock defines
 *
 * This file contains the main data structure and API definitions of MCS lock.
 *
 * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
 * with the desirable properties of being fair, and with each cpu trying
 * to acquire the lock spinning on a local variable.
 * It avoids expensive cache bouncings that common test-and-set spin-lock
 * implementations incur.
 */
#ifndef __LINUX_MCS_SPINLOCK_H
#define __LINUX_MCS_SPINLOCK_H

#include <asm/mcs_spinlock.h>

/* Queue node supplied by each locker; the lock itself is a tail pointer. */
struct mcs_spinlock {
	struct mcs_spinlock *next;
	int locked; /* 1 if lock acquired */
};

#ifndef arch_mcs_spin_lock_contended
/*
 * Using smp_load_acquire() provides a memory barrier that ensures
 * subsequent operations happen after the lock is acquired.
 */
#define arch_mcs_spin_lock_contended(l)					\
do {									\
	while (!(smp_load_acquire(l)))					\
		arch_mutex_cpu_relax();					\
} while (0)
#endif

#ifndef arch_mcs_spin_unlock_contended
/*
 * smp_store_release() provides a memory barrier to ensure all
 * operations in the critical section have been completed before
 * unlocking.
 */
#define arch_mcs_spin_unlock_contended(l)				\
	smp_store_release((l), 1)
#endif

/*
 * Note: the smp_load_acquire/smp_store_release pair is not
 * sufficient to form a full memory barrier across
 * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
 * For applications that need a full barrier across multiple cpus
 * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
 * used after mcs_lock.
 */

/*
 * In order to acquire the lock, the caller should declare a local node and
 * pass a reference of the node to this function in addition to the lock.
 * If the lock has already been acquired, then this will proceed to spin
 * on this node->locked until the previous lock holder sets the node->locked
 * in mcs_spin_unlock().
 *
 * We don't inline mcs_spin_lock() so that perf can correctly account for the
 * time spent in this lock function.
 *
 * NOTE(review): the function below is declared static inline, which
 * contradicts the sentence above -- confirm which one is intended.
 */
static inline
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
	struct mcs_spinlock *prev;

	/* Init node */
	node->locked = 0;
	node->next   = NULL;

	/* Atomically become the tail; prev is the old tail, if any. */
	prev = xchg(lock, node);
	if (likely(prev == NULL)) {
		/*
		 * Lock acquired, don't need to set node->locked to 1. Threads
		 * only spin on its own node->locked value for lock acquisition.
		 * However, since this thread can immediately acquire the lock
		 * and does not proceed to spin on its own node->locked, this
		 * value won't be used. If a debug mode is needed to
		 * audit lock status, then set node->locked value here.
		 */
		return;
	}
	ACCESS_ONCE(prev->next) = node;

	/* Wait until the lock holder passes the lock down. */
	arch_mcs_spin_lock_contended(&node->locked);
}

/*
 * Releases the lock. The caller should pass in the corresponding node that
 * was used to acquire the lock.
 */
static inline
void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
	struct mcs_spinlock *next = ACCESS_ONCE(node->next);

	if (likely(!next)) {
		/*
		 * Release the lock by setting it to NULL
		 */
		if (likely(cmpxchg(lock, node, NULL) == node))
			return;
		/* Wait until the next pointer is set */
		while (!(next = ACCESS_ONCE(node->next)))
			arch_mutex_cpu_relax();
	}

	/* Pass lock to next waiter. */
	arch_mcs_spin_unlock_contended(&next->locked);
}

/*
 * Cancellable version of the MCS lock above.
 *
 * Intended for adaptive spinning of sleeping locks:
 * mutex_lock()/rwsem_down_{read,write}() etc.
 */

/* Queue node for the cancellable lock: doubly linked, one per CPU. */
struct optimistic_spin_node {
	struct optimistic_spin_node *next, *prev;
	int locked; /* 1 if lock acquired */
	int cpu; /* encoded CPU # value */
};

extern bool osq_lock(struct optimistic_spin_queue *lock);
extern void osq_unlock(struct optimistic_spin_queue *lock);

#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c new file mode 100644 index 00000000000..5cf6731b98e --- /dev/null +++ b/kernel/locking/mutex-debug.c @@ -0,0 +1,120 @@
/*
 * kernel/mutex-debug.c
 *
 * Debugging code for mutexes
 *
 * Started by Ingo Molnar:
 *
 *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * lock debugging, locking tree, deadlock detection started by:
 *
 *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
 *  Released under the General Public License (GPL).
 */
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>

#include "mutex-debug.h"

/*
 * Must be called with lock->wait_lock held.
+ */ +void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +{ +	memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); +	waiter->magic = waiter; +	INIT_LIST_HEAD(&waiter->list); +} + +void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) +{ +	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); +	DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); +	DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); +	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); +} + +void debug_mutex_free_waiter(struct mutex_waiter *waiter) +{ +	DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); +	memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); +} + +void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, +			    struct thread_info *ti) +{ +	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + +	/* Mark the current thread as blocked on the lock: */ +	ti->task->blocked_on = waiter; +} + +void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +			 struct thread_info *ti) +{ +	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); +	DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); +	DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); +	ti->task->blocked_on = NULL; + +	list_del_init(&waiter->list); +	waiter->task = NULL; +} + +void debug_mutex_unlock(struct mutex *lock) +{ +	if (likely(debug_locks)) { +		DEBUG_LOCKS_WARN_ON(lock->magic != lock); + +		if (!lock->owner) +			DEBUG_LOCKS_WARN_ON(!lock->owner); +		else +			DEBUG_LOCKS_WARN_ON(lock->owner != current); + +		DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); +		mutex_clear_owner(lock); +	} + +	/* +	 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug +	 * mutexes so that we can do it here after we've verified state. 
+	 */ +	atomic_set(&lock->count, 1); +} + +void debug_mutex_init(struct mutex *lock, const char *name, +		      struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held lock: +	 */ +	debug_check_no_locks_freed((void *)lock, sizeof(*lock)); +	lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +	lock->magic = lock; +} + +/** + * mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void mutex_destroy(struct mutex *lock) +{ +	DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); +	lock->magic = NULL; +} + +EXPORT_SYMBOL_GPL(mutex_destroy); diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h new file mode 100644 index 00000000000..0799fd3e4cf --- /dev/null +++ b/kernel/locking/mutex-debug.h @@ -0,0 +1,55 @@ +/* + * Mutexes: blocking mutual exclusion locks + * + * started by Ingo Molnar: + * + *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * This file contains mutex debugging related internal declarations, + * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. + * More details are in kernel/locking/mutex-debug.c. + */ + +/* + * This must be called with lock->wait_lock held. 
+ */ +extern void debug_mutex_lock_common(struct mutex *lock, +				    struct mutex_waiter *waiter); +extern void debug_mutex_wake_waiter(struct mutex *lock, +				    struct mutex_waiter *waiter); +extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_add_waiter(struct mutex *lock, +				   struct mutex_waiter *waiter, +				   struct thread_info *ti); +extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +				struct thread_info *ti); +extern void debug_mutex_unlock(struct mutex *lock); +extern void debug_mutex_init(struct mutex *lock, const char *name, +			     struct lock_class_key *key); + +static inline void mutex_set_owner(struct mutex *lock) +{ +	lock->owner = current; +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +	lock->owner = NULL; +} + +#define spin_lock_mutex(lock, flags)			\ +	do {						\ +		struct mutex *l = container_of(lock, struct mutex, wait_lock); \ +							\ +		DEBUG_LOCKS_WARN_ON(in_interrupt());	\ +		local_irq_save(flags);			\ +		arch_spin_lock(&(lock)->rlock.raw_lock);\ +		DEBUG_LOCKS_WARN_ON(l->magic != l);	\ +	} while (0) + +#define spin_unlock_mutex(lock, flags)				\ +	do {							\ +		arch_spin_unlock(&(lock)->rlock.raw_lock);	\ +		local_irq_restore(flags);			\ +		preempt_check_resched();			\ +	} while (0) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c new file mode 100644 index 00000000000..acca2c1a3c5 --- /dev/null +++ b/kernel/locking/mutex.c @@ -0,0 +1,930 @@ +/* + * kernel/locking/mutex.c + * + * Mutexes: blocking mutual exclusion locks + * + * Started by Ingo Molnar: + * + *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and + * David Howells for suggestions and improvements. + * + *  - Adaptive spinning for mutexes by Peter Zijlstra. 
(Ported to mainline + *    from the -rt tree, where it was originally implemented for rtmutexes + *    by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + *    and Sven Dietrich. + * + * Also see Documentation/mutex-design.txt. + */ +#include <linux/mutex.h> +#include <linux/ww_mutex.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> +#include <linux/export.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/debug_locks.h> +#include "mcs_spinlock.h" + +/* + * In the DEBUG case we are using the "NULL fastpath" for mutexes, + * which forces all calls into the slowpath: + */ +#ifdef CONFIG_DEBUG_MUTEXES +# include "mutex-debug.h" +# include <asm-generic/mutex-null.h> +/* + * Must be 0 for the debug case so we do not do the unlock outside of the + * wait_lock region. debug_mutex_unlock() will do the actual unlock in this + * case. + */ +# undef __mutex_slowpath_needs_to_unlock +# define  __mutex_slowpath_needs_to_unlock()	0 +#else +# include "mutex.h" +# include <asm/mutex.h> +#endif + +/* + * A negative mutex count indicates that waiters are sleeping waiting for the + * mutex. + */ +#define	MUTEX_SHOW_NO_WAITER(mutex)	(atomic_read(&(mutex)->count) >= 0) + +void +__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +{ +	atomic_set(&lock->count, 1); +	spin_lock_init(&lock->wait_lock); +	INIT_LIST_HEAD(&lock->wait_list); +	mutex_clear_owner(lock); +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +	osq_lock_init(&lock->osq); +#endif + +	debug_mutex_init(lock, name, key); +} + +EXPORT_SYMBOL(__mutex_init); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * We split the mutex lock/unlock logic into separate fastpath and + * slowpath functions, to reduce the register pressure on the fastpath. + * We also put the fastpath first in the kernel image, to make sure the + * branch is predicted by the CPU as default-untaken. 
+ */ +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); + +/** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired + * + * Lock the mutex exclusively for this task. If the mutex is not + * available right now, it will sleep until it can get it. + * + * The mutex must later on be released by the same task that + * acquired it. Recursive locking is not allowed. The task + * may not exit without first unlocking the mutex. Also, kernel + * memory where the mutex resides must not be freed with + * the mutex still locked. The mutex must first be initialized + * (or statically defined) before it can be locked. memset()-ing + * the mutex to 0 is not allowed. + * + * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging + *   checks that will enforce the restrictions and will also do + *   deadlock debugging. ) + * + * This function is similar to (but not equivalent to) down(). + */ +void __sched mutex_lock(struct mutex *lock) +{ +	might_sleep(); +	/* +	 * The locking fastpath is the 1->0 transition from +	 * 'unlocked' into 'locked' state. +	 */ +	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); +	mutex_set_owner(lock); +} + +EXPORT_SYMBOL(mutex_lock); +#endif + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * In order to avoid a stampede of mutex spinners from acquiring the mutex + * more or less simultaneously, the spinners need to acquire an MCS lock + * first before spinning on the owner field. + * + */ + +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ +	if (lock->owner != owner) +		return false; + +	/* +	 * Ensure we emit the owner->on_cpu dereference _after_ checking +	 * lock->owner still matches owner, if that fails, owner might +	 * point to free()d memory, if it still matches, the rcu_read_lock() +	 * ensures the memory stays valid. +	 */ +	barrier(); + +	return owner->on_cpu; +} + +/* + * Look out! 
"owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ +	rcu_read_lock(); +	while (owner_running(lock, owner)) { +		if (need_resched()) +			break; + +		arch_mutex_cpu_relax(); +	} +	rcu_read_unlock(); + +	/* +	 * We break out the loop above on need_resched() and when the +	 * owner changed, which is a sign for heavy contention. Return +	 * success only when lock->owner is NULL. +	 */ +	return lock->owner == NULL; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline int mutex_can_spin_on_owner(struct mutex *lock) +{ +	struct task_struct *owner; +	int retval = 1; + +	if (need_resched()) +		return 0; + +	rcu_read_lock(); +	owner = ACCESS_ONCE(lock->owner); +	if (owner) +		retval = owner->on_cpu; +	rcu_read_unlock(); +	/* +	 * if lock->owner is not set, the mutex owner may have just acquired +	 * it and not set the owner yet or the mutex has been released. +	 */ +	return retval; +} +#endif + +__visible __used noinline +void __sched __mutex_unlock_slowpath(atomic_t *lock_count); + +/** + * mutex_unlock - release the mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously. + * + * This function must not be used in interrupt context. Unlocking + * of a not locked mutex is not allowed. + * + * This function is similar to (but not equivalent to) up(). + */ +void __sched mutex_unlock(struct mutex *lock) +{ +	/* +	 * The unlocking fastpath is the 0->1 transition from 'locked' +	 * into 'unlocked' state: +	 */ +#ifndef CONFIG_DEBUG_MUTEXES +	/* +	 * When debugging is enabled we must not clear the owner before time, +	 * the slow path will always be taken, and that clears the owner field +	 * after verifying that it was indeed current. 
+	 */ +	mutex_clear_owner(lock); +#endif +	__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); +} + +EXPORT_SYMBOL(mutex_unlock); + +/** + * ww_mutex_unlock - release the w/w mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously with any of the + * ww_mutex_lock* functions (with or without an acquire context). It is + * forbidden to release the locks after releasing the acquire context. + * + * This function must not be used in interrupt context. Unlocking + * of an unlocked mutex is not allowed. + */ +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ +	/* +	 * The unlocking fastpath is the 0->1 transition from 'locked' +	 * into 'unlocked' state: +	 */ +	if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES +		DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif +		if (lock->ctx->acquired > 0) +			lock->ctx->acquired--; +		lock->ctx = NULL; +	} + +#ifndef CONFIG_DEBUG_MUTEXES +	/* +	 * When debugging is enabled we must not clear the owner before time, +	 * the slow path will always be taken, and that clears the owner field +	 * after verifying that it was indeed current. 
+	 */ +	mutex_clear_owner(&lock->base); +#endif +	__mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); +} +EXPORT_SYMBOL(ww_mutex_unlock); + +static inline int __sched +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +{ +	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); +	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + +	if (!hold_ctx) +		return 0; + +	if (unlikely(ctx == hold_ctx)) +		return -EALREADY; + +	if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && +	    (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { +#ifdef CONFIG_DEBUG_MUTEXES +		DEBUG_LOCKS_WARN_ON(ctx->contending_lock); +		ctx->contending_lock = ww; +#endif +		return -EDEADLK; +	} + +	return 0; +} + +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, +						   struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES +	/* +	 * If this WARN_ON triggers, you used ww_mutex_lock to acquire, +	 * but released with a normal mutex_unlock in this call. +	 * +	 * This should never happen, always use ww_mutex_unlock. +	 */ +	DEBUG_LOCKS_WARN_ON(ww->ctx); + +	/* +	 * Not quite done after calling ww_acquire_done() ? +	 */ +	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + +	if (ww_ctx->contending_lock) { +		/* +		 * After -EDEADLK you tried to +		 * acquire a different ww_mutex? Bad! +		 */ +		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + +		/* +		 * You called ww_mutex_lock after receiving -EDEADLK, +		 * but 'forgot' to unlock everything else first? +		 */ +		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); +		ww_ctx->contending_lock = NULL; +	} + +	/* +	 * Naughty, using a different class will lead to undefined behavior! +	 */ +	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif +	ww_ctx->acquired++; +} + +/* + * after acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. 
+ * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, +			       struct ww_acquire_ctx *ctx) +{ +	unsigned long flags; +	struct mutex_waiter *cur; + +	ww_mutex_lock_acquired(lock, ctx); + +	lock->ctx = ctx; + +	/* +	 * The lock->ctx update should be visible on all cores before +	 * the atomic read is done, otherwise contended waiters might be +	 * missed. The contended waiters will either see ww_ctx == NULL +	 * and keep spinning, or it will acquire wait_lock, add itself +	 * to waiter list and sleep. +	 */ +	smp_mb(); /* ^^^ */ + +	/* +	 * Check if lock is contended, if not there is nobody to wake up +	 */ +	if (likely(atomic_read(&lock->base.count) == 0)) +		return; + +	/* +	 * Uh oh, we raced in fastpath, wake up everyone in this case, +	 * so they can see the new lock->ctx. +	 */ +	spin_lock_mutex(&lock->base.wait_lock, flags); +	list_for_each_entry(cur, &lock->base.wait_list, list) { +		debug_mutex_wake_waiter(&lock->base, cur); +		wake_up_process(cur->task); +	} +	spin_unlock_mutex(&lock->base.wait_lock, flags); +} + +/* + * Lock a mutex (possibly interruptible), slowpath: + */ +static __always_inline int __sched +__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, +		    struct lockdep_map *nest_lock, unsigned long ip, +		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) +{ +	struct task_struct *task = current; +	struct mutex_waiter waiter; +	unsigned long flags; +	int ret; + +	preempt_disable(); +	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +	/* +	 * Optimistic spinning. +	 * +	 * We try to spin for acquisition when we find that there are no +	 * pending waiters and the lock owner is currently running on a +	 * (different) CPU. 
+	 * +	 * The rationale is that if the lock owner is running, it is likely to +	 * release the lock soon. +	 * +	 * Since this needs the lock owner, and this mutex implementation +	 * doesn't track the owner atomically in the lock field, we need to +	 * track it non-atomically. +	 * +	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock +	 * to serialize everything. +	 * +	 * The mutex spinners are queued up using MCS lock so that only one +	 * spinner can compete for the mutex. However, if mutex spinning isn't +	 * going to happen, there is no point in going through the lock/unlock +	 * overhead. +	 */ +	if (!mutex_can_spin_on_owner(lock)) +		goto slowpath; + +	if (!osq_lock(&lock->osq)) +		goto slowpath; + +	for (;;) { +		struct task_struct *owner; + +		if (use_ww_ctx && ww_ctx->acquired > 0) { +			struct ww_mutex *ww; + +			ww = container_of(lock, struct ww_mutex, base); +			/* +			 * If ww->ctx is set the contents are undefined, only +			 * by acquiring wait_lock there is a guarantee that +			 * they are not invalid when reading. +			 * +			 * As such, when deadlock detection needs to be +			 * performed the optimistic spinning cannot be done. +			 */ +			if (ACCESS_ONCE(ww->ctx)) +				break; +		} + +		/* +		 * If there's an owner, wait for it to either +		 * release the lock or go to sleep. +		 */ +		owner = ACCESS_ONCE(lock->owner); +		if (owner && !mutex_spin_on_owner(lock, owner)) +			break; + +		if ((atomic_read(&lock->count) == 1) && +		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { +			lock_acquired(&lock->dep_map, ip); +			if (use_ww_ctx) { +				struct ww_mutex *ww; +				ww = container_of(lock, struct ww_mutex, base); + +				ww_mutex_set_context_fastpath(ww, ww_ctx); +			} + +			mutex_set_owner(lock); +			osq_unlock(&lock->osq); +			preempt_enable(); +			return 0; +		} + +		/* +		 * When there's no owner, we might have preempted between the +		 * owner acquiring the lock and setting the owner field. 
If +		 * we're an RT task that will live-lock because we won't let +		 * the owner complete. +		 */ +		if (!owner && (need_resched() || rt_task(task))) +			break; + +		/* +		 * The cpu_relax() call is a compiler barrier which forces +		 * everything in this loop to be re-loaded. We don't need +		 * memory barriers as we'll eventually observe the right +		 * values at the cost of a few extra spins. +		 */ +		arch_mutex_cpu_relax(); +	} +	osq_unlock(&lock->osq); +slowpath: +	/* +	 * If we fell out of the spin path because of need_resched(), +	 * reschedule now, before we try-lock the mutex. This avoids getting +	 * scheduled out right after we obtained the mutex. +	 */ +	if (need_resched()) +		schedule_preempt_disabled(); +#endif +	spin_lock_mutex(&lock->wait_lock, flags); + +	/* once more, can we acquire the lock? */ +	if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) +		goto skip_wait; + +	debug_mutex_lock_common(lock, &waiter); +	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + +	/* add waiting tasks to the end of the waitqueue (FIFO): */ +	list_add_tail(&waiter.list, &lock->wait_list); +	waiter.task = task; + +	lock_contended(&lock->dep_map, ip); + +	for (;;) { +		/* +		 * Lets try to take the lock again - this is needed even if +		 * we get here for the first time (shortly after failing to +		 * acquire the lock), to make sure that we get a wakeup once +		 * it's unlocked. Later on, if we sleep, this is the +		 * operation that gives us the lock. We xchg it to -1, so +		 * that when we release the lock, we properly wake up the +		 * other waiters: +		 */ +		if (MUTEX_SHOW_NO_WAITER(lock) && +		    (atomic_xchg(&lock->count, -1) == 1)) +			break; + +		/* +		 * got a signal? (This code gets eliminated in the +		 * TASK_UNINTERRUPTIBLE case.) 
+		 */ +		if (unlikely(signal_pending_state(state, task))) { +			ret = -EINTR; +			goto err; +		} + +		if (use_ww_ctx && ww_ctx->acquired > 0) { +			ret = __mutex_lock_check_stamp(lock, ww_ctx); +			if (ret) +				goto err; +		} + +		__set_task_state(task, state); + +		/* didn't get the lock, go to sleep: */ +		spin_unlock_mutex(&lock->wait_lock, flags); +		schedule_preempt_disabled(); +		spin_lock_mutex(&lock->wait_lock, flags); +	} +	mutex_remove_waiter(lock, &waiter, current_thread_info()); +	/* set it to 0 if there are no waiters left: */ +	if (likely(list_empty(&lock->wait_list))) +		atomic_set(&lock->count, 0); +	debug_mutex_free_waiter(&waiter); + +skip_wait: +	/* got the lock - cleanup and rejoice! */ +	lock_acquired(&lock->dep_map, ip); +	mutex_set_owner(lock); + +	if (use_ww_ctx) { +		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); +		struct mutex_waiter *cur; + +		/* +		 * This branch gets optimized out for the common case, +		 * and is only important for ww_mutex_lock. +		 */ +		ww_mutex_lock_acquired(ww, ww_ctx); +		ww->ctx = ww_ctx; + +		/* +		 * Give any possible sleeping processes the chance to wake up, +		 * so they can recheck if they have to back off. 
+		 */ +		list_for_each_entry(cur, &lock->wait_list, list) { +			debug_mutex_wake_waiter(lock, cur); +			wake_up_process(cur->task); +		} +	} + +	spin_unlock_mutex(&lock->wait_lock, flags); +	preempt_enable(); +	return 0; + +err: +	mutex_remove_waiter(lock, &waiter, task_thread_info(task)); +	spin_unlock_mutex(&lock->wait_lock, flags); +	debug_mutex_free_waiter(&waiter); +	mutex_release(&lock->dep_map, 1, ip); +	preempt_enable(); +	return ret; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched +mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ +	might_sleep(); +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, +			    subclass, NULL, _RET_IP_, NULL, 0); +} + +EXPORT_SYMBOL_GPL(mutex_lock_nested); + +void __sched +_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ +	might_sleep(); +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, +			    0, nest, _RET_IP_, NULL, 0); +} + +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + +int __sched +mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) +{ +	might_sleep(); +	return __mutex_lock_common(lock, TASK_KILLABLE, +				   subclass, NULL, _RET_IP_, NULL, 0); +} +EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); + +int __sched +mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) +{ +	might_sleep(); +	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, +				   subclass, NULL, _RET_IP_, NULL, 0); +} + +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +static inline int +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH +	unsigned tmp; + +	if (ctx->deadlock_inject_countdown-- == 0) { +		tmp = ctx->deadlock_inject_interval; +		if (tmp > UINT_MAX/4) +			tmp = UINT_MAX; +		else +			tmp = tmp*2 + tmp + tmp/2; + +		ctx->deadlock_inject_interval = tmp; +		ctx->deadlock_inject_countdown = tmp; +		ctx->contending_lock = lock; + +		ww_mutex_unlock(lock); + +		return -EDEADLK; +	} +#endif + +	return 
0; +} + +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	int ret; + +	might_sleep(); +	ret =  __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, +				   0, &ctx->dep_map, _RET_IP_, ctx, 1); +	if (!ret && ctx->acquired > 1) +		return ww_mutex_deadlock_injection(lock, ctx); + +	return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	int ret; + +	might_sleep(); +	ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, +				  0, &ctx->dep_map, _RET_IP_, ctx, 1); + +	if (!ret && ctx->acquired > 1) +		return ww_mutex_deadlock_injection(lock, ctx); + +	return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); + +#endif + +/* + * Release the lock, slowpath: + */ +static inline void +__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) +{ +	struct mutex *lock = container_of(lock_count, struct mutex, count); +	unsigned long flags; + +	/* +	 * some architectures leave the lock unlocked in the fastpath failure +	 * case, others need to leave it locked. 
In the latter case we have to +	 * unlock it here +	 */ +	if (__mutex_slowpath_needs_to_unlock()) +		atomic_set(&lock->count, 1); + +	spin_lock_mutex(&lock->wait_lock, flags); +	mutex_release(&lock->dep_map, nested, _RET_IP_); +	debug_mutex_unlock(lock); + +	if (!list_empty(&lock->wait_list)) { +		/* get the first entry from the wait-list: */ +		struct mutex_waiter *waiter = +				list_entry(lock->wait_list.next, +					   struct mutex_waiter, list); + +		debug_mutex_wake_waiter(lock, waiter); + +		wake_up_process(waiter->task); +	} + +	spin_unlock_mutex(&lock->wait_lock, flags); +} + +/* + * Release the lock, slowpath: + */ +__visible void +__mutex_unlock_slowpath(atomic_t *lock_count) +{ +	__mutex_unlock_common_slowpath(lock_count, 1); +} + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * Here come the less common (and hence less performance-critical) APIs: + * mutex_lock_interruptible() and mutex_trylock(). + */ +static noinline int __sched +__mutex_lock_killable_slowpath(struct mutex *lock); + +static noinline int __sched +__mutex_lock_interruptible_slowpath(struct mutex *lock); + +/** + * mutex_lock_interruptible - acquire the mutex, interruptible + * @lock: the mutex to be acquired + * + * Lock the mutex like mutex_lock(): sleep until the mutex becomes + * available and return 0 once it has been acquired. If a + * signal arrives while waiting for the lock then this function + * returns -EINTR. + * + * This function is similar to (but not equivalent to) down_interruptible(). 
+ */ +int __sched mutex_lock_interruptible(struct mutex *lock) +{ +	int ret; + +	might_sleep(); +	ret =  __mutex_fastpath_lock_retval(&lock->count); +	if (likely(!ret)) { +		mutex_set_owner(lock); +		return 0; +	} else +		return __mutex_lock_interruptible_slowpath(lock); +} + +EXPORT_SYMBOL(mutex_lock_interruptible); + +int __sched mutex_lock_killable(struct mutex *lock) +{ +	int ret; + +	might_sleep(); +	ret = __mutex_fastpath_lock_retval(&lock->count); +	if (likely(!ret)) { +		mutex_set_owner(lock); +		return 0; +	} else +		return __mutex_lock_killable_slowpath(lock); +} +EXPORT_SYMBOL(mutex_lock_killable); + +__visible void __sched +__mutex_lock_slowpath(atomic_t *lock_count) +{ +	struct mutex *lock = container_of(lock_count, struct mutex, count); + +	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, +			    NULL, _RET_IP_, NULL, 0); +} + +static noinline int __sched +__mutex_lock_killable_slowpath(struct mutex *lock) +{ +	return __mutex_lock_common(lock, TASK_KILLABLE, 0, +				   NULL, _RET_IP_, NULL, 0); +} + +static noinline int __sched +__mutex_lock_interruptible_slowpath(struct mutex *lock) +{ +	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, +				   NULL, _RET_IP_, NULL, 0); +} + +static noinline int __sched +__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, +				   NULL, _RET_IP_, ctx, 1); +} + +static noinline int __sched +__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, +					    struct ww_acquire_ctx *ctx) +{ +	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, +				   NULL, _RET_IP_, ctx, 1); +} + +#endif + +/* + * Spinlock based trylock, we take the spinlock and check whether we + * can get the lock: + */ +static inline int __mutex_trylock_slowpath(atomic_t *lock_count) +{ +	struct mutex *lock = container_of(lock_count, struct mutex, count); +	unsigned long flags; +	int prev; + +	spin_lock_mutex(&lock->wait_lock, flags); + 
+	prev = atomic_xchg(&lock->count, -1); +	if (likely(prev == 1)) { +		mutex_set_owner(lock); +		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); +	} + +	/* Set it back to 0 if there are no waiters: */ +	if (likely(list_empty(&lock->wait_list))) +		atomic_set(&lock->count, 0); + +	spin_unlock_mutex(&lock->wait_lock, flags); + +	return prev == 1; +} + +/** + * mutex_trylock - try to acquire the mutex, without waiting + * @lock: the mutex to be acquired + * + * Try to acquire the mutex atomically. Returns 1 if the mutex + * has been acquired successfully, and 0 on contention. + * + * NOTE: this function follows the spin_trylock() convention, so + * it is negated from the down_trylock() return values! Be careful + * about this when converting semaphore users to mutexes. + * + * This function must not be used in interrupt context. The + * mutex must be released by the same task that acquired it. + */ +int __sched mutex_trylock(struct mutex *lock) +{ +	int ret; + +	ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); +	if (ret) +		mutex_set_owner(lock); + +	return ret; +} +EXPORT_SYMBOL(mutex_trylock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	int ret; + +	might_sleep(); + +	ret = __mutex_fastpath_lock_retval(&lock->base.count); + +	if (likely(!ret)) { +		ww_mutex_set_context_fastpath(lock, ctx); +		mutex_set_owner(&lock->base); +	} else +		ret = __ww_mutex_lock_slowpath(lock, ctx); +	return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +	int ret; + +	might_sleep(); + +	ret = __mutex_fastpath_lock_retval(&lock->base.count); + +	if (likely(!ret)) { +		ww_mutex_set_context_fastpath(lock, ctx); +		mutex_set_owner(&lock->base); +	} else +		ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); +	return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock_interruptible); + +#endif + +/** + * 
atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ +	/* dec if we can't possibly hit 0 */ +	if (atomic_add_unless(cnt, -1, 1)) +		return 0; +	/* we might hit 0, so take the lock */ +	mutex_lock(lock); +	if (!atomic_dec_and_test(cnt)) { +		/* when we actually did the dec, we didn't hit 0 */ +		mutex_unlock(lock); +		return 0; +	} +	/* we hit 0, and we hold the lock */ +	return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h new file mode 100644 index 00000000000..4115fbf83b1 --- /dev/null +++ b/kernel/locking/mutex.h @@ -0,0 +1,48 @@ +/* + * Mutexes: blocking mutual exclusion locks + * + * started by Ingo Molnar: + * + *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * This file contains mutex debugging related internal prototypes, for the + * !CONFIG_DEBUG_MUTEXES case. 
Most of them are NOPs: + */ + +#define spin_lock_mutex(lock, flags) \ +		do { spin_lock(lock); (void)(flags); } while (0) +#define spin_unlock_mutex(lock, flags) \ +		do { spin_unlock(lock); (void)(flags); } while (0) +#define mutex_remove_waiter(lock, waiter, ti) \ +		__list_del((waiter)->list.prev, (waiter)->list.next) + +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ +	lock->owner = current; +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +	lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + +#define debug_mutex_wake_waiter(lock, waiter)		do { } while (0) +#define debug_mutex_free_waiter(waiter)			do { } while (0) +#define debug_mutex_add_waiter(lock, waiter, ti)	do { } while (0) +#define debug_mutex_unlock(lock)			do { } while (0) +#define debug_mutex_init(lock, name, key)		do { } while (0) + +static inline void +debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +{ +} diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c new file mode 100644 index 00000000000..652a8ee8efe --- /dev/null +++ b/kernel/locking/percpu-rwsem.c @@ -0,0 +1,165 @@ +#include <linux/atomic.h> +#include <linux/rwsem.h> +#include <linux/percpu.h> +#include <linux/wait.h> +#include <linux/lockdep.h> +#include <linux/percpu-rwsem.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/errno.h> + +int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +			const char *name, struct lock_class_key *rwsem_key) +{ +	brw->fast_read_ctr = alloc_percpu(int); +	if (unlikely(!brw->fast_read_ctr)) +		return -ENOMEM; + +	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ +	__init_rwsem(&brw->rw_sem, name, rwsem_key); +	atomic_set(&brw->write_ctr, 0); +	atomic_set(&brw->slow_read_ctr, 0); +	init_waitqueue_head(&brw->write_waitq); +	return 0; +} + +void 
percpu_free_rwsem(struct percpu_rw_semaphore *brw) +{ +	free_percpu(brw->fast_read_ctr); +	brw->fast_read_ctr = NULL; /* catch use after free bugs */ +} + +/* + * This is the fast-path for down_read/up_read, it only needs to ensure + * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the + * fast per-cpu counter. The writer uses synchronize_sched_expedited() to + * serialize with the preempt-disabled section below. + * + * The nontrivial part is that we should guarantee acquire/release semantics + * in case when + * + *	R_W: down_write() comes after up_read(), the writer should see all + *	     changes done by the reader + * or + *	W_R: down_read() comes after up_write(), the reader should see all + *	     changes done by the writer + * + * If this helper fails the callers rely on the normal rw_semaphore and + * atomic_dec_and_test(), so in this case we have the necessary barriers. + * + * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or + * __this_cpu_add() below can be reordered with any LOAD/STORE done by the + * reader inside the critical section. See the comments in down_write and + * up_write below. + */ +static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +{ +	bool success = false; + +	preempt_disable(); +	if (likely(!atomic_read(&brw->write_ctr))) { +		__this_cpu_add(*brw->fast_read_ctr, val); +		success = true; +	} +	preempt_enable(); + +	return success; +} + +/* + * Like the normal down_read() this is not recursive, the writer can + * come after the first percpu_down_read() and create the deadlock. + * + * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, + * percpu_up_read() does rwsem_release(). This pairs with the usage + * of ->rw_sem in percpu_down/up_write(). 
+ */
+void percpu_down_read(struct percpu_rw_semaphore *brw)
+{
+	might_sleep();
+	if (likely(update_fast_ctr(brw, +1))) {	/* no pending writer */
+		rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+		return;
+	}
+
+	down_read(&brw->rw_sem);	/* slow path: go through ->rw_sem */
+	atomic_inc(&brw->slow_read_ctr);
+	/* avoid up_read()->rwsem_release() */
+	__up_read(&brw->rw_sem);
+}
+/*
+ * Drop a read lock. If no writer is pending the fast per-cpu counter is
+ * decremented; otherwise slow_read_ctr is decremented and, when it hits
+ * zero, everybody sleeping on write_waitq is woken.
+ */
+void percpu_up_read(struct percpu_rw_semaphore *brw)
+{
+	rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
+
+	if (likely(update_fast_ctr(brw, -1)))
+		return;
+
+	/* false-positive is possible but harmless */
+	if (atomic_dec_and_test(&brw->slow_read_ctr))
+		wake_up_all(&brw->write_waitq);
+}
+/*
+ * Sum fast_read_ctr over all possible CPUs, resetting every slot to
+ * zero, and return the sum.
+ */
+static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+{
+	unsigned int sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		sum += per_cpu(*brw->fast_read_ctr, cpu);
+		per_cpu(*brw->fast_read_ctr, cpu) = 0;
+	}
+
+	return sum;
+}
+
+/*
+ * A writer increments ->write_ctr to force the readers to switch to the
+ * slow mode, note the atomic_read() check in update_fast_ctr().
+ *
+ * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
+ * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
+ * counter it represents the number of active readers.
+ *
+ * Finally the writer takes ->rw_sem for writing and blocks the new readers,
+ * then waits until the slow counter becomes zero.
+ */
+void percpu_down_write(struct percpu_rw_semaphore *brw)
+{
+	/* tell update_fast_ctr() there is a pending writer */
+	atomic_inc(&brw->write_ctr);
+	/*
+	 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
+	 *    so that update_fast_ctr() can't succeed.
+	 *
+	 * 2. Ensures we see the result of every previous this_cpu_add() in
+	 *    update_fast_ctr().
+	 *
+	 * 3. Ensures that if any reader has exited its critical section via
+	 *    fast-path, it executes a full memory barrier before we return.
+	 *    See R_W case in the comment above update_fast_ctr().
+	 */
+	synchronize_sched_expedited();
+
+	/* exclude other writers, and block the new readers completely */
+	down_write(&brw->rw_sem);
+
+	/* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
+	atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+
+	/* wait for all readers to complete their percpu_up_read() */
+	wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+}
+/*
+ * Release the write lock taken by percpu_down_write(). ->write_ctr is
+ * dropped last, so readers stay on the slow path until the
+ * synchronize_sched_expedited() below has completed.
+ */
+void percpu_up_write(struct percpu_rw_semaphore *brw)
+{
+	/* release the lock, but the readers can't use the fast-path */
+	up_write(&brw->rw_sem);
+	/*
+	 * Insert the barrier before the next fast-path in down_read,
+	 * see W_R case in the comment above update_fast_ctr().
+	 */
+	synchronize_sched_expedited();
+	/* the last writer unblocks update_fast_ctr() */
+	atomic_dec(&brw->write_ctr);
+}
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 00000000000..fb5b8ac411a
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,133 @@
+/*
+ * Queue read/write lock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/qrwlock.h>
+
+/**
+ * rspin_until_writer_unlock - spin until the writer is gone
+ * @lock  : Pointer to queue rwlock structure
+ * @cnts  : Lock word value last observed by the caller
+ *
+ * In interrupt context or at the head of the queue, the reader will just
+ * increment the reader count & wait until the writer releases the lock.
+ * This helper implements only the waiting part: it re-reads the lock word
+ * with smp_load_acquire() for as long as (cnts & _QW_WMASK) == _QW_LOCKED.
+ */
+static __always_inline void
+rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
+{
+	while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+		arch_mutex_cpu_relax();
+		cnts = smp_load_acquire((u32 *)&lock->cnts);
+	}
+}
+
+/**
+ * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * @lock: Pointer to queue rwlock structure
+ *
+ * NOTE(review): the interrupt path below never adds _QR_BIAS and the
+ * queued path starts by subtracting it, so the caller has presumably
+ * added the reader bias already - confirm against asm/qrwlock.h.
+ */
+void queue_read_lock_slowpath(struct qrwlock *lock)
+{
+	u32 cnts;
+
+	/*
+	 * Readers come here when they cannot get the lock without waiting
+	 */
+	if (unlikely(in_interrupt())) {
+		/*
+		 * Readers in interrupt context will spin until the lock is
+		 * available without waiting in the queue.
+		 */
+		cnts = smp_load_acquire((u32 *)&lock->cnts);
+		rspin_until_writer_unlock(lock, cnts);
+		return;
+	}
+	atomic_sub(_QR_BIAS, &lock->cnts);	/* re-added below at queue head */
+
+	/*
+	 * Put the reader into the wait queue
+	 */
+	arch_spin_lock(&lock->lock);
+
+	/*
+	 * At the head of the wait queue now, wait until the writer state
+	 * goes to 0 and then try to increment the reader count and get
+	 * the lock. It is possible that an incoming writer may steal the
+	 * lock in the interim, so it is necessary to check the writer byte
+	 * to make sure that the write lock isn't taken.
+	 */
+	while (atomic_read(&lock->cnts) & _QW_WMASK)
+		arch_mutex_cpu_relax();
+
+	cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;	/* value before our add */
+	rspin_until_writer_unlock(lock, cnts);
+
+	/*
+	 * Signal the next one in queue to become queue head
+	 */
+	arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_read_lock_slowpath);
+
+/**
+ * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ *
+ * Queues on lock->lock, then either grabs the lock directly when the
+ * lock word is 0, or first sets _QW_WAITING and afterwards converts
+ * _QW_WAITING into _QW_LOCKED once the lock word equals _QW_WAITING.
+ */
+void queue_write_lock_slowpath(struct qrwlock *lock)
+{
+	u32 cnts;
+
+	/* Put the writer into the wait queue */
+	arch_spin_lock(&lock->lock);
+
+	/* Try to acquire the lock directly if no reader is present */
+	if (!atomic_read(&lock->cnts) &&
+	    (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+		goto unlock;
+
+	/*
+	 * Set the waiting flag to notify readers that a writer is pending,
+	 * or wait for a previous writer to go away.
+	 */
+	for (;;) {
+		cnts = atomic_read(&lock->cnts);
+		if (!(cnts & _QW_WMASK) &&
+		    (atomic_cmpxchg(&lock->cnts, cnts,
+				    cnts | _QW_WAITING) == cnts))
+			break;
+
+		arch_mutex_cpu_relax();
+	}
+
+	/* When no more readers, set the locked flag */
+	for (;;) {
+		cnts = atomic_read(&lock->cnts);
+		if ((cnts == _QW_WAITING) &&
+		    (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
+				    _QW_LOCKED) == _QW_WAITING))
+			break;
+
+		arch_mutex_cpu_relax();
+	}
+unlock:
+	arch_spin_unlock(&lock->lock);	/* leave the wait queue */
+}
+EXPORT_SYMBOL(queue_write_lock_slowpath);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
new file mode 100644
index 00000000000..49b2ed3dced
--- /dev/null
+++ b/kernel/locking/rtmutex-debug.c
@@ -0,0 +1,183 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the
preempt-rt tree. + * Portions of said code are + * + *  Copyright (C) 2004  LynuxWorks, Inc., Igor Manyilov, Bill Huey + *  Copyright (C) 2006  Esben Nielsen + *  Copyright (C) 2006  Kihon Technologies Inc., + *			Steven Rostedt <rostedt@goodmis.org> + * + * See rt.c in preempt-rt for proper credits and further information + */ +#include <linux/sched.h> +#include <linux/sched/rt.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/spinlock.h> +#include <linux/kallsyms.h> +#include <linux/syscalls.h> +#include <linux/interrupt.h> +#include <linux/rbtree.h> +#include <linux/fs.h> +#include <linux/debug_locks.h> + +#include "rtmutex_common.h" + +static void printk_task(struct task_struct *p) +{ +	if (p) +		printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); +	else +		printk("<none>"); +} + +static void printk_lock(struct rt_mutex *lock, int print_owner) +{ +	if (lock->name) +		printk(" [%p] {%s}\n", +			lock, lock->name); +	else +		printk(" [%p] {%s:%d}\n", +			lock, lock->file, lock->line); + +	if (print_owner && rt_mutex_owner(lock)) { +		printk(".. ->owner: %p\n", lock->owner); +		printk(".. held by:  "); +		printk_task(rt_mutex_owner(lock)); +		printk("\n"); +	} +} + +void rt_mutex_debug_task_free(struct task_struct *task) +{ +	DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); +	DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); +} + +/* + * We fill out the fields in the waiter to store the information about + * the deadlock. We print when we return. act_waiter can be NULL in + * case of a remove waiter operation. 
+ */ +void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, +			     struct rt_mutex *lock) +{ +	struct task_struct *task; + +	if (!debug_locks || detect || !act_waiter) +		return; + +	task = rt_mutex_owner(act_waiter->lock); +	if (task && task != current) { +		act_waiter->deadlock_task_pid = get_pid(task_pid(task)); +		act_waiter->deadlock_lock = lock; +	} +} + +void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) +{ +	struct task_struct *task; + +	if (!waiter->deadlock_lock || !debug_locks) +		return; + +	rcu_read_lock(); +	task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); +	if (!task) { +		rcu_read_unlock(); +		return; +	} + +	if (!debug_locks_off()) { +		rcu_read_unlock(); +		return; +	} + +	printk("\n============================================\n"); +	printk(  "[ BUG: circular locking deadlock detected! ]\n"); +	printk("%s\n", print_tainted()); +	printk(  "--------------------------------------------\n"); +	printk("%s/%d is deadlocking current task %s/%d\n\n", +	       task->comm, task_pid_nr(task), +	       current->comm, task_pid_nr(current)); + +	printk("\n1) %s/%d is trying to acquire this lock:\n", +	       current->comm, task_pid_nr(current)); +	printk_lock(waiter->lock, 1); + +	printk("\n2) %s/%d is blocked on this lock:\n", +		task->comm, task_pid_nr(task)); +	printk_lock(waiter->deadlock_lock, 1); + +	debug_show_held_locks(current); +	debug_show_held_locks(task); + +	printk("\n%s/%d's [blocked] stackdump:\n\n", +		task->comm, task_pid_nr(task)); +	show_stack(task, NULL); +	printk("\n%s/%d's [current] stackdump:\n\n", +		current->comm, task_pid_nr(current)); +	dump_stack(); +	debug_show_all_locks(); +	rcu_read_unlock(); + +	printk("[ turning off deadlock detection." +	       "Please report this trace. 
]\n\n"); +} + +void debug_rt_mutex_lock(struct rt_mutex *lock) +{ +} + +void debug_rt_mutex_unlock(struct rt_mutex *lock) +{ +	DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); +} + +void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) +{ +} + +void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +{ +	DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); +} + +void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ +	memset(waiter, 0x11, sizeof(*waiter)); +	waiter->deadlock_task_pid = NULL; +} + +void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ +	put_pid(waiter->deadlock_task_pid); +	memset(waiter, 0x22, sizeof(*waiter)); +} + +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +{ +	/* +	 * Make sure we are not reinitializing a held lock: +	 */ +	debug_check_no_locks_freed((void *)lock, sizeof(*lock)); +	lock->name = name; +} + +void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) +{ +} + +void rt_mutex_deadlock_account_unlock(struct task_struct *task) +{ +} + diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h new file mode 100644 index 00000000000..ab29b6a2266 --- /dev/null +++ b/kernel/locking/rtmutex-debug.h @@ -0,0 +1,38 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains macros used solely by rtmutex.c. Debug version. 
+ */
+/*
+ * Debug hooks defined in rtmutex-debug.c.
+ * debug_rt_mutex_reset_waiter() clears the waiter's recorded
+ * deadlock_lock, which is what debug_rt_mutex_print_deadlock() tests
+ * before printing anything.
+ */
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+				      struct task_struct *powner);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+				    struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w)			\
+	do { (w)->deadlock_lock = NULL; } while (0)
+/* Detection is on whenever a waiter is supplied; @detect is ignored. */
+static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+						 int detect)
+{
+	return (waiter != NULL);
+}
+/* Thin wrapper around debug_rt_mutex_print_deadlock(). */
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+	debug_rt_mutex_print_deadlock(w);
+}
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
new file mode 100644
index 00000000000..1d96dd0d93c
--- /dev/null
+++ b/kernel/locking/rtmutex-tester.c
@@ -0,0 +1,420 @@
+/*
+ * RT-Mutex-tester: scriptable tester for rt mutexes
+ *
+ * started by Thomas Gleixner:
+ *
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ */
+#include <linux/device.h>
+#include <linux/kthread.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/freezer.h>
+#include <linux/stat.h>
+
+#include "rtmutex.h"
+/* Sizes of the threads[]/thread_data[] and mutexes[] arrays. */
+#define MAX_RT_TEST_THREADS	8
+#define MAX_RT_TEST_MUTEXES	8
+/* rttest_lock is held while the status line is formatted; rttest_event is the global event counter. */
+static spinlock_t rttest_lock;
+static atomic_t rttest_event;
+
+struct test_thread_data { +	int			opcode; +	int			opdata; +	int			mutexes[MAX_RT_TEST_MUTEXES]; +	int			event; +	struct device		dev; +}; + +static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; +static struct task_struct *threads[MAX_RT_TEST_THREADS]; +static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; + +enum test_opcodes { +	RTTEST_NOP = 0, +	RTTEST_SCHEDOT,		/* 1 Sched other, data = nice */ +	RTTEST_SCHEDRT,		/* 2 Sched fifo, data = prio */ +	RTTEST_LOCK,		/* 3 Lock uninterruptible, data = lockindex */ +	RTTEST_LOCKNOWAIT,	/* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ +	RTTEST_LOCKINT,		/* 5 Lock interruptible, data = lockindex */ +	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */ +	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */ +	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */ +	/* 9, 10 - reserved for BKL commemoration */ +	RTTEST_SIGNAL = 11,	/* 11 Signal other test thread, data = thread id */ +	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */ +	RTTEST_RESET = 99,	/* 99 Reset all pending operations */ +}; + +static int handle_op(struct test_thread_data *td, int lockwakeup) +{ +	int i, id, ret = -EINVAL; + +	switch(td->opcode) { + +	case RTTEST_NOP: +		return 0; + +	case RTTEST_LOCKCONT: +		td->mutexes[td->opdata] = 1; +		td->event = atomic_add_return(1, &rttest_event); +		return 0; + +	case RTTEST_RESET: +		for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { +			if (td->mutexes[i] == 4) { +				rt_mutex_unlock(&mutexes[i]); +				td->mutexes[i] = 0; +			} +		} +		return 0; + +	case RTTEST_RESETEVENT: +		atomic_set(&rttest_event, 0); +		return 0; + +	default: +		if (lockwakeup) +			return ret; +	} + +	switch(td->opcode) { + +	case RTTEST_LOCK: +	case RTTEST_LOCKNOWAIT: +		id = td->opdata; +		if (id < 0 || id >= MAX_RT_TEST_MUTEXES) +			return ret; + +		td->mutexes[id] = 1; +		td->event = atomic_add_return(1, &rttest_event); +		rt_mutex_lock(&mutexes[id]); +		td->event = 
atomic_add_return(1, &rttest_event); +		td->mutexes[id] = 4; +		return 0; + +	case RTTEST_LOCKINT: +	case RTTEST_LOCKINTNOWAIT: +		id = td->opdata; +		if (id < 0 || id >= MAX_RT_TEST_MUTEXES) +			return ret; + +		td->mutexes[id] = 1; +		td->event = atomic_add_return(1, &rttest_event); +		ret = rt_mutex_lock_interruptible(&mutexes[id], 0); +		td->event = atomic_add_return(1, &rttest_event); +		td->mutexes[id] = ret ? 0 : 4; +		return ret ? -EINTR : 0; + +	case RTTEST_UNLOCK: +		id = td->opdata; +		if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) +			return ret; + +		td->event = atomic_add_return(1, &rttest_event); +		rt_mutex_unlock(&mutexes[id]); +		td->event = atomic_add_return(1, &rttest_event); +		td->mutexes[id] = 0; +		return 0; + +	default: +		break; +	} +	return ret; +} + +/* + * Schedule replacement for rtsem_down(). Only called for threads with + * PF_MUTEX_TESTER set. + * + * This allows us to have finegrained control over the event flow. + * + */ +void schedule_rt_mutex_test(struct rt_mutex *mutex) +{ +	int tid, op, dat; +	struct test_thread_data *td; + +	/* We have to lookup the task */ +	for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { +		if (threads[tid] == current) +			break; +	} + +	BUG_ON(tid == MAX_RT_TEST_THREADS); + +	td = &thread_data[tid]; + +	op = td->opcode; +	dat = td->opdata; + +	switch (op) { +	case RTTEST_LOCK: +	case RTTEST_LOCKINT: +	case RTTEST_LOCKNOWAIT: +	case RTTEST_LOCKINTNOWAIT: +		if (mutex != &mutexes[dat]) +			break; + +		if (td->mutexes[dat] != 1) +			break; + +		td->mutexes[dat] = 2; +		td->event = atomic_add_return(1, &rttest_event); +		break; + +	default: +		break; +	} + +	schedule(); + + +	switch (op) { +	case RTTEST_LOCK: +	case RTTEST_LOCKINT: +		if (mutex != &mutexes[dat]) +			return; + +		if (td->mutexes[dat] != 2) +			return; + +		td->mutexes[dat] = 3; +		td->event = atomic_add_return(1, &rttest_event); +		break; + +	case RTTEST_LOCKNOWAIT: +	case RTTEST_LOCKINTNOWAIT: +		if (mutex != &mutexes[dat]) 
+			return; + +		if (td->mutexes[dat] != 2) +			return; + +		td->mutexes[dat] = 1; +		td->event = atomic_add_return(1, &rttest_event); +		return; + +	default: +		return; +	} + +	td->opcode = 0; + +	for (;;) { +		set_current_state(TASK_INTERRUPTIBLE); + +		if (td->opcode > 0) { +			int ret; + +			set_current_state(TASK_RUNNING); +			ret = handle_op(td, 1); +			set_current_state(TASK_INTERRUPTIBLE); +			if (td->opcode == RTTEST_LOCKCONT) +				break; +			td->opcode = ret; +		} + +		/* Wait for the next command to be executed */ +		schedule(); +	} + +	/* Restore previous command and data */ +	td->opcode = op; +	td->opdata = dat; +} + +static int test_func(void *data) +{ +	struct test_thread_data *td = data; +	int ret; + +	current->flags |= PF_MUTEX_TESTER; +	set_freezable(); +	allow_signal(SIGHUP); + +	for(;;) { + +		set_current_state(TASK_INTERRUPTIBLE); + +		if (td->opcode > 0) { +			set_current_state(TASK_RUNNING); +			ret = handle_op(td, 0); +			set_current_state(TASK_INTERRUPTIBLE); +			td->opcode = ret; +		} + +		/* Wait for the next command to be executed */ +		schedule(); +		try_to_freeze(); + +		if (signal_pending(current)) +			flush_signals(current); + +		if(kthread_should_stop()) +			break; +	} +	return 0; +} + +/** + * sysfs_test_command - interface for test commands + * @dev:	thread reference + * @buf:	command for actual step + * @count:	length of buffer + * + * command syntax: + * + * opcode:data + */ +static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, +				  const char *buf, size_t count) +{ +	struct sched_param schedpar; +	struct test_thread_data *td; +	char cmdbuf[32]; +	int op, dat, tid, ret; + +	td = container_of(dev, struct test_thread_data, dev); +	tid = td->dev.id; + +	/* strings from sysfs write are not 0 terminated! 
*/ +	if (count >= sizeof(cmdbuf)) +		return -EINVAL; + +	/* strip of \n: */ +	if (buf[count-1] == '\n') +		count--; +	if (count < 1) +		return -EINVAL; + +	memcpy(cmdbuf, buf, count); +	cmdbuf[count] = 0; + +	if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) +		return -EINVAL; + +	switch (op) { +	case RTTEST_SCHEDOT: +		schedpar.sched_priority = 0; +		ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); +		if (ret) +			return ret; +		set_user_nice(current, 0); +		break; + +	case RTTEST_SCHEDRT: +		schedpar.sched_priority = dat; +		ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); +		if (ret) +			return ret; +		break; + +	case RTTEST_SIGNAL: +		send_sig(SIGHUP, threads[tid], 0); +		break; + +	default: +		if (td->opcode > 0) +			return -EBUSY; +		td->opdata = dat; +		td->opcode = op; +		wake_up_process(threads[tid]); +	} + +	return count; +} + +/** + * sysfs_test_status - sysfs interface for rt tester + * @dev:	thread to query + * @buf:	char buffer to be filled with thread status info + */ +static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, +				 char *buf) +{ +	struct test_thread_data *td; +	struct task_struct *tsk; +	char *curr = buf; +	int i; + +	td = container_of(dev, struct test_thread_data, dev); +	tsk = threads[td->dev.id]; + +	spin_lock(&rttest_lock); + +	curr += sprintf(curr, +		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", +		td->opcode, td->event, tsk->state, +			(MAX_RT_PRIO - 1) - tsk->prio, +			(MAX_RT_PRIO - 1) - tsk->normal_prio, +		tsk->pi_blocked_on); + +	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) +		curr += sprintf(curr, "%d", td->mutexes[i]); + +	spin_unlock(&rttest_lock); + +	curr += sprintf(curr, ", T: %p, R: %p\n", tsk, +			mutexes[td->dev.id].owner); + +	return curr - buf; +} + +static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); +static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); + +static struct bus_type rttest_subsys = { +	.name = "rttest", +	.dev_name 
= "rttest", +}; + +static int init_test_thread(int id) +{ +	thread_data[id].dev.bus = &rttest_subsys; +	thread_data[id].dev.id = id; + +	threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); +	if (IS_ERR(threads[id])) +		return PTR_ERR(threads[id]); + +	return device_register(&thread_data[id].dev); +} + +static int init_rttest(void) +{ +	int ret, i; + +	spin_lock_init(&rttest_lock); + +	for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) +		rt_mutex_init(&mutexes[i]); + +	ret = subsys_system_register(&rttest_subsys, NULL); +	if (ret) +		return ret; + +	for (i = 0; i < MAX_RT_TEST_THREADS; i++) { +		ret = init_test_thread(i); +		if (ret) +			break; +		ret = device_create_file(&thread_data[i].dev, &dev_attr_status); +		if (ret) +			break; +		ret = device_create_file(&thread_data[i].dev, &dev_attr_command); +		if (ret) +			break; +	} + +	printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); + +	return ret; +} + +device_initcall(init_rttest); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c new file mode 100644 index 00000000000..fc605941b9b --- /dev/null +++ b/kernel/locking/rtmutex.c @@ -0,0 +1,1373 @@ +/* + * RT-Mutexes: simple blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner. + * + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + *  Copyright (C) 2006 Esben Nielsen + * + *  See Documentation/rt-mutex-design.txt for details. + */ +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h> +#include <linux/timer.h> + +#include "rtmutex_common.h" + +/* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. 
+ *
+ * owner	bit0
+ * NULL		0	lock is free (fast acquire possible)
+ * NULL		1	lock is free and has waiters and the top waiter
+ *				is going to take the lock*
+ * taskpointer	0	lock is held (fast release possible)
+ * taskpointer	1	lock is held and has waiters**
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 of lock->owner is 0.
+ *
+ * (*) It also can be a transitional state when grabbing the lock
+ * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+ * we need to set the bit0 before looking at the lock, and the owner may be
+ * NULL in this small time, hence this can be a transitional state.
+ *
+ * (**) There is a small time when bit 0 is set but there are no
+ * waiters. This can happen when grabbing the lock in the slow path.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to
+ * set this bit before looking at the lock.
+ */
+/*
+ * Store @owner in lock->owner, with RT_MUTEX_HAS_WAITERS or'ed in when
+ * rt_mutex_has_waiters() reports waiters on @lock.
+ */
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+{
+	unsigned long val = (unsigned long)owner;
+
+	if (rt_mutex_has_waiters(lock))
+		val |= RT_MUTEX_HAS_WAITERS;
+
+	lock->owner = (struct task_struct *)val;
+}
+/* Clear RT_MUTEX_HAS_WAITERS in lock->owner with a plain read-modify-write. */
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	lock->owner = (struct task_struct *)
+			((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+/* Drop the waiters bit again if @lock turns out to have no waiters. */
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	if (!rt_mutex_has_waiters(lock))
+		clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ *
+ * In this variant rt_mutex_cmpxchg() evaluates to true when the owner
+ * word was atomically changed from c to n, and mark_rt_mutex_waiters()
+ * sets RT_MUTEX_HAS_WAITERS with a cmpxchg retry loop.
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+	do {
+		owner = *p;	/* snapshot, retried if it changed under us */
+	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+	__releases(lock->wait_lock)
+{
+	struct task_struct *owner = rt_mutex_owner(lock);
+
+	clear_rt_mutex_waiters(lock);
+	raw_spin_unlock(&lock->wait_lock);
+	/*
+	 * If a new waiter comes in between the unlock and the cmpxchg
+	 * we have two situations:
+	 *
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 * cmpxchg(p, owner, 0) == owner
+	 *					mark_rt_mutex_waiters(lock);
+	 *					acquire(lock);
+	 * or:
+	 *
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 *					mark_rt_mutex_waiters(lock);
+	 *
+	 * cmpxchg(p, owner, 0) != owner
+	 *					enqueue_waiter();
+	 *					unlock(wait_lock);
+	 * lock(wait_lock);
+	 * wake waiter();
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 *					acquire(lock);
+	 */
+	return rt_mutex_cmpxchg(lock, owner, NULL);	/* false: caller must retry */
+}
+/* Fallback variants: no usable cmpxchg, or rt-mutex debugging enabled. */
+#else
+# define rt_mutex_cmpxchg(l,c,n)	(0)	/* the fast path never succeeds */
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	lock->owner = (struct task_struct *)
+			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ * Clears the owner, drops wait_lock and always reports success.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+	__releases(lock->wait_lock)
+{
+	lock->owner = NULL;
+	raw_spin_unlock(&lock->wait_lock);
+	return true;
+}
+#endif
+/*
+ * Ordering predicate for the waiter trees. Returns 1 when @left sorts
+ * before @right: numerically lower ->prio first and, for dl_prio()
+ * waiters, the earlier task deadline first. Returns 0 otherwise.
+ */
+static inline int
+rt_mutex_waiter_less(struct rt_mutex_waiter *left,
+		     struct rt_mutex_waiter *right)
+{
+	if (left->prio < right->prio)
+		return 1;
+
+	/*
+	 * If both waiters have dl_prio(), we check the deadlines of the
+	 * associated tasks.
+	 * If left waiter has a dl_prio(), and we didn't return 1 above,
+	 * then right waiter has a dl_prio() too.
+	 */
+	if (dl_prio(left->prio))
+		return (left->task->dl.deadline < right->task->dl.deadline);
+
+	return 0;
+}
+/*
+ * Insert @waiter into @lock's waiter rbtree, ordered by
+ * rt_mutex_waiter_less(), and update the cached leftmost node. A waiter
+ * that does not sort before an existing entry goes to its right.
+ */
+static void
+rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+	struct rb_node **link = &lock->waiters.rb_node;
+	struct rb_node *parent = NULL;
+	struct rt_mutex_waiter *entry;
+	int leftmost = 1;
+
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
+		if (rt_mutex_waiter_less(waiter, entry)) {
+			link = &parent->rb_left;
+		} else {
+			link = &parent->rb_right;
+			leftmost = 0;	/* went right at least once */
+		}
+	}
+
+	if (leftmost)
+		lock->waiters_leftmost = &waiter->tree_entry;
+
+	rb_link_node(&waiter->tree_entry, parent, link);
+	rb_insert_color(&waiter->tree_entry, &lock->waiters);
+}
+/*
+ * Remove @waiter from @lock's waiter rbtree; no-op when the node is not
+ * queued. Advances the cached leftmost node when @waiter was leftmost.
+ */
+static void
+rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+	if (RB_EMPTY_NODE(&waiter->tree_entry))
+		return;
+
+	if (lock->waiters_leftmost == &waiter->tree_entry)
+		lock->waiters_leftmost = rb_next(&waiter->tree_entry);
+
+	rb_erase(&waiter->tree_entry, &lock->waiters);
+	RB_CLEAR_NODE(&waiter->tree_entry);
+}
+/*
+ * Same as rt_mutex_enqueue(), but for @task's pi_waiters tree, linked
+ * through the waiter's pi_tree_entry.
+ */
+static void
+rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+	struct rb_node **link = &task->pi_waiters.rb_node;
+	struct rb_node *parent = NULL;
+	struct rt_mutex_waiter *entry;
+	int leftmost = 1;
+
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
+		if (rt_mutex_waiter_less(waiter, entry)) {
+			link = &parent->rb_left;
+		} else {
+			link = &parent->rb_right;
+			leftmost = 0;
+		}
+	}
+
+	if (leftmost)
+		task->pi_waiters_leftmost = &waiter->pi_tree_entry;
+
+	rb_link_node(&waiter->pi_tree_entry, parent, link);
+	rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+}
+/*
+ * Same as rt_mutex_dequeue(), but for @task's pi_waiters tree.
+ */
+static void
+rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+	if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+		return;
+
+	if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
+		task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
+
+	rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
+	RB_CLEAR_NODE(&waiter->pi_tree_entry);
+}
+
+/*
+ * Calculate task priority from the waiter tree priority
+ *
+ * Return task->normal_prio when the waiter tree is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+	if (likely(!task_has_pi_waiters(task)))
+		return task->normal_prio;
+
+	return min(task_top_pi_waiter(task)->prio,
+		   task->normal_prio);
+}
+/* Return the task of @task's top PI waiter, or NULL if it has none. */
+struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
+{
+	if (likely(!task_has_pi_waiters(task)))
+		return NULL;
+
+	return task_top_pi_waiter(task)->task;
+}
+
+/*
+ * Called by sched_setscheduler() to check whether the priority change
+ * is overruled by a possible priority boosting.
+ *
+ * Returns non-zero when the top PI waiter's task has a ->prio value
+ * less than or equal to @newprio, 0 otherwise (or without PI waiters).
+ */
+int rt_mutex_check_prio(struct task_struct *task, int newprio)
+{
+	if (!task_has_pi_waiters(task))
+		return 0;
+
+	return task_top_pi_waiter(task)->task->prio <= newprio;
+}
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+	int prio = rt_mutex_getprio(task);
+
+	/* dl_prio() priorities are always passed on, even if unchanged */
+	if (task->prio != prio || dl_prio(prio))
+		rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+	unsigned long irqflags;
+
+	/* __rt_mutex_adjust_prio() must run with task->pi_lock held */
+	raw_spin_lock_irqsave(&task->pi_lock, irqflags);
+	__rt_mutex_adjust_prio(task);
+	raw_spin_unlock_irqrestore(&task->pi_lock, irqflags);
+}
+
+/*
+ * Upper bound on the number of steps taken while walking the
+ * boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/* Return the lock @p is blocked on, or NULL when @p is not blocked. */
+static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+{
+	if (!p->pi_blocked_on)
+		return NULL;
+
+	return p->pi_blocked_on->lock;
+}
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ *
+ * @task:	the task owning the mutex (owner) for which a chain walk is
+ *		probably needed
+ * @deadlock_detect: do we have to carry out deadlock detection?
+ * @orig_lock:	the mutex (can be NULL if we are walking the chain to recheck
+ *		things for a task that has just got its priority adjusted, and
+ *		is waiting on a mutex)
+ * @next_lock:	the mutex on which the owner of @orig_lock was blocked before
+ *		we dropped its pi_lock. Is never dereferenced, only used for
+ *		comparison to detect lock chain changes.
+ * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
+ *		its priority to the mutex owner (can be NULL in the case
+ *		depicted above or if the top waiter is gone away and we are
+ *		actually deboosting the owner)
+ * @top_task:	the current top waiter
+ *
+ * Returns 0 or -EDEADLK.
+ */ +static int rt_mutex_adjust_prio_chain(struct task_struct *task, +				      int deadlock_detect, +				      struct rt_mutex *orig_lock, +				      struct rt_mutex *next_lock, +				      struct rt_mutex_waiter *orig_waiter, +				      struct task_struct *top_task) +{ +	struct rt_mutex *lock; +	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; +	int detect_deadlock, ret = 0, depth = 0; +	unsigned long flags; + +	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, +							 deadlock_detect); + +	/* +	 * The (de)boosting is a step by step approach with a lot of +	 * pitfalls. We want this to be preemptible and we want hold a +	 * maximum of two locks per step. So we have to check +	 * carefully whether things change under us. +	 */ + again: +	if (++depth > max_lock_depth) { +		static int prev_max; + +		/* +		 * Print this only once. If the admin changes the limit, +		 * print a new message when reaching the limit again. +		 */ +		if (prev_max != max_lock_depth) { +			prev_max = max_lock_depth; +			printk(KERN_WARNING "Maximum lock depth %d reached " +			       "task: %s (%d)\n", max_lock_depth, +			       top_task->comm, task_pid_nr(top_task)); +		} +		put_task_struct(task); + +		return -EDEADLK; +	} + retry: +	/* +	 * Task can not go away as we did a get_task() before ! +	 */ +	raw_spin_lock_irqsave(&task->pi_lock, flags); + +	waiter = task->pi_blocked_on; +	/* +	 * Check whether the end of the boosting chain has been +	 * reached or the state of the chain has changed while we +	 * dropped the locks. +	 */ +	if (!waiter) +		goto out_unlock_pi; + +	/* +	 * Check the orig_waiter state. After we dropped the locks, +	 * the previous owner of the lock might have released the lock. 
+	 */ +	if (orig_waiter && !rt_mutex_owner(orig_lock)) +		goto out_unlock_pi; + +	/* +	 * We dropped all locks after taking a refcount on @task, so +	 * the task might have moved on in the lock chain or even left +	 * the chain completely and blocks now on an unrelated lock or +	 * on @orig_lock. +	 * +	 * We stored the lock on which @task was blocked in @next_lock, +	 * so we can detect the chain change. +	 */ +	if (next_lock != waiter->lock) +		goto out_unlock_pi; + +	/* +	 * Drop out, when the task has no waiters. Note, +	 * top_waiter can be NULL, when we are in the deboosting +	 * mode! +	 */ +	if (top_waiter) { +		if (!task_has_pi_waiters(task)) +			goto out_unlock_pi; +		/* +		 * If deadlock detection is off, we stop here if we +		 * are not the top pi waiter of the task. +		 */ +		if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) +			goto out_unlock_pi; +	} + +	/* +	 * When deadlock detection is off then we check, if further +	 * priority adjustment is necessary. +	 */ +	if (!detect_deadlock && waiter->prio == task->prio) +		goto out_unlock_pi; + +	lock = waiter->lock; +	if (!raw_spin_trylock(&lock->wait_lock)) { +		raw_spin_unlock_irqrestore(&task->pi_lock, flags); +		cpu_relax(); +		goto retry; +	} + +	/* +	 * Deadlock detection. If the lock is the same as the original +	 * lock which caused us to walk the lock chain or if the +	 * current lock is owned by the task which initiated the chain +	 * walk, we detected a deadlock. 
+	 */ +	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { +		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); +		raw_spin_unlock(&lock->wait_lock); +		ret = -EDEADLK; +		goto out_unlock_pi; +	} + +	top_waiter = rt_mutex_top_waiter(lock); + +	/* Requeue the waiter */ +	rt_mutex_dequeue(lock, waiter); +	waiter->prio = task->prio; +	rt_mutex_enqueue(lock, waiter); + +	/* Release the task */ +	raw_spin_unlock_irqrestore(&task->pi_lock, flags); +	if (!rt_mutex_owner(lock)) { +		/* +		 * If the requeue above changed the top waiter, then we need +		 * to wake the new top waiter up to try to get the lock. +		 */ + +		if (top_waiter != rt_mutex_top_waiter(lock)) +			wake_up_process(rt_mutex_top_waiter(lock)->task); +		raw_spin_unlock(&lock->wait_lock); +		goto out_put_task; +	} +	put_task_struct(task); + +	/* Grab the next task */ +	task = rt_mutex_owner(lock); +	get_task_struct(task); +	raw_spin_lock_irqsave(&task->pi_lock, flags); + +	if (waiter == rt_mutex_top_waiter(lock)) { +		/* Boost the owner */ +		rt_mutex_dequeue_pi(task, top_waiter); +		rt_mutex_enqueue_pi(task, waiter); +		__rt_mutex_adjust_prio(task); + +	} else if (top_waiter == waiter) { +		/* Deboost the owner */ +		rt_mutex_dequeue_pi(task, waiter); +		waiter = rt_mutex_top_waiter(lock); +		rt_mutex_enqueue_pi(task, waiter); +		__rt_mutex_adjust_prio(task); +	} + +	/* +	 * Check whether the task which owns the current lock is pi +	 * blocked itself. If yes we store a pointer to the lock for +	 * the lock chain change detection above. After we dropped +	 * task->pi_lock next_lock cannot be dereferenced anymore. +	 */ +	next_lock = task_blocked_on_lock(task); + +	raw_spin_unlock_irqrestore(&task->pi_lock, flags); + +	top_waiter = rt_mutex_top_waiter(lock); +	raw_spin_unlock(&lock->wait_lock); + +	/* +	 * We reached the end of the lock chain. Stop right here. No +	 * point to go back just to figure that out. 
+	 */ +	if (!next_lock) +		goto out_put_task; + +	if (!detect_deadlock && waiter != top_waiter) +		goto out_put_task; + +	goto again; + + out_unlock_pi: +	raw_spin_unlock_irqrestore(&task->pi_lock, flags); + out_put_task: +	put_task_struct(task); + +	return ret; +} + +/* + * Try to take an rt-mutex + * + * Must be called with lock->wait_lock held. + * + * @lock:   the lock to be acquired. + * @task:   the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) + */ +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, +		struct rt_mutex_waiter *waiter) +{ +	/* +	 * We have to be careful here if the atomic speedups are +	 * enabled, such that, when +	 *  - no other waiter is on the lock +	 *  - the lock has been released since we did the cmpxchg +	 * the lock can be released or taken while we are doing the +	 * checks and marking the lock with RT_MUTEX_HAS_WAITERS. +	 * +	 * The atomic acquire/release aware variant of +	 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting +	 * the WAITERS bit, the atomic release / acquire can not +	 * happen anymore and lock->wait_lock protects us from the +	 * non-atomic case. +	 * +	 * Note, that this might set lock->owner = +	 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended +	 * any more. This is fixed up when we take the ownership. +	 * This is the transitional state explained at the top of this file. 
+	 */ +	mark_rt_mutex_waiters(lock); + +	if (rt_mutex_owner(lock)) +		return 0; + +	/* +	 * It will get the lock because of one of these conditions: +	 * 1) there is no waiter +	 * 2) higher priority than waiters +	 * 3) it is top waiter +	 */ +	if (rt_mutex_has_waiters(lock)) { +		if (task->prio >= rt_mutex_top_waiter(lock)->prio) { +			if (!waiter || waiter != rt_mutex_top_waiter(lock)) +				return 0; +		} +	} + +	if (waiter || rt_mutex_has_waiters(lock)) { +		unsigned long flags; +		struct rt_mutex_waiter *top; + +		raw_spin_lock_irqsave(&task->pi_lock, flags); + +		/* remove the queued waiter. */ +		if (waiter) { +			rt_mutex_dequeue(lock, waiter); +			task->pi_blocked_on = NULL; +		} + +		/* +		 * We have to enqueue the top waiter(if it exists) into +		 * task->pi_waiters list. +		 */ +		if (rt_mutex_has_waiters(lock)) { +			top = rt_mutex_top_waiter(lock); +			rt_mutex_enqueue_pi(task, top); +		} +		raw_spin_unlock_irqrestore(&task->pi_lock, flags); +	} + +	/* We got the lock. */ +	debug_rt_mutex_lock(lock); + +	rt_mutex_set_owner(lock, task); + +	rt_mutex_deadlock_account_lock(lock, task); + +	return 1; +} + +/* + * Task blocks on lock. + * + * Prepare waiter and propagate pi chain + * + * This must be called with lock->wait_lock held. + */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, +				   struct rt_mutex_waiter *waiter, +				   struct task_struct *task, +				   int detect_deadlock) +{ +	struct task_struct *owner = rt_mutex_owner(lock); +	struct rt_mutex_waiter *top_waiter = waiter; +	struct rt_mutex *next_lock; +	int chain_walk = 0, res; +	unsigned long flags; + +	/* +	 * Early deadlock detection. We really don't want the task to +	 * enqueue on itself just to untangle the mess later. It's not +	 * only an optimization. We drop the locks, so another waiter +	 * can come in before the chain walk detects the deadlock. 
So +	 * the other will detect the deadlock and return -EDEADLOCK, +	 * which is wrong, as the other waiter is not in a deadlock +	 * situation. +	 */ +	if (owner == task) +		return -EDEADLK; + +	raw_spin_lock_irqsave(&task->pi_lock, flags); +	__rt_mutex_adjust_prio(task); +	waiter->task = task; +	waiter->lock = lock; +	waiter->prio = task->prio; + +	/* Get the top priority waiter on the lock */ +	if (rt_mutex_has_waiters(lock)) +		top_waiter = rt_mutex_top_waiter(lock); +	rt_mutex_enqueue(lock, waiter); + +	task->pi_blocked_on = waiter; + +	raw_spin_unlock_irqrestore(&task->pi_lock, flags); + +	if (!owner) +		return 0; + +	raw_spin_lock_irqsave(&owner->pi_lock, flags); +	if (waiter == rt_mutex_top_waiter(lock)) { +		rt_mutex_dequeue_pi(owner, top_waiter); +		rt_mutex_enqueue_pi(owner, waiter); + +		__rt_mutex_adjust_prio(owner); +		if (owner->pi_blocked_on) +			chain_walk = 1; +	} else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { +		chain_walk = 1; +	} + +	/* Store the lock on which owner is blocked or NULL */ +	next_lock = task_blocked_on_lock(owner); + +	raw_spin_unlock_irqrestore(&owner->pi_lock, flags); +	/* +	 * Even if full deadlock detection is on, if the owner is not +	 * blocked itself, we can avoid finding this out in the chain +	 * walk. +	 */ +	if (!chain_walk || !next_lock) +		return 0; + +	/* +	 * The owner can't disappear while holding a lock, +	 * so the owner struct is protected by wait_lock. +	 * Gets dropped in rt_mutex_adjust_prio_chain()! +	 */ +	get_task_struct(owner); + +	raw_spin_unlock(&lock->wait_lock); + +	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, +					 next_lock, waiter, task); + +	raw_spin_lock(&lock->wait_lock); + +	return res; +} + +/* + * Wake up the next waiter on the lock. + * + * Remove the top waiter from the current tasks pi waiter list and + * wake it up. + * + * Called with lock->wait_lock held. 
+ */ +static void wakeup_next_waiter(struct rt_mutex *lock) +{ +	struct rt_mutex_waiter *waiter; +	unsigned long flags; + +	raw_spin_lock_irqsave(&current->pi_lock, flags); + +	waiter = rt_mutex_top_waiter(lock); + +	/* +	 * Remove it from current->pi_waiters. We do not adjust a +	 * possible priority boost right now. We execute wakeup in the +	 * boosted mode and go back to normal after releasing +	 * lock->wait_lock. +	 */ +	rt_mutex_dequeue_pi(current, waiter); + +	/* +	 * As we are waking up the top waiter, and the waiter stays +	 * queued on the lock until it gets the lock, this lock +	 * obviously has waiters. Just set the bit here and this has +	 * the added benefit of forcing all new tasks into the +	 * slow path making sure no task of lower priority than +	 * the top waiter can steal this lock. +	 */ +	lock->owner = (void *) RT_MUTEX_HAS_WAITERS; + +	raw_spin_unlock_irqrestore(&current->pi_lock, flags); + +	/* +	 * It's safe to dereference waiter as it cannot go away as +	 * long as we hold lock->wait_lock. The waiter task needs to +	 * acquire it in order to dequeue the waiter. +	 */ +	wake_up_process(waiter->task); +} + +/* + * Remove a waiter from a lock and give up + * + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). 
+ */ +static void remove_waiter(struct rt_mutex *lock, +			  struct rt_mutex_waiter *waiter) +{ +	int first = (waiter == rt_mutex_top_waiter(lock)); +	struct task_struct *owner = rt_mutex_owner(lock); +	struct rt_mutex *next_lock = NULL; +	unsigned long flags; + +	raw_spin_lock_irqsave(&current->pi_lock, flags); +	rt_mutex_dequeue(lock, waiter); +	current->pi_blocked_on = NULL; +	raw_spin_unlock_irqrestore(&current->pi_lock, flags); + +	if (!owner) +		return; + +	if (first) { + +		raw_spin_lock_irqsave(&owner->pi_lock, flags); + +		rt_mutex_dequeue_pi(owner, waiter); + +		if (rt_mutex_has_waiters(lock)) { +			struct rt_mutex_waiter *next; + +			next = rt_mutex_top_waiter(lock); +			rt_mutex_enqueue_pi(owner, next); +		} +		__rt_mutex_adjust_prio(owner); + +		/* Store the lock on which owner is blocked or NULL */ +		next_lock = task_blocked_on_lock(owner); + +		raw_spin_unlock_irqrestore(&owner->pi_lock, flags); +	} + +	if (!next_lock) +		return; + +	/* gets dropped in rt_mutex_adjust_prio_chain()! */ +	get_task_struct(owner); + +	raw_spin_unlock(&lock->wait_lock); + +	rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); + +	raw_spin_lock(&lock->wait_lock); +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void rt_mutex_adjust_pi(struct task_struct *task) +{ +	struct rt_mutex_waiter *waiter; +	struct rt_mutex *next_lock; +	unsigned long flags; + +	raw_spin_lock_irqsave(&task->pi_lock, flags); + +	waiter = task->pi_blocked_on; +	if (!waiter || (waiter->prio == task->prio && +			!dl_prio(task->prio))) { +		raw_spin_unlock_irqrestore(&task->pi_lock, flags); +		return; +	} +	next_lock = waiter->lock; +	raw_spin_unlock_irqrestore(&task->pi_lock, flags); + +	/* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ +	get_task_struct(task); + +	rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); +} + +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock:		 the rt_mutex to take + * @state:		 the state the task should block in (TASK_INTERRUPTIBLE + * 			 or TASK_UNINTERRUPTIBLE) + * @timeout:		 the pre-initialized and started timer, or NULL for none + * @waiter:		 the pre-initialized rt_mutex_waiter + * + * lock->wait_lock must be held by the caller. + */ +static int __sched +__rt_mutex_slowlock(struct rt_mutex *lock, int state, +		    struct hrtimer_sleeper *timeout, +		    struct rt_mutex_waiter *waiter) +{ +	int ret = 0; + +	for (;;) { +		/* Try to acquire the lock: */ +		if (try_to_take_rt_mutex(lock, current, waiter)) +			break; + +		/* +		 * TASK_INTERRUPTIBLE checks for signals and +		 * timeout. Ignored otherwise. +		 */ +		if (unlikely(state == TASK_INTERRUPTIBLE)) { +			/* Signal pending? */ +			if (signal_pending(current)) +				ret = -EINTR; +			if (timeout && !timeout->task) +				ret = -ETIMEDOUT; +			if (ret) +				break; +		} + +		raw_spin_unlock(&lock->wait_lock); + +		debug_rt_mutex_print_deadlock(waiter); + +		schedule_rt_mutex(lock); + +		raw_spin_lock(&lock->wait_lock); +		set_current_state(state); +	} + +	return ret; +} + +static void rt_mutex_handle_deadlock(int res, int detect_deadlock, +				     struct rt_mutex_waiter *w) +{ +	/* +	 * If the result is not -EDEADLOCK or the caller requested +	 * deadlock detection, nothing to do here. +	 */ +	if (res != -EDEADLOCK || detect_deadlock) +		return; + +	/* +	 * Yell lowdly and stop the task right here. 
+	 */ +	rt_mutex_print_deadlock(w); +	while (1) { +		set_current_state(TASK_INTERRUPTIBLE); +		schedule(); +	} +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, +		  struct hrtimer_sleeper *timeout, +		  int detect_deadlock) +{ +	struct rt_mutex_waiter waiter; +	int ret = 0; + +	debug_rt_mutex_init_waiter(&waiter); +	RB_CLEAR_NODE(&waiter.pi_tree_entry); +	RB_CLEAR_NODE(&waiter.tree_entry); + +	raw_spin_lock(&lock->wait_lock); + +	/* Try to acquire the lock again: */ +	if (try_to_take_rt_mutex(lock, current, NULL)) { +		raw_spin_unlock(&lock->wait_lock); +		return 0; +	} + +	set_current_state(state); + +	/* Setup the timer, when timeout != NULL */ +	if (unlikely(timeout)) { +		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); +		if (!hrtimer_active(&timeout->timer)) +			timeout->task = NULL; +	} + +	ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + +	if (likely(!ret)) +		ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); + +	set_current_state(TASK_RUNNING); + +	if (unlikely(ret)) { +		remove_waiter(lock, &waiter); +		rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); +	} + +	/* +	 * try_to_take_rt_mutex() sets the waiter bit +	 * unconditionally. We might have to fix that up. +	 */ +	fixup_rt_mutex_waiters(lock); + +	raw_spin_unlock(&lock->wait_lock); + +	/* Remove pending timer: */ +	if (unlikely(timeout)) +		hrtimer_cancel(&timeout->timer); + +	debug_rt_mutex_free_waiter(&waiter); + +	return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock) +{ +	int ret = 0; + +	raw_spin_lock(&lock->wait_lock); + +	if (likely(rt_mutex_owner(lock) != current)) { + +		ret = try_to_take_rt_mutex(lock, current, NULL); +		/* +		 * try_to_take_rt_mutex() sets the lock waiters +		 * bit unconditionally. Clean this up. 
+		 */ +		fixup_rt_mutex_waiters(lock); +	} + +	raw_spin_unlock(&lock->wait_lock); + +	return ret; +} + +/* + * Slow path to release a rt-mutex: + */ +static void __sched +rt_mutex_slowunlock(struct rt_mutex *lock) +{ +	raw_spin_lock(&lock->wait_lock); + +	debug_rt_mutex_unlock(lock); + +	rt_mutex_deadlock_account_unlock(current); + +	/* +	 * We must be careful here if the fast path is enabled. If we +	 * have no waiters queued we cannot set owner to NULL here +	 * because of: +	 * +	 * foo->lock->owner = NULL; +	 *			rtmutex_lock(foo->lock);   <- fast path +	 *			free = atomic_dec_and_test(foo->refcnt); +	 *			rtmutex_unlock(foo->lock); <- fast path +	 *			if (free) +	 *				kfree(foo); +	 * raw_spin_unlock(foo->lock->wait_lock); +	 * +	 * So for the fastpath enabled kernel: +	 * +	 * Nothing can set the waiters bit as long as we hold +	 * lock->wait_lock. So we do the following sequence: +	 * +	 *	owner = rt_mutex_owner(lock); +	 *	clear_rt_mutex_waiters(lock); +	 *	raw_spin_unlock(&lock->wait_lock); +	 *	if (cmpxchg(&lock->owner, owner, 0) == owner) +	 *		return; +	 *	goto retry; +	 * +	 * The fastpath disabled variant is simple as all access to +	 * lock->owner is serialized by lock->wait_lock: +	 * +	 *	lock->owner = NULL; +	 *	raw_spin_unlock(&lock->wait_lock); +	 */ +	while (!rt_mutex_has_waiters(lock)) { +		/* Drops lock->wait_lock ! */ +		if (unlock_rt_mutex_safe(lock) == true) +			return; +		/* Relock the rtmutex and try again */ +		raw_spin_lock(&lock->wait_lock); +	} + +	/* +	 * The wakeup next waiter path does not suffer from the above +	 * race. See the comments there. +	 */ +	wakeup_next_waiter(lock); + +	raw_spin_unlock(&lock->wait_lock); + +	/* Undo pi boosting if necessary: */ +	rt_mutex_adjust_prio(current); +} + +/* + * debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. 
+ */ +static inline int +rt_mutex_fastlock(struct rt_mutex *lock, int state, +		  int detect_deadlock, +		  int (*slowfn)(struct rt_mutex *lock, int state, +				struct hrtimer_sleeper *timeout, +				int detect_deadlock)) +{ +	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { +		rt_mutex_deadlock_account_lock(lock, current); +		return 0; +	} else +		return slowfn(lock, state, NULL, detect_deadlock); +} + +static inline int +rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, +			struct hrtimer_sleeper *timeout, int detect_deadlock, +			int (*slowfn)(struct rt_mutex *lock, int state, +				      struct hrtimer_sleeper *timeout, +				      int detect_deadlock)) +{ +	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { +		rt_mutex_deadlock_account_lock(lock, current); +		return 0; +	} else +		return slowfn(lock, state, timeout, detect_deadlock); +} + +static inline int +rt_mutex_fasttrylock(struct rt_mutex *lock, +		     int (*slowfn)(struct rt_mutex *lock)) +{ +	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { +		rt_mutex_deadlock_account_lock(lock, current); +		return 1; +	} +	return slowfn(lock); +} + +static inline void +rt_mutex_fastunlock(struct rt_mutex *lock, +		    void (*slowfn)(struct rt_mutex *lock)) +{ +	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) +		rt_mutex_deadlock_account_unlock(current); +	else +		slowfn(lock); +} + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ +	might_sleep(); + +	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: 		the rt_mutex to be locked + * @detect_deadlock:	deadlock detection on/off + * + * Returns: + *  0 		on success + * -EINTR 	when interrupted by a signal + * -EDEADLK	when the lock would deadlock (when deadlock detection is on) + */ 
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, +						 int detect_deadlock) +{ +	might_sleep(); + +	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, +				 detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_timed_lock - lock a rt_mutex interruptible + *			the timeout structure is provided + *			by the caller + * + * @lock: 		the rt_mutex to be locked + * @timeout:		timeout structure or NULL (no timeout) + * @detect_deadlock:	deadlock detection on/off + * + * Returns: + *  0 		on success + * -EINTR 	when interrupted by a signal + * -ETIMEDOUT	when the timeout expired + * -EDEADLK	when the lock would deadlock (when deadlock detection is on) + */ +int +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, +		    int detect_deadlock) +{ +	might_sleep(); + +	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, +				       detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock:	the rt_mutex to be locked + * + * Returns 1 on success and 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ +	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ +	rt_mutex_fastunlock(lock, rt_mutex_slowunlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/** + * rt_mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. 
+ */ +void rt_mutex_destroy(struct rt_mutex *lock) +{ +	WARN_ON(rt_mutex_is_locked(lock)); +#ifdef CONFIG_DEBUG_RT_MUTEXES +	lock->magic = NULL; +#endif +} + +EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +/** + * __rt_mutex_init - initialize the rt lock + * + * @lock: the rt lock to be initialized + * + * Initialize the rt lock to unlocked state. + * + * Initializing of a locked rt lock is not allowed + */ +void __rt_mutex_init(struct rt_mutex *lock, const char *name) +{ +	lock->owner = NULL; +	raw_spin_lock_init(&lock->wait_lock); +	lock->waiters = RB_ROOT; +	lock->waiters_leftmost = NULL; + +	debug_rt_mutex_init(lock, name); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + *				proxy owner + * + * @lock: 	the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_init_proxy_locked(struct rt_mutex *lock, +				struct task_struct *proxy_owner) +{ +	__rt_mutex_init(lock, NULL); +	debug_rt_mutex_proxy_lock(lock, proxy_owner); +	rt_mutex_set_owner(lock, proxy_owner); +	rt_mutex_deadlock_account_lock(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: 	the rt_mutex to be locked + * + * No locking. 
Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_proxy_unlock(struct rt_mutex *lock, +			   struct task_struct *proxy_owner) +{ +	debug_rt_mutex_proxy_unlock(lock); +	rt_mutex_set_owner(lock, NULL); +	rt_mutex_deadlock_account_unlock(proxy_owner); +} + +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock:		the rt_mutex to take + * @waiter:		the pre-initialized rt_mutex_waiter + * @task:		the task to prepare + * @detect_deadlock:	perform deadlock detection (1) or not (0) + * + * Returns: + *  0 - task blocked on lock + *  1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +			      struct rt_mutex_waiter *waiter, +			      struct task_struct *task, int detect_deadlock) +{ +	int ret; + +	raw_spin_lock(&lock->wait_lock); + +	if (try_to_take_rt_mutex(lock, task, NULL)) { +		raw_spin_unlock(&lock->wait_lock); +		return 1; +	} + +	/* We enforce deadlock detection for futexes */ +	ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); + +	if (ret && !rt_mutex_owner(lock)) { +		/* +		 * Reset the return value. We might have +		 * returned with -EDEADLK and the owner +		 * released the lock while we were walking the +		 * pi chain.  Let the waiter sort it out. +		 */ +		ret = 0; +	} + +	if (unlikely(ret)) +		remove_waiter(lock, waiter); + +	raw_spin_unlock(&lock->wait_lock); + +	debug_rt_mutex_print_deadlock(waiter); + +	return ret; +} + +/** + * rt_mutex_next_owner - return the next owner of the lock + * + * @lock: the rt lock query + * + * Returns the next owner of the lock or NULL + * + * Caller has to serialize against other accessors to the lock + * itself. 
+ * + * Special API call for PI-futex support + */ +struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) +{ +	if (!rt_mutex_has_waiters(lock)) +		return NULL; + +	return rt_mutex_top_waiter(lock)->task; +} + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock:		the rt_mutex we were woken on + * @to:			the timeout, null if none. hrtimer should already have + * 			been started. + * @waiter:		the pre-initialized rt_mutex_waiter + * @detect_deadlock:	perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + *  0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, +			       struct hrtimer_sleeper *to, +			       struct rt_mutex_waiter *waiter, +			       int detect_deadlock) +{ +	int ret; + +	raw_spin_lock(&lock->wait_lock); + +	set_current_state(TASK_INTERRUPTIBLE); + +	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + +	set_current_state(TASK_RUNNING); + +	if (unlikely(ret)) +		remove_waiter(lock, waiter); + +	/* +	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might +	 * have to fix that up. +	 */ +	fixup_rt_mutex_waiters(lock); + +	raw_spin_unlock(&lock->wait_lock); + +	return ret; +} diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h new file mode 100644 index 00000000000..f6a1f3c133b --- /dev/null +++ b/kernel/locking/rtmutex.h @@ -0,0 +1,31 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains macros used solely by rtmutex.c. + * Non-debug version. 
+ */ + +#define rt_mutex_deadlock_check(l)			(0) +#define rt_mutex_deadlock_account_lock(m, t)		do { } while (0) +#define rt_mutex_deadlock_account_unlock(l)		do { } while (0) +#define debug_rt_mutex_init_waiter(w)			do { } while (0) +#define debug_rt_mutex_free_waiter(w)			do { } while (0) +#define debug_rt_mutex_lock(l)				do { } while (0) +#define debug_rt_mutex_proxy_lock(l,p)			do { } while (0) +#define debug_rt_mutex_proxy_unlock(l)			do { } while (0) +#define debug_rt_mutex_unlock(l)			do { } while (0) +#define debug_rt_mutex_init(m, n)			do { } while (0) +#define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0) +#define debug_rt_mutex_print_deadlock(w)		do { } while (0) +#define debug_rt_mutex_detect_deadlock(w,d)		(d) +#define debug_rt_mutex_reset_waiter(w)			do { } while (0) + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ +	WARN(1, "rtmutex deadlock detected\n"); +} diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h new file mode 100644 index 00000000000..7431a9c86f3 --- /dev/null +++ b/kernel/locking/rtmutex_common.h @@ -0,0 +1,127 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains the private data structure and API definitions. + */ + +#ifndef __KERNEL_RTMUTEX_COMMON_H +#define __KERNEL_RTMUTEX_COMMON_H + +#include <linux/rtmutex.h> + +/* + * The rtmutex in kernel tester is independent of rtmutex debugging. We + * call schedule_rt_mutex_test() instead of schedule() for the tasks which + * belong to the tester. That way we can delay the wakeup path of those + * threads to provoke lock stealing and testing of  complex boosting scenarios. 
+ */ +#ifdef CONFIG_RT_MUTEX_TESTER + +extern void schedule_rt_mutex_test(struct rt_mutex *lock); + +#define schedule_rt_mutex(_lock)				\ +  do {								\ +	if (!(current->flags & PF_MUTEX_TESTER))		\ +		schedule();					\ +	else							\ +		schedule_rt_mutex_test(_lock);			\ +  } while (0) + +#else +# define schedule_rt_mutex(_lock)			schedule() +#endif + +/* + * This is the control structure for tasks blocked on a rt_mutex, + * which is allocated on the kernel stack on of the blocked task. + * + * @tree_entry:		pi node to enqueue into the mutex waiters tree + * @pi_tree_entry:	pi node to enqueue into the mutex owner waiters tree + * @task:		task reference to the blocked task + */ +struct rt_mutex_waiter { +	struct rb_node          tree_entry; +	struct rb_node          pi_tree_entry; +	struct task_struct	*task; +	struct rt_mutex		*lock; +#ifdef CONFIG_DEBUG_RT_MUTEXES +	unsigned long		ip; +	struct pid		*deadlock_task_pid; +	struct rt_mutex		*deadlock_lock; +#endif +	int prio; +}; + +/* + * Various helpers to access the waiters-tree: + */ +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ +	return !RB_EMPTY_ROOT(&lock->waiters); +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ +	struct rt_mutex_waiter *w; + +	w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, +		     tree_entry); +	BUG_ON(w->lock != lock); + +	return w; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ +	return !RB_EMPTY_ROOT(&p->pi_waiters); +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ +	return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, +			pi_tree_entry); +} + +/* + * lock->owner state tracking: + */ +#define RT_MUTEX_HAS_WAITERS	1UL +#define RT_MUTEX_OWNER_MASKALL	1UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ +	return (struct task_struct *) +		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); +} + +/* + * 
PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, +				       struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, +				  struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +				     struct rt_mutex_waiter *waiter, +				     struct task_struct *task, +				     int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, +				      struct hrtimer_sleeper *to, +				      struct rt_mutex_waiter *waiter, +				      int detect_deadlock); + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +#endif diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c new file mode 100644 index 00000000000..2c93571162c --- /dev/null +++ b/kernel/locking/rwsem-spinlock.c @@ -0,0 +1,296 @@ +/* rwsem-spinlock.c: R/W semaphores: contention handling functions for + * generic spinlock implementation + * + * Copyright (c) 2001   David Howells (dhowells@redhat.com). 
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> + * - Derived also from comments by Linus + */ +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/export.h> + +enum rwsem_waiter_type { +	RWSEM_WAITING_FOR_WRITE, +	RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { +	struct list_head list; +	struct task_struct *task; +	enum rwsem_waiter_type type; +}; + +int rwsem_is_locked(struct rw_semaphore *sem) +{ +	int ret = 1; +	unsigned long flags; + +	if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { +		ret = (sem->count != 0); +		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +	} +	return ret; +} +EXPORT_SYMBOL(rwsem_is_locked); + +/* + * initialise the semaphore + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, +		  struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held semaphore: +	 */ +	debug_check_no_locks_freed((void *)sem, sizeof(*sem)); +	lockdep_init_map(&sem->dep_map, name, key, 0); +#endif +	sem->count = 0; +	raw_spin_lock_init(&sem->wait_lock); +	INIT_LIST_HEAD(&sem->wait_list); +} +EXPORT_SYMBOL(__init_rwsem); + +/* + * handle the lock release when processes blocked on it can now run + * - if we come here, then: + *   - the 'active count' _reached_ zero + *   - the 'waiting count' is non-zero + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if wakewrite is non-zero + */ +static inline struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +{ +	struct rwsem_waiter *waiter; +	struct task_struct *tsk; +	int woken; + +	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + +	if (waiter->type == RWSEM_WAITING_FOR_WRITE) { +		if (wakewrite) +			/* Wake up a writer. Note that we do not grant it the +			 * lock - it will have to acquire it when it runs. 
*/ +			wake_up_process(waiter->task); +		goto out; +	} + +	/* grant an infinite number of read locks to the front of the queue */ +	woken = 0; +	do { +		struct list_head *next = waiter->list.next; + +		list_del(&waiter->list); +		tsk = waiter->task; +		smp_mb(); +		waiter->task = NULL; +		wake_up_process(tsk); +		put_task_struct(tsk); +		woken++; +		if (next == &sem->wait_list) +			break; +		waiter = list_entry(next, struct rwsem_waiter, list); +	} while (waiter->type != RWSEM_WAITING_FOR_WRITE); + +	sem->count += woken; + + out: +	return sem; +} + +/* + * wake a single writer + */ +static inline struct rw_semaphore * +__rwsem_wake_one_writer(struct rw_semaphore *sem) +{ +	struct rwsem_waiter *waiter; + +	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); +	wake_up_process(waiter->task); + +	return sem; +} + +/* + * get a read lock on the semaphore + */ +void __sched __down_read(struct rw_semaphore *sem) +{ +	struct rwsem_waiter waiter; +	struct task_struct *tsk; +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	if (sem->count >= 0 && list_empty(&sem->wait_list)) { +		/* granted */ +		sem->count++; +		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +		goto out; +	} + +	tsk = current; +	set_task_state(tsk, TASK_UNINTERRUPTIBLE); + +	/* set up my own style of waitqueue */ +	waiter.task = tsk; +	waiter.type = RWSEM_WAITING_FOR_READ; +	get_task_struct(tsk); + +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* we don't need to touch the semaphore struct anymore */ +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	/* wait to be given the lock */ +	for (;;) { +		if (!waiter.task) +			break; +		schedule(); +		set_task_state(tsk, TASK_UNINTERRUPTIBLE); +	} + +	tsk->state = TASK_RUNNING; + out: +	; +} + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int __down_read_trylock(struct rw_semaphore *sem) +{ +	unsigned long flags; +	int ret = 0; + + +	raw_spin_lock_irqsave(&sem->wait_lock, 
flags); + +	if (sem->count >= 0 && list_empty(&sem->wait_list)) { +		/* granted */ +		sem->count++; +		ret = 1; +	} + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return ret; +} + +/* + * get a write lock on the semaphore + */ +void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +{ +	struct rwsem_waiter waiter; +	struct task_struct *tsk; +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	/* set up my own style of waitqueue */ +	tsk = current; +	waiter.task = tsk; +	waiter.type = RWSEM_WAITING_FOR_WRITE; +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* wait for someone to release the lock */ +	for (;;) { +		/* +		 * This is the key to supporting write lock stealing: it lets a +		 * task already on a CPU take the lock right away rather than +		 * going to sleep and waiting for the system to wake up either +		 * it or whoever else is at the head of the wait list. +		 */ +		if (sem->count == 0) +			break; +		set_task_state(tsk, TASK_UNINTERRUPTIBLE); +		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +		schedule(); +		raw_spin_lock_irqsave(&sem->wait_lock, flags); +	} +	/* got the lock */ +	sem->count = -1; +	list_del(&waiter.list); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +void __sched __down_write(struct rw_semaphore *sem) +{ +	__down_write_nested(sem, 0); +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int __down_write_trylock(struct rw_semaphore *sem) +{ +	unsigned long flags; +	int ret = 0; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	if (sem->count == 0) { +		/* got the lock */ +		sem->count = -1; +		ret = 1; +	} + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return ret; +} + +/* + * release a read lock on the semaphore + */ +void __up_read(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	if (--sem->count == 0 && !list_empty(&sem->wait_list)) +		sem = 
__rwsem_wake_one_writer(sem); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * release a write lock on the semaphore + */ +void __up_write(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	sem->count = 0; +	if (!list_empty(&sem->wait_list)) +		sem = __rwsem_do_wake(sem, 1); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void __downgrade_write(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	sem->count = 1; +	if (!list_empty(&sem->wait_list)) +		sem = __rwsem_do_wake(sem, 0); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c new file mode 100644 index 00000000000..a2391ac135c --- /dev/null +++ b/kernel/locking/rwsem-xadd.c @@ -0,0 +1,513 @@ +/* rwsem.c: R/W semaphores: contention handling functions + * + * Written by David Howells (dhowells@redhat.com). + * Derived from arch/i386/kernel/semaphore.c + * + * Writer lock-stealing by Alex Shi <alex.shi@intel.com> + * and Michel Lespinasse <walken@google.com> + * + * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> + * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. + */ +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/sched/rt.h> + +#include "mcs_spinlock.h" + +/* + * Guide to the rw_semaphore's count field for common values. + * (32-bit case illustrated, similar for 64-bit) + * + * 0x0000000X	(1) X readers active or attempting lock, no writer waiting + *		    X = #active_readers + #readers attempting to lock + *		    (X*ACTIVE_BIAS) + * + * 0x00000000	rwsem is unlocked, and no one is waiting for the lock or + *		attempting to read lock or write lock. 
+ * + * 0xffff000X	(1) X readers active or attempting lock, with waiters for lock + *		    X = #active readers + # readers attempting lock + *		    (X*ACTIVE_BIAS + WAITING_BIAS) + *		(2) 1 writer attempting lock, no waiters for lock + *		    X-1 = #active readers + #readers attempting lock + *		    ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + *		(3) 1 writer active, no waiters for lock + *		    X-1 = #active readers + #readers attempting lock + *		    ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * + * 0xffff0001	(1) 1 reader active or attempting lock, waiters for lock + *		    (WAITING_BIAS + ACTIVE_BIAS) + *		(2) 1 writer active or attempting lock, no waiters for lock + *		    (ACTIVE_WRITE_BIAS) + * + * 0xffff0000	(1) There are writers or readers queued but none active + *		    or in the process of attempting lock. + *		    (WAITING_BIAS) + *		Note: writer can attempt to steal lock for this count by adding + *		ACTIVE_WRITE_BIAS in cmpxchg and checking the old count + * + * 0xfffe0001	(1) 1 writer active, or attempting lock. Waiters on queue. + *		    (ACTIVE_WRITE_BIAS + WAITING_BIAS) + * + * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking + *	 the count becomes more than 0 for successful lock acquisition, + *	 i.e. the case where there are only readers or nobody has lock. + *	 (1st and 2nd case above). + * + *	 Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and + *	 checking the count becomes ACTIVE_WRITE_BIAS for successful lock + *	 acquisition (i.e. nobody else has lock or attempts lock).  If + *	 unsuccessful, in rwsem_down_write_failed, we'll check to see if there + *	 are only waiters but none active (5th case above), and attempt to + *	 steal the lock. 
+ * + */ + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, +		  struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held semaphore: +	 */ +	debug_check_no_locks_freed((void *)sem, sizeof(*sem)); +	lockdep_init_map(&sem->dep_map, name, key, 0); +#endif +	sem->count = RWSEM_UNLOCKED_VALUE; +	raw_spin_lock_init(&sem->wait_lock); +	INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +	sem->owner = NULL; +	osq_lock_init(&sem->osq); +#endif +} + +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { +	RWSEM_WAITING_FOR_WRITE, +	RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { +	struct list_head list; +	struct task_struct *task; +	enum rwsem_waiter_type type; +}; + +enum rwsem_wake_type { +	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */ +	RWSEM_WAKE_READERS,	/* Wake readers only */ +	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */ +}; + +/* + * handle the lock release when processes blocked on it can now run + * - if we come here from up_xxxx(), then: + *   - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) + *   - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) + * - there must be someone on the queue + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if wake_type is RWSEM_WAKE_ANY + */ +static struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +{ +	struct rwsem_waiter *waiter; +	struct task_struct *tsk; +	struct list_head *next; +	long oldcount, woken, loop, adjustment; + +	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); +	if (waiter->type == RWSEM_WAITING_FOR_WRITE) { +		if (wake_type == RWSEM_WAKE_ANY) +			/* Wake writer at the front of the queue, but do not +			 * grant it the lock yet as we want 
other writers +			 * to be able to steal it.  Readers, on the other hand, +			 * will block as they will notice the queued writer. +			 */ +			wake_up_process(waiter->task); +		goto out; +	} + +	/* Writers might steal the lock before we grant it to the next reader. +	 * We prefer to do the first reader grant before counting readers +	 * so we can bail out early if a writer stole the lock. +	 */ +	adjustment = 0; +	if (wake_type != RWSEM_WAKE_READ_OWNED) { +		adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: +		oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; +		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { +			/* A writer stole the lock. Undo our reader grant. */ +			if (rwsem_atomic_update(-adjustment, sem) & +						RWSEM_ACTIVE_MASK) +				goto out; +			/* Last active locker left. Retry waking readers. */ +			goto try_reader_grant; +		} +	} + +	/* Grant an infinite number of read locks to the readers at the front +	 * of the queue.  Note we increment the 'active part' of the count by +	 * the number of readers before waking any processes up. 
+	 */ +	woken = 0; +	do { +		woken++; + +		if (waiter->list.next == &sem->wait_list) +			break; + +		waiter = list_entry(waiter->list.next, +					struct rwsem_waiter, list); + +	} while (waiter->type != RWSEM_WAITING_FOR_WRITE); + +	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; +	if (waiter->type != RWSEM_WAITING_FOR_WRITE) +		/* hit end of list above */ +		adjustment -= RWSEM_WAITING_BIAS; + +	if (adjustment) +		rwsem_atomic_add(adjustment, sem); + +	next = sem->wait_list.next; +	loop = woken; +	do { +		waiter = list_entry(next, struct rwsem_waiter, list); +		next = waiter->list.next; +		tsk = waiter->task; +		smp_mb(); +		waiter->task = NULL; +		wake_up_process(tsk); +		put_task_struct(tsk); +	} while (--loop); + +	sem->wait_list.next = next; +	next->prev = &sem->wait_list; + + out: +	return sem; +} + +/* + * Wait for the read lock to be granted + */ +__visible +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +{ +	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; +	struct rwsem_waiter waiter; +	struct task_struct *tsk = current; + +	/* set up my own style of waitqueue */ +	waiter.task = tsk; +	waiter.type = RWSEM_WAITING_FOR_READ; +	get_task_struct(tsk); + +	raw_spin_lock_irq(&sem->wait_lock); +	if (list_empty(&sem->wait_list)) +		adjustment += RWSEM_WAITING_BIAS; +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* we're now waiting on the lock, but no longer actively locking */ +	count = rwsem_atomic_update(adjustment, sem); + +	/* If there are no active locks, wake the front queued process(es). +	 * +	 * If there are no writers and we are first in the queue, +	 * wake our own waiter to join the existing active readers ! 
+	 */ +	if (count == RWSEM_WAITING_BIAS || +	    (count > RWSEM_WAITING_BIAS && +	     adjustment != -RWSEM_ACTIVE_READ_BIAS)) +		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + +	raw_spin_unlock_irq(&sem->wait_lock); + +	/* wait to be given the lock */ +	while (true) { +		set_task_state(tsk, TASK_UNINTERRUPTIBLE); +		if (!waiter.task) +			break; +		schedule(); +	} + +	tsk->state = TASK_RUNNING; + +	return sem; +} + +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ +	if (!(count & RWSEM_ACTIVE_MASK)) { +		/* try acquiring the write lock */ +		if (sem->count == RWSEM_WAITING_BIAS && +		    cmpxchg(&sem->count, RWSEM_WAITING_BIAS, +			    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { +			if (!list_is_singular(&sem->wait_list)) +				rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); +			return true; +		} +	} +	return false; +} + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ +	long old, count = ACCESS_ONCE(sem->count); + +	while (true) { +		if (!(count == 0 || count == RWSEM_WAITING_BIAS)) +			return false; + +		old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); +		if (old == count) +			return true; + +		count = old; +	} +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ +	struct task_struct *owner; +	bool on_cpu = false; + +	if (need_resched()) +		return false; + +	rcu_read_lock(); +	owner = ACCESS_ONCE(sem->owner); +	if (owner) +		on_cpu = owner->on_cpu; +	rcu_read_unlock(); + +	/* +	 * If sem->owner is not set, yet we have just recently entered the +	 * slowpath, then there is a possibility reader(s) may have the lock. +	 * To be safe, avoid spinning in these situations. 
+	 */ +	return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, +				 struct task_struct *owner) +{ +	if (sem->owner != owner) +		return false; + +	/* +	 * Ensure we emit the owner->on_cpu dereference _after_ checking +	 * that sem->owner still matches owner. If that check fails, owner +	 * might point to free()d memory; if it still matches, the +	 * rcu_read_lock() ensures the memory stays valid. +	 */ +	barrier(); + +	return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ +	rcu_read_lock(); +	while (owner_running(sem, owner)) { +		if (need_resched()) +			break; + +		arch_mutex_cpu_relax(); +	} +	rcu_read_unlock(); + +	/* +	 * We break out of the loop above on need_resched() or when the +	 * owner changed, which is a sign of heavy contention. Return +	 * success only when sem->owner is NULL. +	 */ +	return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ +	struct task_struct *owner; +	bool taken = false; + +	preempt_disable(); + +	/* sem->wait_lock should not be held when doing optimistic spinning */ +	if (!rwsem_can_spin_on_owner(sem)) +		goto done; + +	if (!osq_lock(&sem->osq)) +		goto done; + +	while (true) { +		owner = ACCESS_ONCE(sem->owner); +		if (owner && !rwsem_spin_on_owner(sem, owner)) +			break; + +		/* wait_lock will be acquired if write_lock is obtained */ +		if (rwsem_try_write_lock_unqueued(sem)) { +			taken = true; +			break; +		} + +		/* +		 * When there's no owner, we might have preempted between the +		 * owner acquiring the lock and setting the owner field. If +		 * we're an RT task that will live-lock because we won't let +		 * the owner complete. +		 */ +		if (!owner && (need_resched() || rt_task(current))) +			break; + +		/* +		 * The cpu_relax() call is a compiler barrier which forces +		 * everything in this loop to be re-loaded. 
We don't need +		 * memory barriers as we'll eventually observe the right +		 * values at the cost of a few extra spins. +		 */ +		arch_mutex_cpu_relax(); +	} +	osq_unlock(&sem->osq); +done: +	preempt_enable(); +	return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ +	return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock + */ +__visible +struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +{ +	long count; +	bool waiting = true; /* any queued threads before us */ +	struct rwsem_waiter waiter; + +	/* undo write bias from down_write operation, stop active locking */ +	count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + +	/* do optimistic spinning and steal lock if possible */ +	if (rwsem_optimistic_spin(sem)) +		return sem; + +	/* +	 * Optimistic spinning failed, proceed to the slowpath +	 * and block until we can acquire the sem. +	 */ +	waiter.task = current; +	waiter.type = RWSEM_WAITING_FOR_WRITE; + +	raw_spin_lock_irq(&sem->wait_lock); + +	/* account for this before adding a new element to the list */ +	if (list_empty(&sem->wait_list)) +		waiting = false; + +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* we're now waiting on the lock, but no longer actively locking */ +	if (waiting) { +		count = ACCESS_ONCE(sem->count); + +		/* +		 * If there were already threads queued before us and there are +		 * no active writers, the lock must be read owned; so we try to +		 * wake any read locks that were queued ahead of us. +		 */ +		if (count > RWSEM_WAITING_BIAS) +			sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + +	} else +		count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + +	/* wait until we successfully acquire the lock */ +	set_current_state(TASK_UNINTERRUPTIBLE); +	while (true) { +		if (rwsem_try_write_lock(count, sem)) +			break; +		raw_spin_unlock_irq(&sem->wait_lock); + +		/* Block until there are no active lockers. 
*/ +		do { +			schedule(); +			set_current_state(TASK_UNINTERRUPTIBLE); +		} while ((count = sem->count) & RWSEM_ACTIVE_MASK); + +		raw_spin_lock_irq(&sem->wait_lock); +	} +	__set_current_state(TASK_RUNNING); + +	list_del(&waiter.list); +	raw_spin_unlock_irq(&sem->wait_lock); + +	return sem; +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +__visible +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	/* do nothing if list empty */ +	if (!list_empty(&sem->wait_list)) +		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +__visible +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	/* do nothing if list empty */ +	if (!list_empty(&sem->wait_list)) +		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return sem; +} + +EXPORT_SYMBOL(rwsem_down_read_failed); +EXPORT_SYMBOL(rwsem_down_write_failed); +EXPORT_SYMBOL(rwsem_wake); +EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c new file mode 100644 index 00000000000..e2d3bc7f03b --- /dev/null +++ b/kernel/locking/rwsem.c @@ -0,0 +1,186 @@ +/* kernel/rwsem.c: R/W semaphores, public implementation + * + * Written by David Howells (dhowells@redhat.com). 
+ * Derived from asm-i386/semaphore.h + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/export.h> +#include <linux/rwsem.h> + +#include <linux/atomic.h> + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +	sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +	sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + +/* + * lock for reading + */ +void __sched down_read(struct rw_semaphore *sem) +{ +	might_sleep(); +	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + +	LOCK_CONTENDED(sem, __down_read_trylock, __down_read); +} + +EXPORT_SYMBOL(down_read); + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int down_read_trylock(struct rw_semaphore *sem) +{ +	int ret = __down_read_trylock(sem); + +	if (ret == 1) +		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); +	return ret; +} + +EXPORT_SYMBOL(down_read_trylock); + +/* + * lock for writing + */ +void __sched down_write(struct rw_semaphore *sem) +{ +	might_sleep(); +	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + +	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +	rwsem_set_owner(sem); +} + +EXPORT_SYMBOL(down_write); + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int down_write_trylock(struct rw_semaphore *sem) +{ +	int ret = __down_write_trylock(sem); + +	if (ret == 1) { +		rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); +		rwsem_set_owner(sem); +	} + +	return ret; +} + +EXPORT_SYMBOL(down_write_trylock); + +/* + * release a read lock + */ +void up_read(struct rw_semaphore *sem) +{ +	rwsem_release(&sem->dep_map, 1, _RET_IP_); + +	__up_read(sem); +} + +EXPORT_SYMBOL(up_read); + +/* + * release a write lock + */ +void up_write(struct rw_semaphore *sem) +{ +	
rwsem_release(&sem->dep_map, 1, _RET_IP_); + +	rwsem_clear_owner(sem); +	__up_write(sem); +} + +EXPORT_SYMBOL(up_write); + +/* + * downgrade write lock to read lock + */ +void downgrade_write(struct rw_semaphore *sem) +{ +	/* +	 * lockdep: a downgraded write will live on as a write +	 * dependency. +	 */ +	rwsem_clear_owner(sem); +	__downgrade_write(sem); +} + +EXPORT_SYMBOL(downgrade_write); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void down_read_nested(struct rw_semaphore *sem, int subclass) +{ +	might_sleep(); +	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + +	LOCK_CONTENDED(sem, __down_read_trylock, __down_read); +} + +EXPORT_SYMBOL(down_read_nested); + +void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) +{ +	might_sleep(); +	rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); + +	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +	rwsem_set_owner(sem); +} + +EXPORT_SYMBOL(_down_write_nest_lock); + +void down_read_non_owner(struct rw_semaphore *sem) +{ +	might_sleep(); + +	__down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + +void down_write_nested(struct rw_semaphore *sem, int subclass) +{ +	might_sleep(); +	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + +	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +	rwsem_set_owner(sem); +} + +EXPORT_SYMBOL(down_write_nested); + +void up_read_non_owner(struct rw_semaphore *sem) +{ +	__up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + +#endif + + diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c new file mode 100644 index 00000000000..6815171a4ff --- /dev/null +++ b/kernel/locking/semaphore.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox <willy@linux.intel.com> + * + * Distributed under the terms of the GNU GPL, version 2 + * + * This file implements counting semaphores. + * A counting semaphore may be acquired 'n' times before sleeping. 
+ * See mutex.c for single-acquisition sleeping locks which enforce + * rules which allow code to be debugged more easily. + */ + +/* + * Some notes on the implementation: + * + * The spinlock controls access to the other members of the semaphore. + * down_trylock() and up() can be called from interrupt context, so we + * have to disable interrupts when taking the lock.  It turns out various + * parts of the kernel expect to be able to use down() on a semaphore in + * interrupt context when they know it will succeed, so we have to use + * irqsave variants for down(), down_interruptible() and down_killable() + * too. + * + * The ->count variable represents how many more tasks can acquire this + * semaphore.  If it's zero, there may be tasks waiting on the wait_list. + */ + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> +#include <linux/ftrace.h> + +static noinline void __down(struct semaphore *sem); +static noinline int __down_interruptible(struct semaphore *sem); +static noinline int __down_killable(struct semaphore *sem); +static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline void __up(struct semaphore *sem); + +/** + * down - acquire the semaphore + * @sem: the semaphore to be acquired + * + * Acquires the semaphore.  If no more tasks are allowed to acquire the + * semaphore, calling this function will put the task to sleep until the + * semaphore is released. + * + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. 
+ */ +void down(struct semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->lock, flags); +	if (likely(sem->count > 0)) +		sem->count--; +	else +		__down(sem); +	raw_spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(down); + +/** + * down_interruptible - acquire the semaphore unless interrupted + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore.  If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a signal, this function will return -EINTR. + * If the semaphore is successfully acquired, this function returns 0. + */ +int down_interruptible(struct semaphore *sem) +{ +	unsigned long flags; +	int result = 0; + +	raw_spin_lock_irqsave(&sem->lock, flags); +	if (likely(sem->count > 0)) +		sem->count--; +	else +		result = __down_interruptible(sem); +	raw_spin_unlock_irqrestore(&sem->lock, flags); + +	return result; +} +EXPORT_SYMBOL(down_interruptible); + +/** + * down_killable - acquire the semaphore unless killed + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore.  If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a fatal signal, this function will return + * -EINTR.  If the semaphore is successfully acquired, this function returns + * 0. + */ +int down_killable(struct semaphore *sem) +{ +	unsigned long flags; +	int result = 0; + +	raw_spin_lock_irqsave(&sem->lock, flags); +	if (likely(sem->count > 0)) +		sem->count--; +	else +		result = __down_killable(sem); +	raw_spin_unlock_irqrestore(&sem->lock, flags); + +	return result; +} +EXPORT_SYMBOL(down_killable); + +/** + * down_trylock - try to acquire the semaphore, without waiting + * @sem: the semaphore to be acquired + * + * Try to acquire the semaphore atomically.  
Returns 0 if the semaphore has + * been acquired successfully or 1 if it cannot be acquired. + * + * NOTE: This return value is inverted from both spin_trylock and + * mutex_trylock!  Be careful about this when converting code. + * + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. + */ +int down_trylock(struct semaphore *sem) +{ +	unsigned long flags; +	int count; + +	raw_spin_lock_irqsave(&sem->lock, flags); +	count = sem->count - 1; +	if (likely(count >= 0)) +		sem->count = count; +	raw_spin_unlock_irqrestore(&sem->lock, flags); + +	return (count < 0); +} +EXPORT_SYMBOL(down_trylock); + +/** + * down_timeout - acquire the semaphore within a specified time + * @sem: the semaphore to be acquired + * @jiffies: how long to wait before failing + * + * Attempts to acquire the semaphore.  If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME.  It returns 0 if the semaphore was acquired. + */ +int down_timeout(struct semaphore *sem, long jiffies) +{ +	unsigned long flags; +	int result = 0; + +	raw_spin_lock_irqsave(&sem->lock, flags); +	if (likely(sem->count > 0)) +		sem->count--; +	else +		result = __down_timeout(sem, jiffies); +	raw_spin_unlock_irqrestore(&sem->lock, flags); + +	return result; +} +EXPORT_SYMBOL(down_timeout); + +/** + * up - release the semaphore + * @sem: the semaphore to release + * + * Release the semaphore.  Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). 
+ */ +void up(struct semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->lock, flags); +	if (likely(list_empty(&sem->wait_list))) +		sem->count++; +	else +		__up(sem); +	raw_spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(up); + +/* Functions for the contended case */ + +struct semaphore_waiter { +	struct list_head list; +	struct task_struct *task; +	bool up; +}; + +/* + * Because this function is inlined, the 'state' parameter will be + * constant, and thus optimised away by the compiler.  Likewise the + * 'timeout' parameter for the cases without timeouts. + */ +static inline int __sched __down_common(struct semaphore *sem, long state, +								long timeout) +{ +	struct task_struct *task = current; +	struct semaphore_waiter waiter; + +	list_add_tail(&waiter.list, &sem->wait_list); +	waiter.task = task; +	waiter.up = false; + +	for (;;) { +		if (signal_pending_state(state, task)) +			goto interrupted; +		if (unlikely(timeout <= 0)) +			goto timed_out; +		__set_task_state(task, state); +		raw_spin_unlock_irq(&sem->lock); +		timeout = schedule_timeout(timeout); +		raw_spin_lock_irq(&sem->lock); +		if (waiter.up) +			return 0; +	} + + timed_out: +	list_del(&waiter.list); +	return -ETIME; + + interrupted: +	list_del(&waiter.list); +	return -EINTR; +} + +static noinline void __sched __down(struct semaphore *sem) +{ +	__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_interruptible(struct semaphore *sem) +{ +	return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_killable(struct semaphore *sem) +{ +	return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +{ +	return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); +} + +static noinline void __sched __up(struct semaphore *sem) +{ +	struct semaphore_waiter *waiter = 
list_first_entry(&sem->wait_list, +						struct semaphore_waiter, list); +	list_del(&waiter->list); +	waiter->up = true; +	wake_up_process(waiter->task); +} diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c new file mode 100644 index 00000000000..4b082b5cac9 --- /dev/null +++ b/kernel/locking/spinlock.c @@ -0,0 +1,399 @@ +/* + * Copyright (2004) Linus Torvalds + * + * Author: Zwane Mwaikambo <zwane@fsmlabs.com> + * + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) + * + * Note that some architectures have special knowledge about the + * stack frames of these functions in their profile_pc. If you + * change anything significant here that could change the stack + * frame contact the architecture maintainers. + */ + +#include <linux/linkage.h> +#include <linux/preempt.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/debug_locks.h> +#include <linux/export.h> + +/* + * If lockdep is enabled then we use the non-preemption spin-ops + * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * not re-enabled during lock-acquire (which the preempt-spin-ops do): + */ +#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +/* + * The __lock_function inlines are taken from + * include/linux/spinlock_api_smp.h + */ +#else +#define raw_read_can_lock(l)	read_can_lock(l) +#define raw_write_can_lock(l)	write_can_lock(l) + +/* + * Some architectures can relax in favour of the CPU owning the lock. + */ +#ifndef arch_read_relax +# define arch_read_relax(l)	cpu_relax() +#endif +#ifndef arch_write_relax +# define arch_write_relax(l)	cpu_relax() +#endif +#ifndef arch_spin_relax +# define arch_spin_relax(l)	cpu_relax() +#endif + +/* + * We build the __lock_function inlines here. 
They are too large for + * inlining all over the place, but here is only one user per function + * which embeds them into the calling _lock_function below. + * + * This could be a long-held lock. We both prepare to spin for a long + * time (making _this_ CPU preemptable if possible), and we also signal + * towards that other CPU that it should break the lock ASAP. + */ +#define BUILD_LOCK_OPS(op, locktype)					\ +void __lockfunc __raw_##op##_lock(locktype##_t *lock)			\ +{									\ +	for (;;) {							\ +		preempt_disable();					\ +		if (likely(do_raw_##op##_trylock(lock)))		\ +			break;						\ +		preempt_enable();					\ +									\ +		if (!(lock)->break_lock)				\ +			(lock)->break_lock = 1;				\ +		while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ +			arch_##op##_relax(&lock->raw_lock);		\ +	}								\ +	(lock)->break_lock = 0;						\ +}									\ +									\ +unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock)	\ +{									\ +	unsigned long flags;						\ +									\ +	for (;;) {							\ +		preempt_disable();					\ +		local_irq_save(flags);					\ +		if (likely(do_raw_##op##_trylock(lock)))		\ +			break;						\ +		local_irq_restore(flags);				\ +		preempt_enable();					\ +									\ +		if (!(lock)->break_lock)				\ +			(lock)->break_lock = 1;				\ +		while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ +			arch_##op##_relax(&lock->raw_lock);		\ +	}								\ +	(lock)->break_lock = 0;						\ +	return flags;							\ +}									\ +									\ +void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock)		\ +{									\ +	_raw_##op##_lock_irqsave(lock);					\ +}									\ +									\ +void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)		\ +{									\ +	unsigned long flags;						\ +									\ +	/*							*/	\ +	/* Careful: we must exclude softirqs too, hence the	*/	\ +	/* irq-disabling. 
We use the generic preemption-aware	*/	\ +	/* function:						*/	\ +	/**/								\ +	flags = _raw_##op##_lock_irqsave(lock);				\ +	local_bh_disable();						\ +	local_irq_restore(flags);					\ +}									\ + +/* + * Build preemption-friendly versions of the following + * lock-spinning functions: + * + *         __[spin|read|write]_lock() + *         __[spin|read|write]_lock_irq() + *         __[spin|read|write]_lock_irqsave() + *         __[spin|read|write]_lock_bh() + */ +BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(read, rwlock); +BUILD_LOCK_OPS(write, rwlock); + +#endif + +#ifndef CONFIG_INLINE_SPIN_TRYLOCK +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +{ +	return __raw_spin_trylock(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock); +#endif + +#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +{ +	return __raw_spin_trylock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock_bh); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +{ +	__raw_spin_lock(lock); +} +EXPORT_SYMBOL(_raw_spin_lock); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +{ +	return __raw_spin_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +{ +	__raw_spin_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_BH +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +{ +	__raw_spin_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh); +#endif + +#ifdef CONFIG_UNINLINE_SPIN_UNLOCK +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +{ +	__raw_spin_unlock(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +{ +	
__raw_spin_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +{ +	__raw_spin_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +{ +	__raw_spin_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_TRYLOCK +int __lockfunc _raw_read_trylock(rwlock_t *lock) +{ +	return __raw_read_trylock(lock); +} +EXPORT_SYMBOL(_raw_read_trylock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK +void __lockfunc _raw_read_lock(rwlock_t *lock) +{ +	__raw_read_lock(lock); +} +EXPORT_SYMBOL(_raw_read_lock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE +unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) +{ +	return __raw_read_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_read_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQ +void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +{ +	__raw_read_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_read_lock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_BH +void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +{ +	__raw_read_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_read_lock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK +void __lockfunc _raw_read_unlock(rwlock_t *lock) +{ +	__raw_read_unlock(lock); +} +EXPORT_SYMBOL(_raw_read_unlock); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE +void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ +	__raw_read_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ +void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +{ +	__raw_read_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_read_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_BH +void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +{ +	
__raw_read_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_read_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_TRYLOCK +int __lockfunc _raw_write_trylock(rwlock_t *lock) +{ +	return __raw_write_trylock(lock); +} +EXPORT_SYMBOL(_raw_write_trylock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK +void __lockfunc _raw_write_lock(rwlock_t *lock) +{ +	__raw_write_lock(lock); +} +EXPORT_SYMBOL(_raw_write_lock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +{ +	return __raw_write_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ +void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +{ +	__raw_write_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_BH +void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +{ +	__raw_write_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_lock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK +void __lockfunc _raw_write_unlock(rwlock_t *lock) +{ +	__raw_write_unlock(lock); +} +EXPORT_SYMBOL(_raw_write_unlock); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE +void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ +	__raw_write_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ +void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +{ +	__raw_write_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH +void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +{ +	__raw_write_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_bh); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) +{ +	preempt_disable(); +	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} 
+EXPORT_SYMBOL(_raw_spin_lock_nested); + +unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, +						   int subclass) +{ +	unsigned long flags; + +	local_irq_save(flags); +	preempt_disable(); +	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +	LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, +				do_raw_spin_lock_flags, &flags); +	return flags; +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); + +void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, +				     struct lockdep_map *nest_lock) +{ +	preempt_disable(); +	spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); +	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nest_lock); + +#endif + +notrace int in_lock_functions(unsigned long addr) +{ +	/* Linker adds these: start and end of __lockfunc functions */ +	extern char __lock_text_start[], __lock_text_end[]; + +	return addr >= (unsigned long)__lock_text_start +	&& addr < (unsigned long)__lock_text_end; +} +EXPORT_SYMBOL(in_lock_functions); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c new file mode 100644 index 00000000000..0374a596cff --- /dev/null +++ b/kernel/locking/spinlock_debug.c @@ -0,0 +1,302 @@ +/* + * Copyright 2005, Red Hat, Inc., Ingo Molnar + * Released under the General Public License (GPL). + * + * This file contains the spinlock/rwlock implementations for + * DEBUG_SPINLOCK. 
+ */ + +#include <linux/spinlock.h> +#include <linux/nmi.h> +#include <linux/interrupt.h> +#include <linux/debug_locks.h> +#include <linux/delay.h> +#include <linux/export.h> + +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, +			  struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held lock: +	 */ +	debug_check_no_locks_freed((void *)lock, sizeof(*lock)); +	lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +	lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +	lock->magic = SPINLOCK_MAGIC; +	lock->owner = SPINLOCK_OWNER_INIT; +	lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__raw_spin_lock_init); + +void __rwlock_init(rwlock_t *lock, const char *name, +		   struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held lock: +	 */ +	debug_check_no_locks_freed((void *)lock, sizeof(*lock)); +	lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +	lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; +	lock->magic = RWLOCK_MAGIC; +	lock->owner = SPINLOCK_OWNER_INIT; +	lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__rwlock_init); + +static void spin_dump(raw_spinlock_t *lock, const char *msg) +{ +	struct task_struct *owner = NULL; + +	if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) +		owner = lock->owner; +	printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", +		msg, raw_smp_processor_id(), +		current->comm, task_pid_nr(current)); +	printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " +			".owner_cpu: %d\n", +		lock, lock->magic, +		owner ? owner->comm : "<none>", +		owner ? 
task_pid_nr(owner) : -1, +		lock->owner_cpu); +	dump_stack(); +} + +static void spin_bug(raw_spinlock_t *lock, const char *msg) +{ +	if (!debug_locks_off()) +		return; + +	spin_dump(lock, msg); +} + +#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) + +static inline void +debug_spin_lock_before(raw_spinlock_t *lock) +{ +	SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); +	SPIN_BUG_ON(lock->owner == current, lock, "recursion"); +	SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), +							lock, "cpu recursion"); +} + +static inline void debug_spin_lock_after(raw_spinlock_t *lock) +{ +	lock->owner_cpu = raw_smp_processor_id(); +	lock->owner = current; +} + +static inline void debug_spin_unlock(raw_spinlock_t *lock) +{ +	SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); +	SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); +	SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); +	SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), +							lock, "wrong CPU"); +	lock->owner = SPINLOCK_OWNER_INIT; +	lock->owner_cpu = -1; +} + +static void __spin_lock_debug(raw_spinlock_t *lock) +{ +	u64 i; +	u64 loops = loops_per_jiffy * HZ; + +	for (i = 0; i < loops; i++) { +		if (arch_spin_trylock(&lock->raw_lock)) +			return; +		__delay(1); +	} +	/* lockup suspected: */ +	spin_dump(lock, "lockup suspected"); +#ifdef CONFIG_SMP +	trigger_all_cpu_backtrace(); +#endif + +	/* +	 * The trylock above was causing a livelock.  Give the lower level arch +	 * specific lock code a chance to acquire the lock. We have already +	 * printed a warning/backtrace at this point. The non-debug arch +	 * specific code might actually succeed in acquiring the lock.  If it is +	 * not successful, the end-result is the same - there is no forward +	 * progress. 
+	 */ +	arch_spin_lock(&lock->raw_lock); +} + +void do_raw_spin_lock(raw_spinlock_t *lock) +{ +	debug_spin_lock_before(lock); +	if (unlikely(!arch_spin_trylock(&lock->raw_lock))) +		__spin_lock_debug(lock); +	debug_spin_lock_after(lock); +} + +int do_raw_spin_trylock(raw_spinlock_t *lock) +{ +	int ret = arch_spin_trylock(&lock->raw_lock); + +	if (ret) +		debug_spin_lock_after(lock); +#ifndef CONFIG_SMP +	/* +	 * Must not happen on UP: +	 */ +	SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif +	return ret; +} + +void do_raw_spin_unlock(raw_spinlock_t *lock) +{ +	debug_spin_unlock(lock); +	arch_spin_unlock(&lock->raw_lock); +} + +static void rwlock_bug(rwlock_t *lock, const char *msg) +{ +	if (!debug_locks_off()) +		return; + +	printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", +		msg, raw_smp_processor_id(), current->comm, +		task_pid_nr(current), lock); +	dump_stack(); +} + +#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) + +#if 0		/* __write_lock_debug() can lock up - maybe this can too? 
*/ +static void __read_lock_debug(rwlock_t *lock) +{ +	u64 i; +	u64 loops = loops_per_jiffy * HZ; +	int print_once = 1; + +	for (;;) { +		for (i = 0; i < loops; i++) { +			if (arch_read_trylock(&lock->raw_lock)) +				return; +			__delay(1); +		} +		/* lockup suspected: */ +		if (print_once) { +			print_once = 0; +			printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " +					"%s/%d, %p\n", +				raw_smp_processor_id(), current->comm, +				current->pid, lock); +			dump_stack(); +		} +	} +} +#endif + +void do_raw_read_lock(rwlock_t *lock) +{ +	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +	arch_read_lock(&lock->raw_lock); +} + +int do_raw_read_trylock(rwlock_t *lock) +{ +	int ret = arch_read_trylock(&lock->raw_lock); + +#ifndef CONFIG_SMP +	/* +	 * Must not happen on UP: +	 */ +	RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif +	return ret; +} + +void do_raw_read_unlock(rwlock_t *lock) +{ +	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +	arch_read_unlock(&lock->raw_lock); +} + +static inline void debug_write_lock_before(rwlock_t *lock) +{ +	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +	RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); +	RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), +							lock, "cpu recursion"); +} + +static inline void debug_write_lock_after(rwlock_t *lock) +{ +	lock->owner_cpu = raw_smp_processor_id(); +	lock->owner = current; +} + +static inline void debug_write_unlock(rwlock_t *lock) +{ +	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +	RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); +	RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), +							lock, "wrong CPU"); +	lock->owner = SPINLOCK_OWNER_INIT; +	lock->owner_cpu = -1; +} + +#if 0		/* This can cause lockups */ +static void __write_lock_debug(rwlock_t *lock) +{ +	u64 i; +	u64 loops = loops_per_jiffy * HZ; +	int print_once = 1; + +	for (;;) { +		for (i = 0; i < loops; i++) { +			
if (arch_write_trylock(&lock->raw_lock)) +				return; +			__delay(1); +		} +		/* lockup suspected: */ +		if (print_once) { +			print_once = 0; +			printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " +					"%s/%d, %p\n", +				raw_smp_processor_id(), current->comm, +				current->pid, lock); +			dump_stack(); +		} +	} +} +#endif + +void do_raw_write_lock(rwlock_t *lock) +{ +	debug_write_lock_before(lock); +	arch_write_lock(&lock->raw_lock); +	debug_write_lock_after(lock); +} + +int do_raw_write_trylock(rwlock_t *lock) +{ +	int ret = arch_write_trylock(&lock->raw_lock); + +	if (ret) +		debug_write_lock_after(lock); +#ifndef CONFIG_SMP +	/* +	 * Must not happen on UP: +	 */ +	RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif +	return ret; +} + +void do_raw_write_unlock(rwlock_t *lock) +{ +	debug_write_unlock(lock); +	arch_write_unlock(&lock->raw_lock); +}  | 
