Diffstat (limited to 'kernel/trace')
27 files changed, 4528 insertions, 1449 deletions
| diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fc382d6e276..015f85aaca0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -71,6 +71,7 @@ config TRACE_CLOCK  config RING_BUFFER  	bool  	select TRACE_CLOCK +	select IRQ_WORK  config FTRACE_NMI_ENTER         bool @@ -107,7 +108,6 @@ config TRACING  	select BINARY_PRINTF  	select EVENT_TRACING  	select TRACE_CLOCK -	select IRQ_WORK  config GENERIC_TRACER  	bool @@ -176,6 +176,8 @@ config IRQSOFF_TRACER  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP +	select TRACER_SNAPSHOT +	select TRACER_SNAPSHOT_PER_CPU_SWAP  	help  	  This option measures the time spent in irqs-off critical  	  sections, with microsecond accuracy. @@ -198,6 +200,8 @@ config PREEMPT_TRACER  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP +	select TRACER_SNAPSHOT +	select TRACER_SNAPSHOT_PER_CPU_SWAP  	help  	  This option measures the time spent in preemption-off critical  	  sections, with microsecond accuracy. @@ -217,6 +221,7 @@ config SCHED_TRACER  	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER  	select TRACER_MAX_TRACE +	select TRACER_SNAPSHOT  	help  	  This tracer tracks the latency of the highest priority task  	  to be scheduled in, starting from the point it has woken up. @@ -248,6 +253,27 @@ config TRACER_SNAPSHOT  	      echo 1 > /sys/kernel/debug/tracing/snapshot  	      cat snapshot +config TRACER_SNAPSHOT_PER_CPU_SWAP +        bool "Allow snapshot to swap per CPU" +	depends on TRACER_SNAPSHOT +	select RING_BUFFER_ALLOW_SWAP +	help +	  Allow doing a snapshot of a single CPU buffer instead of a +	  full swap (all buffers). If this is set, then the following is +	  allowed: + +	      echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + +	  After which, only the tracing buffer for CPU 2 was swapped with +	  the main tracing buffer, and the other CPU buffers remain the same. + +	  When this is enabled, this adds a little more overhead to the +	  trace recording, as it needs to add some checks to synchronize +	  recording with swaps. But this does not affect the performance +	  of the overall system. This is enabled by default when the preempt +	  or irq latency tracers are enabled, as those need to swap as well +	  and already adds the overhead (plus a lot more). +  config TRACE_BRANCH_PROFILING  	bool  	select GENERIC_TRACER @@ -524,6 +550,29 @@ config RING_BUFFER_BENCHMARK  	  If unsure, say N. +config RING_BUFFER_STARTUP_TEST +       bool "Ring buffer startup self test" +       depends on RING_BUFFER +       help +         Run a simple self test on the ring buffer on boot up. Late in the +	 kernel boot sequence, the test will start that kicks off +	 a thread per cpu. Each thread will write various size events +	 into the ring buffer. Another thread is created to send IPIs +	 to each of the threads, where the IPI handler will also write +	 to the ring buffer, to test/stress the nesting ability. +	 If any anomalies are discovered, a warning will be displayed +	 and all ring buffers will be disabled. + +	 The test runs for 10 seconds. This will slow your boot time +	 by at least 10 more seconds. + +	 At the end of the test, statics and more checks are done. +	 It will output the stats of each per cpu buffer. What +	 was written, the sizes, what was read, what was lost, and +	 other similar details. 
+ +	 If unsure, say N +  endif # FTRACE  endif # TRACING_SUPPORT diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e5b8c272ee..b8b8560bfb9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,  	bool blk_tracer = blk_tracer_enabled;  	if (blk_tracer) { -		buffer = blk_tr->buffer; +		buffer = blk_tr->trace_buffer.buffer;  		pc = preempt_count();  		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,  						  sizeof(*t) + len, @@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  	if (blk_tracer) {  		tracing_record_cmdline(current); -		buffer = blk_tr->buffer; +		buffer = blk_tr->trace_buffer.buffer;  		pc = preempt_count();  		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,  						  sizeof(*t) + pdu_len, @@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,  				      struct request_queue *q,  				      struct request *rq)  { -	struct blk_trace *bt = q->blk_trace; - -	/* if control ever passes through here, it's a request based driver */ -	if (unlikely(bt && !bt->rq_based)) -		bt->rq_based = true; -  	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);  } @@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,  	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);  } -static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) +static void blk_add_trace_bio_complete(void *ignore, +				       struct request_queue *q, struct bio *bio, +				       int error)  { -	struct request_queue *q; -	struct blk_trace *bt; - -	if (!bio->bi_bdev) -		return; - -	q = bdev_get_queue(bio->bi_bdev); -	bt = q->blk_trace; - -	/* -	 * Request based drivers will generate both rq and bio completions. -	 * Ignore bio ones. 
-	 */ -	if (likely(!bt) || bt->rq_based) -		return; -  	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);  } @@ -1828,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)  	rwbs[i] = '\0';  } +EXPORT_SYMBOL_GPL(blk_fill_rwbs);  #endif /* CONFIG_EVENT_TRACING */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6893d5a2bf0..b549b0f5b97 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -64,9 +64,16 @@  #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#ifdef CONFIG_DYNAMIC_FTRACE +#define INIT_REGEX_LOCK(opsname)	\ +	.regex_lock	= __MUTEX_INITIALIZER(opsname.regex_lock), +#else +#define INIT_REGEX_LOCK(opsname) +#endif +  static struct ftrace_ops ftrace_list_end __read_mostly = {  	.func		= ftrace_stub, -	.flags		= FTRACE_OPS_FL_RECURSION_SAFE, +	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,  };  /* ftrace_enabled is a method to turn ftrace on or off */ @@ -131,6 +138,16 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);  	while (likely(op = rcu_dereference_raw((op)->next)) &&	\  	       unlikely((op) != &ftrace_list_end)) +static inline void ftrace_ops_init(struct ftrace_ops *ops) +{ +#ifdef CONFIG_DYNAMIC_FTRACE +	if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { +		mutex_init(&ops->regex_lock); +		ops->flags |= FTRACE_OPS_FL_INITIALIZED; +	} +#endif +} +  /**   * ftrace_nr_registered_ops - return number of ops registered   * @@ -486,7 +503,6 @@ struct ftrace_profile_stat {  #define PROFILES_PER_PAGE					\  	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits __read_mostly;  static int ftrace_profile_enabled __read_mostly;  /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ @@ -494,7 +510,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);  static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)  static void *  function_stat_next(void *v, int idx) @@ -676,7 +693,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)  	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); -	for (i = 0; i < pages; i++) { +	for (i = 1; i < pages; i++) {  		pg->next = (void *)get_zeroed_page(GFP_KERNEL);  		if (!pg->next)  			goto out_free; @@ -694,7 +711,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)  		free_page(tmp);  	} -	free_page((unsigned long)stat->pages);  	stat->pages = NULL;  	stat->start = NULL; @@ -725,13 +741,6 @@ static int ftrace_profile_init_cpu(int cpu)  	if (!stat->hash)  		return -ENOMEM; -	if (!ftrace_profile_bits) { -		size--; - -		for (; size; size >>= 1) -			ftrace_profile_bits++; -	} -  	/* Preallocate the function profiling pages */  	if (ftrace_profile_pages_init(stat) < 0) {  		kfree(stat->hash); @@ -764,7 +773,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)  	struct hlist_head *hhd;  	unsigned long key; -	key = hash_long(ip, ftrace_profile_bits); +	key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);  	hhd = &stat->hash[key];  	if (hlist_empty(hhd)) @@ -783,7 +792,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,  {  	unsigned long key; -	key = hash_long(rec->ip, ftrace_profile_bits); +	key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);  	hlist_add_head_rcu(&rec->node, &stat->hash[key]);  } @@ -915,7 +924,8 @@ static void 
unregister_ftrace_profiler(void)  #else  static struct ftrace_ops ftrace_profile_ops __read_mostly = {  	.func		= function_profile_call, -	.flags		= FTRACE_OPS_FL_RECURSION_SAFE, +	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(ftrace_profile_ops)  };  static int register_ftrace_profiler(void) @@ -1053,6 +1063,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)  static struct pid * const ftrace_swapper_pid = &init_struct_pid; +loff_t +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) +{ +	loff_t ret; + +	if (file->f_mode & FMODE_READ) +		ret = seq_lseek(file, offset, whence); +	else +		file->f_pos = ret = 1; + +	return ret; +} +  #ifdef CONFIG_DYNAMIC_FTRACE  #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -1067,7 +1090,7 @@ struct ftrace_func_probe {  	unsigned long		flags;  	unsigned long		ip;  	void			*data; -	struct rcu_head		rcu; +	struct list_head	free_list;  };  struct ftrace_func_entry { @@ -1098,11 +1121,10 @@ static struct ftrace_ops global_ops = {  	.func			= ftrace_stub,  	.notrace_hash		= EMPTY_HASH,  	.filter_hash		= EMPTY_HASH, -	.flags			= FTRACE_OPS_FL_RECURSION_SAFE, +	.flags			= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(global_ops)  }; -static DEFINE_MUTEX(ftrace_regex_lock); -  struct ftrace_page {  	struct ftrace_page	*next;  	struct dyn_ftrace	*records; @@ -1242,6 +1264,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)  void ftrace_free_filter(struct ftrace_ops *ops)  { +	ftrace_ops_init(ops);  	free_ftrace_hash(ops->filter_hash);  	free_ftrace_hash(ops->notrace_hash);  } @@ -1317,7 +1340,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	struct hlist_head *hhd;  	struct ftrace_hash *old_hash;  	struct ftrace_hash *new_hash; -	unsigned long key;  	int size = src->count;  	int bits = 0;  	int ret; @@ -1360,10 +1382,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	for (i = 0; i < size; i++) {  		hhd = &src->buckets[i];  		hlist_for_each_entry_safe(entry, tn, hhd, hlist) { -			if (bits > 0) -				key = hash_long(entry->ip, bits); -			else -				key = 0;  			remove_hash_entry(src, entry);  			__add_hash_entry(new_hash, entry);  		} @@ -2441,7 +2459,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  		     !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||  		    ((iter->flags & FTRACE_ITER_ENABLED) && -		     !(rec->flags & ~FTRACE_FL_MASK))) { +		     !(rec->flags & FTRACE_FL_ENABLED))) {  			rec = NULL;  			goto retry; @@ -2613,7 +2631,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)   * routine, you can use ftrace_filter_write() for the write   * routine if @flag has FTRACE_ITER_FILTER set, or   * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and + * ftrace_filter_lseek() should be used as the lseek routine, and   * release must call ftrace_regex_release().   
*/  int @@ -2624,6 +2642,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,  	struct ftrace_hash *hash;  	int ret = 0; +	ftrace_ops_init(ops); +  	if (unlikely(ftrace_disabled))  		return -ENODEV; @@ -2636,28 +2656,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,  		return -ENOMEM;  	} +	iter->ops = ops; +	iter->flags = flag; + +	mutex_lock(&ops->regex_lock); +  	if (flag & FTRACE_ITER_NOTRACE)  		hash = ops->notrace_hash;  	else  		hash = ops->filter_hash; -	iter->ops = ops; -	iter->flags = flag; -  	if (file->f_mode & FMODE_WRITE) { -		mutex_lock(&ftrace_lock);  		iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); -		mutex_unlock(&ftrace_lock); -  		if (!iter->hash) {  			trace_parser_put(&iter->parser);  			kfree(iter); -			return -ENOMEM; +			ret = -ENOMEM; +			goto out_unlock;  		}  	} -	mutex_lock(&ftrace_regex_lock); -  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC))  		ftrace_filter_reset(iter->hash); @@ -2677,7 +2695,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,  		}  	} else  		file->private_data = iter; -	mutex_unlock(&ftrace_regex_lock); + + out_unlock: +	mutex_unlock(&ops->regex_lock);  	return ret;  } @@ -2697,19 +2717,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)  				 inode, file);  } -loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int whence) -{ -	loff_t ret; - -	if (file->f_mode & FMODE_READ) -		ret = seq_lseek(file, offset, whence); -	else -		file->f_pos = ret = 1; - -	return ret; -} -  static int ftrace_match(char *str, char *regex, int len, int type)  {  	int matched = 0; @@ -2923,6 +2930,8 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,  static struct ftrace_ops trace_probe_ops __read_mostly =  {  	.func		= function_trace_probe_call, +	.flags		= FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(trace_probe_ops)  };  static int ftrace_probe_registered; @@ -2932,8 +2941,12 @@ static void __enable_ftrace_function_probe(void)  	int ret;  	int i; -	if (ftrace_probe_registered) +	if (ftrace_probe_registered) { +		/* still need to update the function call sites */ +		if (ftrace_enabled) +			ftrace_run_update_code(FTRACE_UPDATE_CALLS);  		return; +	}  	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {  		struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -2974,28 +2987,27 @@ static void __disable_ftrace_function_probe(void)  } -static void ftrace_free_entry_rcu(struct rcu_head *rhp) +static void ftrace_free_entry(struct ftrace_func_probe *entry)  { -	struct ftrace_func_probe *entry = -		container_of(rhp, struct ftrace_func_probe, rcu); -  	if (entry->ops->free) -		entry->ops->free(&entry->data); +		entry->ops->free(entry->ops, entry->ip, &entry->data);  	kfree(entry);  } -  int  register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  			      void *data)  {  	struct ftrace_func_probe *entry; +	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; +	struct ftrace_hash *hash;  	struct ftrace_page *pg;  	struct dyn_ftrace *rec;  	int type, len, not;  	unsigned long key;  	int count = 0;  	char *search; +	int ret;  	type = filter_parse_regex(glob, strlen(glob), &search, ¬);  	len = strlen(search); @@ -3004,10 +3016,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	if (WARN_ON(not))  		return -EINVAL; -	mutex_lock(&ftrace_lock); +	mutex_lock(&trace_probe_ops.regex_lock); -	if (unlikely(ftrace_disabled)) -		goto out_unlock; +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	
if (!hash) { +		count = -ENOMEM; +		goto out; +	} + +	if (unlikely(ftrace_disabled)) { +		count = -ENODEV; +		goto out; +	} + +	mutex_lock(&ftrace_lock);  	do_for_each_ftrace_rec(pg, rec) { @@ -3031,14 +3053,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		 * for each function we find. We call the callback  		 * to give the caller an opportunity to do so.  		 */ -		if (ops->callback) { -			if (ops->callback(rec->ip, &entry->data) < 0) { +		if (ops->init) { +			if (ops->init(ops, rec->ip, &entry->data) < 0) {  				/* caller does not like this func */  				kfree(entry);  				continue;  			}  		} +		ret = enter_record(hash, rec, 0); +		if (ret < 0) { +			kfree(entry); +			count = ret; +			goto out_unlock; +		} +  		entry->ops = ops;  		entry->ip = rec->ip; @@ -3046,10 +3075,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);  	} while_for_each_ftrace_rec(); + +	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); +	if (ret < 0) +		count = ret; +  	__enable_ftrace_function_probe();   out_unlock:  	mutex_unlock(&ftrace_lock); + out: +	mutex_unlock(&trace_probe_ops.regex_lock); +	free_ftrace_hash(hash);  	return count;  } @@ -3063,7 +3100,12 @@ static void  __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  				  void *data, int flags)  { +	struct ftrace_func_entry *rec_entry;  	struct ftrace_func_probe *entry; +	struct ftrace_func_probe *p; +	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; +	struct list_head free_list; +	struct ftrace_hash *hash;  	struct hlist_node *tmp;  	char str[KSYM_SYMBOL_LEN];  	int type = MATCH_FULL; @@ -3083,7 +3125,15 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  			return;  	} -	mutex_lock(&ftrace_lock); +	mutex_lock(&trace_probe_ops.regex_lock); + +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (!hash) +		/* Hmm, should report this somehow */ +		goto out_unlock; + +	INIT_LIST_HEAD(&free_list); +  	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {  		struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -3104,12 +3154,32 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  					continue;  			} +			rec_entry = ftrace_lookup_ip(hash, entry->ip); +			/* It is possible more than one entry had this ip */ +			if (rec_entry) +				free_hash_entry(hash, rec_entry); +  			hlist_del_rcu(&entry->node); -			call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); +			list_add(&entry->free_list, &free_list);  		}  	} +	mutex_lock(&ftrace_lock);  	__disable_ftrace_function_probe(); +	/* +	 * Remove after the disable is called. Otherwise, if the last +	 * probe is removed, a null hash means *all enabled*. 
+	 */ +	ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); +	synchronize_sched(); +	list_for_each_entry_safe(entry, p, &free_list, free_list) { +		list_del(&entry->free_list); +		ftrace_free_entry(entry); +	}  	mutex_unlock(&ftrace_lock); +		 + out_unlock: +	mutex_unlock(&trace_probe_ops.regex_lock); +	free_ftrace_hash(hash);  }  void @@ -3218,18 +3288,17 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,  	if (!cnt)  		return 0; -	mutex_lock(&ftrace_regex_lock); - -	ret = -ENODEV; -	if (unlikely(ftrace_disabled)) -		goto out_unlock; -  	if (file->f_mode & FMODE_READ) {  		struct seq_file *m = file->private_data;  		iter = m->private;  	} else  		iter = file->private_data; +	if (unlikely(ftrace_disabled)) +		return -ENODEV; + +	/* iter->hash is a local copy, so we don't need regex_lock */ +  	parser = &iter->parser;  	read = trace_get_user(parser, ubuf, cnt, ppos); @@ -3238,14 +3307,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,  		ret = ftrace_process_regex(iter->hash, parser->buffer,  					   parser->idx, enable);  		trace_parser_clear(parser); -		if (ret) -			goto out_unlock; +		if (ret < 0) +			goto out;  	}  	ret = read; -out_unlock: -	mutex_unlock(&ftrace_regex_lock); - + out:  	return ret;  } @@ -3297,16 +3364,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,  	if (unlikely(ftrace_disabled))  		return -ENODEV; +	mutex_lock(&ops->regex_lock); +  	if (enable)  		orig_hash = &ops->filter_hash;  	else  		orig_hash = &ops->notrace_hash;  	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); -	if (!hash) -		return -ENOMEM; +	if (!hash) { +		ret = -ENOMEM; +		goto out_regex_unlock; +	} -	mutex_lock(&ftrace_regex_lock);  	if (reset)  		ftrace_filter_reset(hash);  	if (buf && !ftrace_match_records(hash, buf, len)) { @@ -3328,7 +3398,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,  	mutex_unlock(&ftrace_lock);   out_regex_unlock: -	mutex_unlock(&ftrace_regex_lock); +	mutex_unlock(&ops->regex_lock);  	free_ftrace_hash(hash);  	return ret; @@ -3354,6 +3424,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,  int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,  			 int remove, int reset)  { +	ftrace_ops_init(ops);  	return ftrace_set_addr(ops, ip, remove, reset, 1);  }  EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); @@ -3378,6 +3449,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,  int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,  		       int len, int reset)  { +	ftrace_ops_init(ops);  	return ftrace_set_regex(ops, buf, len, reset, 1);  }  EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -3396,6 +3468,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);  int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,  			int len, int reset)  { +	ftrace_ops_init(ops);  	return ftrace_set_regex(ops, buf, len, reset, 0);  }  EXPORT_SYMBOL_GPL(ftrace_set_notrace); @@ -3441,14 +3514,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;  static int __init set_ftrace_notrace(char *str)  { -	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); +	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);  	return 1;  }  __setup("ftrace_notrace=", set_ftrace_notrace);  static int __init set_ftrace_filter(char *str)  { -	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); +	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);  	return 1;  }  __setup("ftrace_filter=", set_ftrace_filter); @@ -3486,6 +3559,8 @@ 
ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)  {  	char *func; +	ftrace_ops_init(ops); +  	while (buf) {  		func = strsep(&buf, ",");  		ftrace_set_regex(ops, func, strlen(func), 0, enable); @@ -3513,10 +3588,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)  	int filter_hash;  	int ret; -	mutex_lock(&ftrace_regex_lock);  	if (file->f_mode & FMODE_READ) {  		iter = m->private; -  		seq_release(inode, file);  	} else  		iter = file->private_data; @@ -3529,6 +3602,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)  	trace_parser_put(parser); +	mutex_lock(&iter->ops->regex_lock); +  	if (file->f_mode & FMODE_WRITE) {  		filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); @@ -3546,10 +3621,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)  		mutex_unlock(&ftrace_lock);  	} + +	mutex_unlock(&iter->ops->regex_lock);  	free_ftrace_hash(iter->hash);  	kfree(iter); -	mutex_unlock(&ftrace_regex_lock);  	return 0;  } @@ -3571,7 +3647,7 @@ static const struct file_operations ftrace_filter_fops = {  	.open = ftrace_filter_open,  	.read = seq_read,  	.write = ftrace_filter_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -3579,7 +3655,7 @@ static const struct file_operations ftrace_notrace_fops = {  	.open = ftrace_notrace_open,  	.read = seq_read,  	.write = ftrace_notrace_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -3737,7 +3813,8 @@ out:  	if (fail)  		return -EINVAL; -	ftrace_graph_filter_enabled = 1; +	ftrace_graph_filter_enabled = !!(*idx); +  	return 0;  } @@ -3784,8 +3861,8 @@ static const struct file_operations ftrace_graph_fops = {  	.open		= ftrace_graph_open,  	.read		= seq_read,  	.write		= ftrace_graph_write, +	.llseek		= ftrace_filter_lseek,  	.release	= ftrace_graph_release, -	.llseek		= seq_lseek,  };  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4087,7 +4164,8 @@ void __init ftrace_init(void)  static struct ftrace_ops global_ops = {  	.func			= ftrace_stub, -	.flags			= FTRACE_OPS_FL_RECURSION_SAFE, +	.flags			= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(global_ops)  };  static int __init ftrace_nodyn_init(void) @@ -4131,7 +4209,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,  	preempt_disable_notrace();  	trace_recursion_set(TRACE_CONTROL_BIT);  	do_for_each_ftrace_op(op, ftrace_control_list) { -		if (!ftrace_function_local_disabled(op) && +		if (!(op->flags & FTRACE_OPS_FL_STUB) && +		    !ftrace_function_local_disabled(op) &&  		    ftrace_ops_test(op, ip))  			op->func(ip, parent_ip, op, regs);  	} while_for_each_ftrace_op(op); @@ -4140,8 +4219,9 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,  }  static struct ftrace_ops control_ops = { -	.func = ftrace_ops_control_func, -	.flags = FTRACE_OPS_FL_RECURSION_SAFE, +	.func	= ftrace_ops_control_func, +	.flags	= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(control_ops)  };  static inline void @@ -4439,7 +4519,7 @@ static const struct file_operations ftrace_pid_fops = {  	.open		= ftrace_pid_open,  	.write		= ftrace_pid_write,  	.read		= seq_read, -	.llseek		= seq_lseek, +	.llseek		= ftrace_filter_lseek,  	.release	= ftrace_pid_release,  }; @@ -4499,6 +4579,8 @@ int register_ftrace_function(struct ftrace_ops *ops)  {  	int ret = -1; +	ftrace_ops_init(ops); +  	mutex_lock(&ftrace_lock);  	ret = 
__register_ftrace_function(ops); @@ -4555,12 +4637,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  		ftrace_startup_sysctl();  		/* we are starting ftrace again */ -		if (ftrace_ops_list != &ftrace_list_end) { -			if (ftrace_ops_list->next == &ftrace_list_end) -				ftrace_trace_function = ftrace_ops_list->func; -			else -				ftrace_trace_function = ftrace_ops_list_func; -		} +		if (ftrace_ops_list != &ftrace_list_end) +			update_ftrace_function();  	} else {  		/* stopping ftrace calls (just send to ftrace_stub) */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6989df2ba19..b59aea2c48c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,13 +8,16 @@  #include <linux/trace_clock.h>  #include <linux/trace_seq.h>  #include <linux/spinlock.h> +#include <linux/irq_work.h>  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/hardirq.h> +#include <linux/kthread.h>	/* for self test */  #include <linux/kmemcheck.h>  #include <linux/module.h>  #include <linux/percpu.h>  #include <linux/mutex.h> +#include <linux/delay.h>  #include <linux/slab.h>  #include <linux/init.h>  #include <linux/hash.h> @@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)  	return ret;  } +struct rb_irq_work { +	struct irq_work			work; +	wait_queue_head_t		waiters; +	bool				waiters_pending; +}; +  /*   * head_page == tail_page && head == tail then buffer is empty.   */ @@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {  	struct list_head		new_pages; /* new pages to add */  	struct work_struct		update_pages_work;  	struct completion		update_done; + +	struct rb_irq_work		irq_work;  };  struct ring_buffer { @@ -497,6 +508,8 @@ struct ring_buffer {  	struct notifier_block		cpu_notify;  #endif  	u64				(*clock)(void); + +	struct rb_irq_work		irq_work;  };  struct ring_buffer_iter { @@ -508,6 +521,118 @@ struct ring_buffer_iter {  	u64				read_stamp;  }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ +	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + +	wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	DEFINE_WAIT(wait); +	struct rb_irq_work *work; + +	/* +	 * Depending on what the caller is waiting for, either any +	 * data in any cpu buffer, or a specific buffer, put the +	 * caller on the appropriate wait queue. +	 */ +	if (cpu == RING_BUFFER_ALL_CPUS) +		work = &buffer->irq_work; +	else { +		cpu_buffer = buffer->buffers[cpu]; +		work = &cpu_buffer->irq_work; +	} + + +	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + +	/* +	 * The events can happen in critical sections where +	 * checking a work queue can cause deadlocks. +	 * After adding a task to the queue, this flag is set +	 * only to notify events to try to wake up the queue +	 * using irq_work. +	 * +	 * We don't clear it even if the buffer is no longer +	 * empty. 
The flag only causes the next event to run +	 * irq_work to do the work queue wake up. The worse +	 * that can happen if we race with !trace_empty() is that +	 * an event will cause an irq_work to try to wake up +	 * an empty queue. +	 * +	 * There's no reason to protect this flag either, as +	 * the work queue and irq_work logic will do the necessary +	 * synchronization for the wake ups. The only thing +	 * that is necessary is that the wake up happens after +	 * a task has been queued. It's OK for spurious wake ups. +	 */ +	work->waiters_pending = true; + +	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) +		schedule(); + +	finish_wait(&work->waiters, &wait); +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +			  struct file *filp, poll_table *poll_table) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct rb_irq_work *work; + +	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) +		return POLLIN | POLLRDNORM; + +	if (cpu == RING_BUFFER_ALL_CPUS) +		work = &buffer->irq_work; +	else { +		cpu_buffer = buffer->buffers[cpu]; +		work = &cpu_buffer->irq_work; +	} + +	work->waiters_pending = true; +	poll_wait(filp, &work->waiters, poll_table); + +	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) +		return POLLIN | POLLRDNORM; +	return 0; +} +  /* buffer may be either ring_buffer or ring_buffer_per_cpu */  #define RB_WARN_ON(b, cond)						\  	({								\ @@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)  	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  	INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);  	init_completion(&cpu_buffer->update_done); +	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); +	init_waitqueue_head(&cpu_buffer->irq_work.waiters);  	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),  			    GFP_KERNEL, cpu_to_node(cpu)); @@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  	buffer->clock = trace_clock_local;  	buffer->reader_lock_key = key; +	init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); +	init_waitqueue_head(&buffer->irq_work.waiters); +  	/* need at least two pages */  	if (nr_pages < 2)  		nr_pages = 2; @@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  			if (!cpu_buffer->nr_pages_to_update)  				continue; -			if (cpu_online(cpu)) +			/* The update must run on the CPU that is being updated. */ +			preempt_disable(); +			if (cpu == smp_processor_id() || !cpu_online(cpu)) { +				rb_update_pages(cpu_buffer); +				cpu_buffer->nr_pages_to_update = 0; +			} else { +				/* +				 * Can not disable preemption for schedule_work_on() +				 * on PREEMPT_RT. 
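For context, a minimal reader sketch (not part of this patch) of how a kernel-side consumer might use the blocking ring_buffer_wait() added above together with the existing consume API; the thread function, the buffer argument and the fixed CPU number are illustrative assumptions only:

#include <linux/kthread.h>
#include <linux/ring_buffer.h>
#include <linux/printk.h>

/* Hypothetical consumer thread; 'arg' is assumed to be a buffer from ring_buffer_alloc(). */
static int example_rb_reader(void *arg)
{
	struct ring_buffer *buffer = arg;
	struct ring_buffer_event *event;
	unsigned long lost;
	int cpu = 0;	/* assumption: drain CPU 0 only */

	while (!kthread_should_stop()) {
		/* Sleeps until a writer's rb_wakeups() queues the irq_work for this cpu buffer */
		ring_buffer_wait(buffer, cpu);

		/* Drain whatever arrived while we slept */
		while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost)))
			pr_info("event: %u bytes, %lu events lost so far\n",
				ring_buffer_event_length(event), lost);
	}
	return 0;
}

ring_buffer_poll_wait() follows the same pattern for ->poll() handlers: it sets waiters_pending and lets the next commit fire the irq_work instead of blocking.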
+				 */ +				preempt_enable();  				schedule_work_on(cpu,  						&cpu_buffer->update_pages_work); -			else -				rb_update_pages(cpu_buffer); +				preempt_disable(); +			} +			preempt_enable();  		}  		/* wait for all the updates to complete */ @@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  		get_online_cpus(); -		if (cpu_online(cpu_id)) { +		preempt_disable(); +		/* The update must run on the CPU that is being updated. */ +		if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) +			rb_update_pages(cpu_buffer); +		else { +			/* +			 * Can not disable preemption for schedule_work_on() +			 * on PREEMPT_RT. +			 */ +			preempt_enable();  			schedule_work_on(cpu_id,  					 &cpu_buffer->update_pages_work);  			wait_for_completion(&cpu_buffer->update_done); -		} else -			rb_update_pages(cpu_buffer); +			preempt_disable(); +		} +		preempt_enable();  		cpu_buffer->nr_pages_to_update = 0;  		put_online_cpus(); @@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,  	rb_end_commit(cpu_buffer);  } +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ +	if (buffer->irq_work.waiters_pending) { +		buffer->irq_work.waiters_pending = false; +		/* irq_work_queue() supplies it's own memory barriers */ +		irq_work_queue(&buffer->irq_work.work); +	} + +	if (cpu_buffer->irq_work.waiters_pending) { +		cpu_buffer->irq_work.waiters_pending = false; +		/* irq_work_queue() supplies it's own memory barriers */ +		irq_work_queue(&cpu_buffer->irq_work.work); +	} +} +  /**   * ring_buffer_unlock_commit - commit a reserved   * @buffer: The buffer to commit to @@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,  	rb_commit(cpu_buffer, event); +	rb_wakeups(buffer, cpu_buffer); +  	trace_recursive_unlock();  	preempt_enable_notrace(); @@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,  	rb_commit(cpu_buffer, event); +	rb_wakeups(buffer, cpu_buffer); +  	ret = 0;   out:  	preempt_enable_notrace(); @@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,  	return NOTIFY_OK;  }  #endif + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. 
+ */ +static struct task_struct *rb_threads[NR_CPUS] __initdata; + +struct rb_test_data { +	struct ring_buffer	*buffer; +	unsigned long		events; +	unsigned long		bytes_written; +	unsigned long		bytes_alloc; +	unsigned long		bytes_dropped; +	unsigned long		events_nested; +	unsigned long		bytes_written_nested; +	unsigned long		bytes_alloc_nested; +	unsigned long		bytes_dropped_nested; +	int			min_size_nested; +	int			max_size_nested; +	int			max_size; +	int			min_size; +	int			cpu; +	int			cnt; +}; + +static struct rb_test_data rb_data[NR_CPUS] __initdata; + +/* 1 meg per cpu */ +#define RB_TEST_BUFFER_SIZE	1048576 + +static char rb_string[] __initdata = +	"abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" +	"?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" +	"!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; + +static bool rb_test_started __initdata; + +struct rb_item { +	int size; +	char str[]; +}; + +static __init int rb_write_something(struct rb_test_data *data, bool nested) +{ +	struct ring_buffer_event *event; +	struct rb_item *item; +	bool started; +	int event_len; +	int size; +	int len; +	int cnt; + +	/* Have nested writes different that what is written */ +	cnt = data->cnt + (nested ? 27 : 0); + +	/* Multiply cnt by ~e, to make some unique increment */ +	size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + +	len = size + sizeof(struct rb_item); + +	started = rb_test_started; +	/* read rb_test_started before checking buffer enabled */ +	smp_rmb(); + +	event = ring_buffer_lock_reserve(data->buffer, len); +	if (!event) { +		/* Ignore dropped events before test starts. */ +		if (started) { +			if (nested) +				data->bytes_dropped += len; +			else +				data->bytes_dropped_nested += len; +		} +		return len; +	} + +	event_len = ring_buffer_event_length(event); + +	if (RB_WARN_ON(data->buffer, event_len < len)) +		goto out; + +	item = ring_buffer_event_data(event); +	item->size = size; +	memcpy(item->str, rb_string, size); + +	if (nested) { +		data->bytes_alloc_nested += event_len; +		data->bytes_written_nested += len; +		data->events_nested++; +		if (!data->min_size_nested || len < data->min_size_nested) +			data->min_size_nested = len; +		if (len > data->max_size_nested) +			data->max_size_nested = len; +	} else { +		data->bytes_alloc += event_len; +		data->bytes_written += len; +		data->events++; +		if (!data->min_size || len < data->min_size) +			data->max_size = len; +		if (len > data->max_size) +			data->max_size = len; +	} + + out: +	ring_buffer_unlock_commit(data->buffer, event); + +	return 0; +} + +static __init int rb_test(void *arg) +{ +	struct rb_test_data *data = arg; + +	while (!kthread_should_stop()) { +		rb_write_something(data, false); +		data->cnt++; + +		set_current_state(TASK_INTERRUPTIBLE); +		/* Now sleep between a min of 100-300us and a max of 1ms */ +		usleep_range(((data->cnt % 3) + 1) * 100, 1000); +	} + +	return 0; +} + +static __init void rb_ipi(void *ignore) +{ +	struct rb_test_data *data; +	int cpu = smp_processor_id(); + +	data = &rb_data[cpu]; +	rb_write_something(data, true); +} + +static __init int rb_hammer_test(void *arg) +{ +	while (!kthread_should_stop()) { + +		/* Send an IPI to all cpus to write data! 
*/ +		smp_call_function(rb_ipi, NULL, 1); +		/* No sleep, but for non preempt, let others run */ +		schedule(); +	} + +	return 0; +} + +static __init int test_ringbuffer(void) +{ +	struct task_struct *rb_hammer; +	struct ring_buffer *buffer; +	int cpu; +	int ret = 0; + +	pr_info("Running ring buffer tests...\n"); + +	buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); +	if (WARN_ON(!buffer)) +		return 0; + +	/* Disable buffer so that threads can't write to it yet */ +	ring_buffer_record_off(buffer); + +	for_each_online_cpu(cpu) { +		rb_data[cpu].buffer = buffer; +		rb_data[cpu].cpu = cpu; +		rb_data[cpu].cnt = cpu; +		rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], +						 "rbtester/%d", cpu); +		if (WARN_ON(!rb_threads[cpu])) { +			pr_cont("FAILED\n"); +			ret = -1; +			goto out_free; +		} + +		kthread_bind(rb_threads[cpu], cpu); + 		wake_up_process(rb_threads[cpu]); +	} + +	/* Now create the rb hammer! */ +	rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); +	if (WARN_ON(!rb_hammer)) { +		pr_cont("FAILED\n"); +		ret = -1; +		goto out_free; +	} + +	ring_buffer_record_on(buffer); +	/* +	 * Show buffer is enabled before setting rb_test_started. +	 * Yes there's a small race window where events could be +	 * dropped and the thread wont catch it. But when a ring +	 * buffer gets enabled, there will always be some kind of +	 * delay before other CPUs see it. Thus, we don't care about +	 * those dropped events. We care about events dropped after +	 * the threads see that the buffer is active. +	 */ +	smp_wmb(); +	rb_test_started = true; + +	set_current_state(TASK_INTERRUPTIBLE); +	/* Just run for 10 seconds */; +	schedule_timeout(10 * HZ); + +	kthread_stop(rb_hammer); + + out_free: +	for_each_online_cpu(cpu) { +		if (!rb_threads[cpu]) +			break; +		kthread_stop(rb_threads[cpu]); +	} +	if (ret) { +		ring_buffer_free(buffer); +		return ret; +	} + +	/* Report! 
*/ +	pr_info("finished\n"); +	for_each_online_cpu(cpu) { +		struct ring_buffer_event *event; +		struct rb_test_data *data = &rb_data[cpu]; +		struct rb_item *item; +		unsigned long total_events; +		unsigned long total_dropped; +		unsigned long total_written; +		unsigned long total_alloc; +		unsigned long total_read = 0; +		unsigned long total_size = 0; +		unsigned long total_len = 0; +		unsigned long total_lost = 0; +		unsigned long lost; +		int big_event_size; +		int small_event_size; + +		ret = -1; + +		total_events = data->events + data->events_nested; +		total_written = data->bytes_written + data->bytes_written_nested; +		total_alloc = data->bytes_alloc + data->bytes_alloc_nested; +		total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + +		big_event_size = data->max_size + data->max_size_nested; +		small_event_size = data->min_size + data->min_size_nested; + +		pr_info("CPU %d:\n", cpu); +		pr_info("              events:    %ld\n", total_events); +		pr_info("       dropped bytes:    %ld\n", total_dropped); +		pr_info("       alloced bytes:    %ld\n", total_alloc); +		pr_info("       written bytes:    %ld\n", total_written); +		pr_info("       biggest event:    %d\n", big_event_size); +		pr_info("      smallest event:    %d\n", small_event_size); + +		if (RB_WARN_ON(buffer, total_dropped)) +			break; + +		ret = 0; + +		while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { +			total_lost += lost; +			item = ring_buffer_event_data(event); +			total_len += ring_buffer_event_length(event); +			total_size += item->size + sizeof(struct rb_item); +			if (memcmp(&item->str[0], rb_string, item->size) != 0) { +				pr_info("FAILED!\n"); +				pr_info("buffer had: %.*s\n", item->size, item->str); +				pr_info("expected:   %.*s\n", item->size, rb_string); +				RB_WARN_ON(buffer, 1); +				ret = -1; +				break; +			} +			total_read++; +		} +		if (ret) +			break; + +		ret = -1; + +		pr_info("         read events:   %ld\n", total_read); +		pr_info("         lost events:   %ld\n", total_lost); +		pr_info("        total events:   %ld\n", total_lost + total_read); +		pr_info("  recorded len bytes:   %ld\n", total_len); +		pr_info(" recorded size bytes:   %ld\n", total_size); +		if (total_lost) +			pr_info(" With dropped events, record len and size may not match\n" +				" alloced and written from above\n"); +		if (!total_lost) { +			if (RB_WARN_ON(buffer, total_len != total_alloc || +				       total_size != total_written)) +				break; +		} +		if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) +			break; + +		ret = 0; +	} +	if (!ret) +		pr_info("Ring buffer PASSED!\n"); + +	ring_buffer_free(buffer); +	return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade5698..ae6fa2d1cdf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,7 +1,7 @@  /*   * ring buffer based function tracer   * - * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>   * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>   *   * Originally taken from the RT patch by: @@ -19,7 +19,6 @@  #include <linux/seq_file.h>  #include <linux/notifier.h>  #include <linux/irqflags.h> -#include <linux/irq_work.h>  #include <linux/debugfs.h>  #include <linux/pagemap.h>  #include <linux/hardirq.h> @@ -48,7 +47,7 @@   * On boot up, the ring buffer is set to the minimum size, so that   * we do not waste memory on systems that 
are not using tracing.   */ -int ring_buffer_expanded; +bool ring_buffer_expanded;  /*   * We need to change this state when a selftest is running. @@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)  static DEFINE_PER_CPU(bool, trace_cmdline_save);  /* - * When a reader is waiting for data, then this variable is - * set to true. - */ -static bool trace_wakeup_needed; - -static struct irq_work trace_work_wakeup; - -/*   * Kill all tracing for good (never come back).   * It is initialized to 1 but will turn to zero if the initialization   * of the tracer is successful. But that is the only place that sets @@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);  static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;  static char *default_bootup_tracer; +static bool allocate_snapshot; +  static int __init set_cmdline_ftrace(char *str)  { -	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); +	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);  	default_bootup_tracer = bootup_tracer_buf;  	/* We are using ftrace early, expand it */ -	ring_buffer_expanded = 1; +	ring_buffer_expanded = true;  	return 1;  }  __setup("ftrace=", set_cmdline_ftrace); @@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str)  }  __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init boot_alloc_snapshot(char *str) +{ +	allocate_snapshot = true; +	/* We also need the main ring buffer expanded */ +	ring_buffer_expanded = true; +	return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); +  static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;  static char *trace_boot_options __initdata;  static int __init set_trace_boot_options(char *str)  { -	strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); +	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);  	trace_boot_options = trace_boot_options_buf;  	return 0;  } @@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)   */  static struct trace_array	global_trace; -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +LIST_HEAD(ftrace_trace_arrays);  int filter_current_check_discard(struct ring_buffer *buffer,  				 struct ftrace_event_call *call, void *rec, @@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)  	u64 ts;  	/* Early boot up does not have a buffer yet */ -	if (!global_trace.buffer) +	if (!global_trace.trace_buffer.buffer)  		return trace_clock_local(); -	ts = ring_buffer_time_stamp(global_trace.buffer, cpu); -	ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); +	ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); +	ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);  	return ts;  } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array	max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); -  int tracing_is_enabled(void)  {  	return tracing_is_on(); @@ -249,9 +237,6 @@ static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;  /* trace_types holds a link list of available tracers. 
*/  static struct tracer		*trace_types __read_mostly; -/* current_trace points to the tracer that is currently active */ -static struct tracer		*current_trace __read_mostly = &nop_trace; -  /*   * trace_types_lock is used to protect the trace_types list.   */ @@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);  static inline void trace_access_lock(int cpu)  { -	if (cpu == TRACE_PIPE_ALL_CPU) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		/* gain it for accessing the whole ring buffer. */  		down_write(&all_cpu_access_lock);  	} else {  		/* gain it for accessing a cpu ring buffer. */ -		/* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ +		/* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */  		down_read(&all_cpu_access_lock);  		/* Secondly block other access to this @cpu ring buffer. */ @@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)  static inline void trace_access_unlock(int cpu)  { -	if (cpu == TRACE_PIPE_ALL_CPU) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		up_write(&all_cpu_access_lock);  	} else {  		mutex_unlock(&per_cpu(cpu_access_lock, cpu)); @@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)  #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); -  /* trace_flags holds trace_options default values */  unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |  	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |  	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | -	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; - -static int trace_stop_count; -static DEFINE_RAW_SPINLOCK(tracing_start_lock); - -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -static void trace_wake_up(struct irq_work *work) -{ -	wake_up_all(&trace_wait); - -} +	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;  /**   * tracing_on - enable tracing buffers @@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)   */  void tracing_on(void)  { -	if (global_trace.buffer) -		ring_buffer_record_on(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		ring_buffer_record_on(global_trace.trace_buffer.buffer);  	/*  	 * This flag is only looked at when buffers haven't been  	 * allocated yet. We don't really care about the race @@ -385,6 +351,196 @@ void tracing_on(void)  EXPORT_SYMBOL_GPL(tracing_on);  /** + * __trace_puts - write a constant string into the trace buffer. + * @ip:	   The address of the caller + * @str:   The constant string to write + * @size:  The size of the string. 
+ */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct print_entry *entry; +	unsigned long irq_flags; +	int alloc; + +	alloc = sizeof(*entry) + size + 2; /* possible \n added */ + +	local_save_flags(irq_flags); +	buffer = global_trace.trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,  +					  irq_flags, preempt_count()); +	if (!event) +		return 0; + +	entry = ring_buffer_event_data(event); +	entry->ip = ip; + +	memcpy(&entry->buf, str, size); + +	/* Add a newline if necessary */ +	if (entry->buf[size - 1] != '\n') { +		entry->buf[size] = '\n'; +		entry->buf[size + 1] = '\0'; +	} else +		entry->buf[size] = '\0'; + +	__buffer_unlock_commit(buffer, event); + +	return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip:	   The address of the caller + * @str:   The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct bputs_entry *entry; +	unsigned long irq_flags; +	int size = sizeof(struct bputs_entry); + +	local_save_flags(irq_flags); +	buffer = global_trace.trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, +					  irq_flags, preempt_count()); +	if (!event) +		return 0; + +	entry = ring_buffer_event_data(event); +	entry->ip			= ip; +	entry->str			= str; + +	__buffer_unlock_commit(buffer, event); + +	return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ +	struct trace_array *tr = &global_trace; +	struct tracer *tracer = tr->current_trace; +	unsigned long flags; + +	if (in_nmi()) { +		internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); +		internal_trace_puts("*** snapshot is being ignored        ***\n"); +		return; +	} + +	if (!tr->allocated_snapshot) { +		internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); +		internal_trace_puts("*** stopping trace here!   
***\n"); +		tracing_off(); +		return; +	} + +	/* Note, snapshot can not be used when the tracer uses it */ +	if (tracer->use_max_tr) { +		internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); +		internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); +		return; +	} + +	local_irq_save(flags); +	update_max_tr(tr, current, smp_processor_id()); +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, +					struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ +	int ret; + +	if (!tr->allocated_snapshot) { + +		/* allocate spare buffer */ +		ret = resize_buffer_duplicate_size(&tr->max_buffer, +				   &tr->trace_buffer, RING_BUFFER_ALL_CPUS); +		if (ret < 0) +			return ret; + +		tr->allocated_snapshot = true; +	} + +	return 0; +} + +void free_snapshot(struct trace_array *tr) +{ +	/* +	 * We don't free the ring buffer. instead, resize it because +	 * The max_tr ring buffer has some state (e.g. ring->clock) and +	 * we want preserve it. +	 */ +	ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); +	set_buffer_entries(&tr->max_buffer, 1); +	tracing_reset_online_cpus(&tr->max_buffer); +	tr->allocated_snapshot = false; +} + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ +	struct trace_array *tr = &global_trace; +	int ret; + +	ret = alloc_snapshot(tr); +	if (WARN_ON(ret < 0)) +		return; + +	tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#else +void tracing_snapshot(void) +{ +	WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_alloc(void) +{ +	/* Give warning */ +	tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#endif /* CONFIG_TRACER_SNAPSHOT */ + +/**   * tracing_off - turn off tracing buffers   *   * This function stops the tracing buffers from recording data. @@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);   */  void tracing_off(void)  { -	if (global_trace.buffer) -		ring_buffer_record_off(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		ring_buffer_record_off(global_trace.trace_buffer.buffer);  	/*  	 * This flag is only looked at when buffers haven't been  	 * allocated yet. 
We don't really care about the race @@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);   */  int tracing_is_on(void)  { -	if (global_trace.buffer) -		return ring_buffer_record_is_on(global_trace.buffer); +	if (global_trace.trace_buffer.buffer) +		return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);  	return !global_trace.buffer_disabled;  }  EXPORT_SYMBOL_GPL(tracing_is_on); @@ -479,6 +635,7 @@ static const char *trace_options[] = {  	"disable_on_free",  	"irq-info",  	"markers", +	"function-trace",  	NULL  }; @@ -490,6 +647,8 @@ static struct {  	{ trace_clock_local,	"local",	1 },  	{ trace_clock_global,	"global",	1 },  	{ trace_clock_counter,	"counter",	0 }, +	{ trace_clock_jiffies,	"uptime",	1 }, +	{ trace_clock,		"perf",		1 },  	ARCH_TRACE_CLOCKS  }; @@ -670,13 +829,14 @@ unsigned long __read_mostly	tracing_max_latency;  static void  __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct trace_array_cpu *data = tr->data[cpu]; -	struct trace_array_cpu *max_data; +	struct trace_buffer *trace_buf = &tr->trace_buffer; +	struct trace_buffer *max_buf = &tr->max_buffer; +	struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); +	struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); -	max_tr.cpu = cpu; -	max_tr.time_start = data->preempt_timestamp; +	max_buf->cpu = cpu; +	max_buf->time_start = data->preempt_timestamp; -	max_data = max_tr.data[cpu];  	max_data->saved_latency = tracing_max_latency;  	max_data->critical_start = data->critical_start;  	max_data->critical_end = data->critical_end; @@ -706,22 +866,22 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  {  	struct ring_buffer *buf; -	if (trace_stop_count) +	if (tr->stop_count)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (!current_trace->allocated_snapshot) { +	if (!tr->allocated_snapshot) {  		/* Only the nop tracer should hit this when disabling */ -		WARN_ON_ONCE(current_trace != &nop_trace); +		WARN_ON_ONCE(tr->current_trace != &nop_trace);  		return;  	}  	arch_spin_lock(&ftrace_max_lock); -	buf = tr->buffer; -	tr->buffer = max_tr.buffer; -	max_tr.buffer = buf; +	buf = tr->trace_buffer.buffer; +	tr->trace_buffer.buffer = tr->max_buffer.buffer; +	tr->max_buffer.buffer = buf;  	__update_max_tr(tr, tsk, cpu);  	arch_spin_unlock(&ftrace_max_lock); @@ -740,16 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  {  	int ret; -	if (trace_stop_count) +	if (tr->stop_count)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) +	if (!tr->allocated_snapshot) { +		/* Only the nop tracer should hit this when disabling */ +		WARN_ON_ONCE(tr->current_trace != &nop_trace);  		return; +	}  	arch_spin_lock(&ftrace_max_lock); -	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); +	ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);  	if (ret == -EBUSY) {  		/* @@ -758,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  		 * the max trace buffer (no one writes directly to it)  		 * and flag that it failed.  		 
*/ -		trace_array_printk(&max_tr, _THIS_IP_, +		trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,  			"Failed to swap buffers due to commit in progress\n");  	} @@ -771,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  static void default_wait_pipe(struct trace_iterator *iter)  { -	DEFINE_WAIT(wait); +	/* Iterators are static, they should be filled or empty */ +	if (trace_buffer_iter(iter, iter->cpu_file)) +		return; + +	ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ +	struct trace_array *tr = &global_trace; +	struct tracer *saved_tracer = tr->current_trace; +	int ret; -	prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); +	if (!type->selftest || tracing_selftest_disabled) +		return 0;  	/* -	 * The events can happen in critical sections where -	 * checking a work queue can cause deadlocks. -	 * After adding a task to the queue, this flag is set -	 * only to notify events to try to wake up the queue -	 * using irq_work. -	 * -	 * We don't clear it even if the buffer is no longer -	 * empty. The flag only causes the next event to run -	 * irq_work to do the work queue wake up. The worse -	 * that can happen if we race with !trace_empty() is that -	 * an event will cause an irq_work to try to wake up -	 * an empty queue. -	 * -	 * There's no reason to protect this flag either, as -	 * the work queue and irq_work logic will do the necessary -	 * synchronization for the wake ups. The only thing -	 * that is necessary is that the wake up happens after -	 * a task has been queued. It's OK for spurious wake ups. +	 * Run a selftest on this tracer. +	 * Here we reset the trace buffer, and set the current +	 * tracer to be this tracer. The tracer can then run some +	 * internal tracing to verify that everything is in order. +	 * If we fail, we do not register this tracer.  	 */ -	trace_wakeup_needed = true; +	tracing_reset_online_cpus(&tr->trace_buffer); -	if (trace_empty(iter)) -		schedule(); +	tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (type->use_max_tr) { +		/* If we expanded the buffers, make sure the max is expanded too */ +		if (ring_buffer_expanded) +			ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, +					   RING_BUFFER_ALL_CPUS); +		tr->allocated_snapshot = true; +	} +#endif -	finish_wait(&trace_wait, &wait); +	/* the test is responsible for initializing and enabling */ +	pr_info("Testing tracer %s: ", type->name); +	ret = type->selftest(type, tr); +	/* the test is responsible for resetting too */ +	tr->current_trace = saved_tracer; +	if (ret) { +		printk(KERN_CONT "FAILED!\n"); +		/* Add the warning after printing 'FAILED' */ +		WARN_ON(1); +		return -1; +	} +	/* Only reset on passing, to avoid touching corrupted buffers */ +	tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (type->use_max_tr) { +		tr->allocated_snapshot = false; + +		/* Shrink the max buffer again */ +		if (ring_buffer_expanded) +			ring_buffer_resize(tr->max_buffer.buffer, 1, +					   RING_BUFFER_ALL_CPUS); +	} +#endif + +	printk(KERN_CONT "PASSED\n"); +	return 0;  } +#else +static inline int run_tracer_selftest(struct tracer *type) +{ +	return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */  /**   * register_tracer - register a tracer with the ftrace system. 
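For readers unfamiliar with the snapshot interface introduced earlier in this file, here is a minimal usage sketch. It is not part of this patch: the function names my_driver_trace_setup() and my_driver_saw_bad_latency() are hypothetical, and it assumes the tracing_snapshot()/tracing_snapshot_alloc() declarations exported above are visible through <linux/kernel.h>.

/* Hypothetical usage sketch -- not part of this patch. */
#include <linux/kernel.h>	/* tracing_snapshot(), tracing_snapshot_alloc() */

/*
 * Called once from process context, e.g. at module init, where the
 * snapshot buffer allocation is allowed to sleep.  This also takes a
 * first snapshot.
 */
void my_driver_trace_setup(void)
{
	tracing_snapshot_alloc();
}

/*
 * Called from the hot path when something suspicious is seen.  The live
 * buffer is swapped with the already allocated snapshot buffer; the
 * frozen data can then be read from /sys/kernel/debug/tracing/snapshot
 * while tracing continues in the other buffer.
 */
void my_driver_saw_bad_latency(void)
{
	tracing_snapshot();
}
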
@@ -848,57 +1052,9 @@ int register_tracer(struct tracer *type)  	if (!type->wait_pipe)  		type->wait_pipe = default_wait_pipe; - -#ifdef CONFIG_FTRACE_STARTUP_TEST -	if (type->selftest && !tracing_selftest_disabled) { -		struct tracer *saved_tracer = current_trace; -		struct trace_array *tr = &global_trace; - -		/* -		 * Run a selftest on this tracer. -		 * Here we reset the trace buffer, and set the current -		 * tracer to be this tracer. The tracer can then run some -		 * internal tracing to verify that everything is in order. -		 * If we fail, we do not register this tracer. -		 */ -		tracing_reset_online_cpus(tr); - -		current_trace = type; - -		if (type->use_max_tr) { -			/* If we expanded the buffers, make sure the max is expanded too */ -			if (ring_buffer_expanded) -				ring_buffer_resize(max_tr.buffer, trace_buf_size, -						   RING_BUFFER_ALL_CPUS); -			type->allocated_snapshot = true; -		} - -		/* the test is responsible for initializing and enabling */ -		pr_info("Testing tracer %s: ", type->name); -		ret = type->selftest(type, tr); -		/* the test is responsible for resetting too */ -		current_trace = saved_tracer; -		if (ret) { -			printk(KERN_CONT "FAILED!\n"); -			/* Add the warning after printing 'FAILED' */ -			WARN_ON(1); -			goto out; -		} -		/* Only reset on passing, to avoid touching corrupted buffers */ -		tracing_reset_online_cpus(tr); - -		if (type->use_max_tr) { -			type->allocated_snapshot = false; - -			/* Shrink the max buffer again */ -			if (ring_buffer_expanded) -				ring_buffer_resize(max_tr.buffer, 1, -						   RING_BUFFER_ALL_CPUS); -		} - -		printk(KERN_CONT "PASSED\n"); -	} -#endif +	ret = run_tracer_selftest(type); +	if (ret < 0) +		goto out;  	type->next = trace_types;  	trace_types = type; @@ -918,7 +1074,7 @@ int register_tracer(struct tracer *type)  	tracing_set_tracer(type->name);  	default_bootup_tracer = NULL;  	/* disable other selftests, since this will break it. 
*/ -	tracing_selftest_disabled = 1; +	tracing_selftest_disabled = true;  #ifdef CONFIG_FTRACE_STARTUP_TEST  	printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",  	       type->name); @@ -928,9 +1084,9 @@ int register_tracer(struct tracer *type)  	return ret;  } -void tracing_reset(struct trace_array *tr, int cpu) +void tracing_reset(struct trace_buffer *buf, int cpu)  { -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = buf->buffer;  	if (!buffer)  		return; @@ -944,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu)  	ring_buffer_record_enable(buffer);  } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf)  { -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = buf->buffer;  	int cpu;  	if (!buffer) @@ -957,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)  	/* Make sure all commits have finished */  	synchronize_sched(); -	tr->time_start = ftrace_now(tr->cpu); +	buf->time_start = ftrace_now(buf->cpu);  	for_each_online_cpu(cpu)  		ring_buffer_reset_cpu(buffer, cpu); @@ -967,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)  void tracing_reset_current(int cpu)  { -	tracing_reset(&global_trace, cpu); +	tracing_reset(&global_trace.trace_buffer, cpu);  } -void tracing_reset_current_online_cpus(void) +void tracing_reset_all_online_cpus(void)  { -	tracing_reset_online_cpus(&global_trace); +	struct trace_array *tr; + +	mutex_lock(&trace_types_lock); +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE +		tracing_reset_online_cpus(&tr->max_buffer); +#endif +	} +	mutex_unlock(&trace_types_lock);  }  #define SAVED_CMDLINES 128 @@ -995,7 +1160,7 @@ static void trace_init_cmdlines(void)  int is_tracing_stopped(void)  { -	return trace_stop_count; +	return global_trace.stop_count;  }  /** @@ -1027,12 +1192,12 @@ void tracing_start(void)  	if (tracing_disabled)  		return; -	raw_spin_lock_irqsave(&tracing_start_lock, flags); -	if (--trace_stop_count) { -		if (trace_stop_count < 0) { +	raw_spin_lock_irqsave(&global_trace.start_lock, flags); +	if (--global_trace.stop_count) { +		if (global_trace.stop_count < 0) {  			/* Someone screwed up their debugging */  			WARN_ON_ONCE(1); -			trace_stop_count = 0; +			global_trace.stop_count = 0;  		}  		goto out;  	} @@ -1040,19 +1205,52 @@ void tracing_start(void)  	/* Prevent the buffers from switching */  	arch_spin_lock(&ftrace_max_lock); -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	if (buffer)  		ring_buffer_record_enable(buffer); -	buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	buffer = global_trace.max_buffer.buffer;  	if (buffer)  		ring_buffer_record_enable(buffer); +#endif  	arch_spin_unlock(&ftrace_max_lock);  	ftrace_start();   out: -	raw_spin_unlock_irqrestore(&tracing_start_lock, flags); +	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ +	struct ring_buffer *buffer; +	unsigned long flags; + +	if (tracing_disabled) +		return; + +	/* If global, we need to also start the max tracer */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return tracing_start(); + +	raw_spin_lock_irqsave(&tr->start_lock, flags); + +	if (--tr->stop_count) { +		if (tr->stop_count < 0) { +			/* Someone screwed up their debugging */ +			WARN_ON_ONCE(1); +			tr->stop_count = 0; +		} +		goto out; +	} + +	
buffer = tr->trace_buffer.buffer; +	if (buffer) +		ring_buffer_record_enable(buffer); + + out: +	raw_spin_unlock_irqrestore(&tr->start_lock, flags);  }  /** @@ -1067,25 +1265,48 @@ void tracing_stop(void)  	unsigned long flags;  	ftrace_stop(); -	raw_spin_lock_irqsave(&tracing_start_lock, flags); -	if (trace_stop_count++) +	raw_spin_lock_irqsave(&global_trace.start_lock, flags); +	if (global_trace.stop_count++)  		goto out;  	/* Prevent the buffers from switching */  	arch_spin_lock(&ftrace_max_lock); -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	if (buffer)  		ring_buffer_record_disable(buffer); -	buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	buffer = global_trace.max_buffer.buffer;  	if (buffer)  		ring_buffer_record_disable(buffer); +#endif  	arch_spin_unlock(&ftrace_max_lock);   out: -	raw_spin_unlock_irqrestore(&tracing_start_lock, flags); +	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ +	struct ring_buffer *buffer; +	unsigned long flags; + +	/* If global, we need to also stop the max tracer */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return tracing_stop(); + +	raw_spin_lock_irqsave(&tr->start_lock, flags); +	if (tr->stop_count++) +		goto out; + +	buffer = tr->trace_buffer.buffer; +	if (buffer) +		ring_buffer_record_disable(buffer); + + out: +	raw_spin_unlock_irqrestore(&tr->start_lock, flags);  }  void trace_stop_cmdline_recording(void); @@ -1218,11 +1439,6 @@ void  __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)  {  	__this_cpu_write(trace_cmdline_save, true); -	if (trace_wakeup_needed) { -		trace_wakeup_needed = false; -		/* irq_work_queue() supplies it's own memory barriers */ -		irq_work_queue(&trace_work_wakeup); -	}  	ring_buffer_unlock_commit(buffer, event);  } @@ -1246,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,  EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);  struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, +			  struct ftrace_event_file *ftrace_file, +			  int type, unsigned long len, +			  unsigned long flags, int pc) +{ +	*current_rb = ftrace_file->tr->trace_buffer.buffer; +	return trace_buffer_lock_reserve(*current_rb, +					 type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + +struct ring_buffer_event *  trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,  				  int type, unsigned long len,  				  unsigned long flags, int pc)  { -	*current_rb = global_trace.buffer; +	*current_rb = global_trace.trace_buffer.buffer;  	return trace_buffer_lock_reserve(*current_rb,  					 type, len, flags, pc);  } @@ -1289,7 +1517,7 @@ trace_function(struct trace_array *tr,  	       int pc)  {  	struct ftrace_event_call *call = &event_function; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct ftrace_entry *entry; @@ -1430,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,  void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,  		   int pc)  { -	__ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); +	__ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);  }  /**   * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers)   */ -void trace_dump_stack(void) +void trace_dump_stack(int skip)  
{  	unsigned long flags; @@ -1445,8 +1674,13 @@ void trace_dump_stack(void)  	local_save_flags(flags); -	/* skipping 3 traces, seems to get us at the caller of this function */ -	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); +	/* +	 * Skip 3 more, seems to get us at the caller of +	 * this function. +	 */ +	skip += 3; +	__ftrace_trace_stack(global_trace.trace_buffer.buffer, +			     flags, skip, preempt_count(), NULL);  }  static DEFINE_PER_CPU(int, user_stack_count); @@ -1616,7 +1850,7 @@ void trace_printk_init_buffers(void)  	 * directly here. If the global_trace.buffer is already  	 * allocated here, then this was called by module code.  	 */ -	if (global_trace.buffer) +	if (global_trace.trace_buffer.buffer)  		tracing_start_cmdline_record();  } @@ -1676,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	local_save_flags(flags);  	size = sizeof(*entry) + sizeof(u32) * len; -	buffer = tr->buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,  					  flags, pc);  	if (!event) @@ -1699,27 +1933,12 @@ out:  }  EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, -		       unsigned long ip, const char *fmt, ...) -{ -	int ret; -	va_list ap; - -	if (!(trace_flags & TRACE_ITER_PRINTK)) -		return 0; - -	va_start(ap, fmt); -	ret = trace_array_vprintk(tr, ip, fmt, ap); -	va_end(ap); -	return ret; -} - -int trace_array_vprintk(struct trace_array *tr, -			unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, +		      unsigned long ip, const char *fmt, va_list args)  {  	struct ftrace_event_call *call = &event_print;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer;  	int len = 0, size, pc;  	struct print_entry *entry;  	unsigned long flags; @@ -1747,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr,  	local_save_flags(flags);  	size = sizeof(*entry) + len + 1; -	buffer = tr->buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,  					  flags, pc);  	if (!event) @@ -1768,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr,  	return len;  } +int trace_array_vprintk(struct trace_array *tr, +			unsigned long ip, const char *fmt, va_list args) +{ +	return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, +		       unsigned long ip, const char *fmt, ...) +{ +	int ret; +	va_list ap; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	va_start(ap, fmt); +	ret = trace_array_vprintk(tr, ip, fmt, ap); +	va_end(ap); +	return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, +			   unsigned long ip, const char *fmt, ...) 
+{ +	int ret; +	va_list ap; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	va_start(ap, fmt); +	ret = __trace_array_vprintk(buffer, ip, fmt, ap); +	va_end(ap); +	return ret; +} +  int trace_vprintk(unsigned long ip, const char *fmt, va_list args)  {  	return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1793,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,  	if (buf_iter)  		event = ring_buffer_iter_peek(buf_iter, ts);  	else -		event = ring_buffer_peek(iter->tr->buffer, cpu, ts, +		event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,  					 lost_events);  	if (event) { @@ -1808,7 +2062,7 @@ static struct trace_entry *  __find_next_entry(struct trace_iterator *iter, int *ent_cpu,  		  unsigned long *missing_events, u64 *ent_ts)  { -	struct ring_buffer *buffer = iter->tr->buffer; +	struct ring_buffer *buffer = iter->trace_buffer->buffer;  	struct trace_entry *ent, *next = NULL;  	unsigned long lost_events = 0, next_lost = 0;  	int cpu_file = iter->cpu_file; @@ -1821,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,  	 * If we are in a per_cpu trace file, don't bother by iterating over  	 * all cpu and peek directly.  	 */ -	if (cpu_file > TRACE_PIPE_ALL_CPU) { +	if (cpu_file > RING_BUFFER_ALL_CPUS) {  		if (ring_buffer_empty_cpu(buffer, cpu_file))  			return NULL;  		ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); @@ -1885,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)  static void trace_consume(struct trace_iterator *iter)  { -	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, +	ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,  			    &iter->lost_events);  } @@ -1918,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)  void tracing_iter_reset(struct trace_iterator *iter, int cpu)  { -	struct trace_array *tr = iter->tr;  	struct ring_buffer_event *event;  	struct ring_buffer_iter *buf_iter;  	unsigned long entries = 0;  	u64 ts; -	tr->data[cpu]->skipped_entries = 0; +	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;  	buf_iter = trace_buffer_iter(iter, cpu);  	if (!buf_iter) @@ -1938,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  	 * by the timestamp being before the start of the buffer.  	 */  	while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { -		if (ts >= iter->tr->time_start) +		if (ts >= iter->trace_buffer->time_start)  			break;  		entries++;  		ring_buffer_read(buf_iter, NULL);  	} -	tr->data[cpu]->skipped_entries = entries; +	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;  }  /* @@ -1954,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  static void *s_start(struct seq_file *m, loff_t *pos)  {  	struct trace_iterator *iter = m->private; +	struct trace_array *tr = iter->tr;  	int cpu_file = iter->cpu_file;  	void *p = NULL;  	loff_t l = 0; @@ -1966,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)  	 * will point to the same string as current_trace->name.  	 
*/  	mutex_lock(&trace_types_lock); -	if (unlikely(current_trace && iter->trace->name != current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock); +#ifdef CONFIG_TRACER_MAX_TRACE  	if (iter->snapshot && iter->trace->use_max_tr)  		return ERR_PTR(-EBUSY); +#endif  	if (!iter->snapshot)  		atomic_inc(&trace_record_cmdline_disabled); @@ -1981,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)  		iter->cpu = 0;  		iter->idx = -1; -		if (cpu_file == TRACE_PIPE_ALL_CPU) { +		if (cpu_file == RING_BUFFER_ALL_CPUS) {  			for_each_tracing_cpu(cpu)  				tracing_iter_reset(iter, cpu);  		} else @@ -2013,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p)  {  	struct trace_iterator *iter = m->private; +#ifdef CONFIG_TRACER_MAX_TRACE  	if (iter->snapshot && iter->trace->use_max_tr)  		return; +#endif  	if (!iter->snapshot)  		atomic_dec(&trace_record_cmdline_disabled); +  	trace_access_unlock(iter->cpu_file);  	trace_event_read_unlock();  }  static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +get_total_entries(struct trace_buffer *buf, +		  unsigned long *total, unsigned long *entries)  {  	unsigned long count;  	int cpu; @@ -2032,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e  	*entries = 0;  	for_each_tracing_cpu(cpu) { -		count = ring_buffer_entries_cpu(tr->buffer, cpu); +		count = ring_buffer_entries_cpu(buf->buffer, cpu);  		/*  		 * If this buffer has skipped entries, then we hold all  		 * entries for the trace and we need to ignore the  		 * ones before the time stamp.  		 */ -		if (tr->data[cpu]->skipped_entries) { -			count -= tr->data[cpu]->skipped_entries; +		if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { +			count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;  			/* total is the same as the entries */  			*total += count;  		} else  			*total += count + -				ring_buffer_overrun_cpu(tr->buffer, cpu); +				ring_buffer_overrun_cpu(buf->buffer, cpu);  		*entries += count;  	}  } @@ -2061,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m)  	seq_puts(m, "#     \\   /      |||||  \\    |   /           \n");  } -static void print_event_info(struct trace_array *tr, struct seq_file *m) +static void print_event_info(struct trace_buffer *buf, struct seq_file *m)  {  	unsigned long total;  	unsigned long entries; -	get_total_entries(tr, &total, &entries); +	get_total_entries(buf, &total, &entries);  	seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n",  		   entries, total, num_online_cpus());  	seq_puts(m, "#\n");  } -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)  { -	print_event_info(tr, m); +	print_event_info(buf, m);  	seq_puts(m, "#           TASK-PID   CPU#      TIMESTAMP  FUNCTION\n");  	seq_puts(m, "#              | |       |          |         |\n");  } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)  { -	print_event_info(tr, m); +	print_event_info(buf, m);  	seq_puts(m, "#                              _-----=> irqs-off\n");  	seq_puts(m, "#                             / _----=> need-resched\n");  	seq_puts(m, "#                            | / 
_---=> hardirq/softirq\n"); @@ -2095,16 +2355,16 @@ void  print_trace_header(struct seq_file *m, struct trace_iterator *iter)  {  	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); -	struct trace_array *tr = iter->tr; -	struct trace_array_cpu *data = tr->data[tr->cpu]; -	struct tracer *type = current_trace; +	struct trace_buffer *buf = iter->trace_buffer; +	struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); +	struct tracer *type = iter->trace;  	unsigned long entries;  	unsigned long total;  	const char *name = "preemption";  	name = type->name; -	get_total_entries(tr, &total, &entries); +	get_total_entries(buf, &total, &entries);  	seq_printf(m, "# %s latency trace v1.1.5 on %s\n",  		   name, UTS_RELEASE); @@ -2115,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)  		   nsecs_to_usecs(data->saved_latency),  		   entries,  		   total, -		   tr->cpu, +		   buf->cpu,  #if defined(CONFIG_PREEMPT_NONE)  		   "server",  #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -2166,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)  	if (cpumask_test_cpu(iter->cpu, iter->started))  		return; -	if (iter->tr->data[iter->cpu]->skipped_entries) +	if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)  		return;  	cpumask_set_cpu(iter->cpu, iter->started); @@ -2289,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter)  	int cpu;  	/* If we are looking at one CPU buffer, only check that one */ -	if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { +	if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {  		cpu = iter->cpu_file;  		buf_iter = trace_buffer_iter(iter, cpu);  		if (buf_iter) {  			if (!ring_buffer_iter_empty(buf_iter))  				return 0;  		} else { -			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))  				return 0;  		}  		return 1; @@ -2308,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter)  			if (!ring_buffer_iter_empty(buf_iter))  				return 0;  		} else { -			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))  				return 0;  		}  	} @@ -2332,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)  			return ret;  	} +	if (iter->ent->type == TRACE_BPUTS && +			trace_flags & TRACE_ITER_PRINTK && +			trace_flags & TRACE_ITER_PRINTK_MSGONLY) +		return trace_print_bputs_msg_only(iter); +  	if (iter->ent->type == TRACE_BPRINT &&  			trace_flags & TRACE_ITER_PRINTK &&  			trace_flags & TRACE_ITER_PRINTK_MSGONLY) @@ -2386,9 +2651,9 @@ void trace_default_header(struct seq_file *m)  	} else {  		if (!(trace_flags & TRACE_ITER_VERBOSE)) {  			if (trace_flags & TRACE_ITER_IRQ_INFO) -				print_func_help_header_irq(iter->tr, m); +				print_func_help_header_irq(iter->trace_buffer, m);  			else -				print_func_help_header(iter->tr, m); +				print_func_help_header(iter->trace_buffer, m);  		}  	}  } @@ -2402,14 +2667,8 @@ static void test_ftrace_alive(struct seq_file *m)  }  #ifdef CONFIG_TRACER_MAX_TRACE -static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +static void show_snapshot_main_help(struct seq_file *m)  { -	if (iter->trace->allocated_snapshot) -		seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); -	else -		seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); - -	seq_printf(m, "# Snapshot commands:\n");  	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");  	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot 
buffer, if not already allocated.\n");  	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n"); @@ -2417,6 +2676,35 @@ static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)  	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n");  	seq_printf(m, "#                       is not a '0' or '1')\n");  } + +static void show_snapshot_percpu_help(struct seq_file *m) +{ +	seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP +	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); +	seq_printf(m, "#                      Takes a snapshot of the main buffer for this cpu.\n"); +#else +	seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); +	seq_printf(m, "#                     Must use main snapshot file to allocate.\n"); +#endif +	seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); +	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n"); +	seq_printf(m, "#                       is not a '0' or '1')\n"); +} + +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ +	if (iter->tr->allocated_snapshot) +		seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); +	else +		seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + +	seq_printf(m, "# Snapshot commands:\n"); +	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +		show_snapshot_main_help(m); +	else +		show_snapshot_percpu_help(m); +}  #else  /* Should never be called */  static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } @@ -2476,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = {  static struct trace_iterator *  __tracing_open(struct inode *inode, struct file *file, bool snapshot)  { -	long cpu_file = (long) inode->i_private; +	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = tc->tr;  	struct trace_iterator *iter;  	int cpu; @@ -2501,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	if (!iter->trace)  		goto fail; -	*iter->trace = *current_trace; +	*iter->trace = *tr->current_trace;  	if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))  		goto fail; -	if (current_trace->print_max || snapshot) -		iter->tr = &max_tr; +	iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE +	/* Currently only the top directory has a snapshot */ +	if (tr->current_trace->print_max || snapshot) +		iter->trace_buffer = &tr->max_buffer;  	else -		iter->tr = &global_trace; +#endif +		iter->trace_buffer = &tr->trace_buffer;  	iter->snapshot = snapshot;  	iter->pos = -1;  	mutex_init(&iter->mutex); -	iter->cpu_file = cpu_file; +	iter->cpu_file = tc->cpu;  	/* Notify the tracer early; before we stop tracing. */  	if (iter->trace && iter->trace->open)  		iter->trace->open(iter);  	/* Annotate start of buffers if we had overruns */ -	if (ring_buffer_overruns(iter->tr->buffer)) +	if (ring_buffer_overruns(iter->trace_buffer->buffer))  		iter->iter_flags |= TRACE_FILE_ANNOTATE;  	/* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ @@ -2529,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	/* stop the trace while dumping if we are not opening "snapshot" */  	if (!iter->snapshot) -		tracing_stop(); +		tracing_stop_tr(tr); -	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { +	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) {  			iter->buffer_iter[cpu] = -				ring_buffer_read_prepare(iter->tr->buffer, cpu); +				ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);  		}  		ring_buffer_read_prepare_sync();  		for_each_tracing_cpu(cpu) { @@ -2544,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  	} else {  		cpu = iter->cpu_file;  		iter->buffer_iter[cpu] = -			ring_buffer_read_prepare(iter->tr->buffer, cpu); +			ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);  		ring_buffer_read_prepare_sync();  		ring_buffer_read_start(iter->buffer_iter[cpu]);  		tracing_iter_reset(iter, cpu);  	} +	tr->ref++; +  	mutex_unlock(&trace_types_lock);  	return iter; @@ -2576,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file)  {  	struct seq_file *m = file->private_data;  	struct trace_iterator *iter; +	struct trace_array *tr;  	int cpu;  	if (!(file->f_mode & FMODE_READ))  		return 0;  	iter = m->private; +	tr = iter->tr;  	mutex_lock(&trace_types_lock); + +	WARN_ON(!tr->ref); +	tr->ref--; +  	for_each_tracing_cpu(cpu) {  		if (iter->buffer_iter[cpu])  			ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2594,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file)  	if (!iter->snapshot)  		/* reenable tracing if it was previously enabled */ -		tracing_start(); +		tracing_start_tr(tr);  	mutex_unlock(&trace_types_lock);  	mutex_destroy(&iter->mutex); @@ -2613,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file)  	/* If this file was open for write, then erase contents */  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) { -		long cpu = (long) inode->i_private; +		struct trace_cpu *tc = inode->i_private; +		struct trace_array *tr = tc->tr; -		if (cpu == TRACE_PIPE_ALL_CPU) -			tracing_reset_online_cpus(&global_trace); +		if (tc->cpu == RING_BUFFER_ALL_CPUS) +			tracing_reset_online_cpus(&tr->trace_buffer);  		else -			tracing_reset(&global_trace, cpu); +			tracing_reset(&tr->trace_buffer, tc->cpu);  	}  	if (file->f_mode & FMODE_READ) { @@ -2765,8 +3068,9 @@ static ssize_t  tracing_cpumask_write(struct file *filp, const char __user *ubuf,  		      size_t count, loff_t *ppos)  { -	int err, cpu; +	struct trace_array *tr = filp->private_data;  	cpumask_var_t tracing_cpumask_new; +	int err, cpu;  	if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))  		return -ENOMEM; @@ -2786,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  		 */  		if (cpumask_test_cpu(cpu, tracing_cpumask) &&  				!cpumask_test_cpu(cpu, tracing_cpumask_new)) { -			atomic_inc(&global_trace.data[cpu]->disabled); -			ring_buffer_record_disable_cpu(global_trace.buffer, cpu); +			atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); +			ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);  		}  		if (!cpumask_test_cpu(cpu, tracing_cpumask) &&  				cpumask_test_cpu(cpu, tracing_cpumask_new)) { -			atomic_dec(&global_trace.data[cpu]->disabled); -			ring_buffer_record_enable_cpu(global_trace.buffer, cpu); +			atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); +			
ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);  		}  	}  	arch_spin_unlock(&ftrace_max_lock); @@ -2821,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = {  static int tracing_trace_options_show(struct seq_file *m, void *v)  {  	struct tracer_opt *trace_opts; +	struct trace_array *tr = m->private;  	u32 tracer_flags;  	int i;  	mutex_lock(&trace_types_lock); -	tracer_flags = current_trace->flags->val; -	trace_opts = current_trace->flags->opts; +	tracer_flags = tr->current_trace->flags->val; +	trace_opts = tr->current_trace->flags->opts;  	for (i = 0; trace_options[i]; i++) {  		if (trace_flags & (1 << i)) @@ -2890,15 +3195,15 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)  	return 0;  } -int set_tracer_flag(unsigned int mask, int enabled) +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)  {  	/* do nothing if flag is already set */  	if (!!(trace_flags & mask) == !!enabled)  		return 0;  	/* Give the tracer a chance to approve the change */ -	if (current_trace->flag_changed) -		if (current_trace->flag_changed(current_trace, mask, !!enabled)) +	if (tr->current_trace->flag_changed) +		if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))  			return -EINVAL;  	if (enabled) @@ -2910,9 +3215,9 @@ int set_tracer_flag(unsigned int mask, int enabled)  		trace_event_enable_cmd_record(enabled);  	if (mask == TRACE_ITER_OVERWRITE) { -		ring_buffer_change_overwrite(global_trace.buffer, enabled); +		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);  #ifdef CONFIG_TRACER_MAX_TRACE -		ring_buffer_change_overwrite(max_tr.buffer, enabled); +		ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);  #endif  	} @@ -2922,7 +3227,7 @@ int set_tracer_flag(unsigned int mask, int enabled)  	return 0;  } -static int trace_set_options(char *option) +static int trace_set_options(struct trace_array *tr, char *option)  {  	char *cmp;  	int neg = 0; @@ -2940,14 +3245,14 @@ static int trace_set_options(char *option)  	for (i = 0; trace_options[i]; i++) {  		if (strcmp(cmp, trace_options[i]) == 0) { -			ret = set_tracer_flag(1 << i, !neg); +			ret = set_tracer_flag(tr, 1 << i, !neg);  			break;  		}  	}  	/* If no option could be set, test the specific tracer options */  	if (!trace_options[i]) -		ret = set_tracer_option(current_trace, cmp, neg); +		ret = set_tracer_option(tr->current_trace, cmp, neg);  	mutex_unlock(&trace_types_lock); @@ -2958,6 +3263,8 @@ static ssize_t  tracing_trace_options_write(struct file *filp, const char __user *ubuf,  			size_t cnt, loff_t *ppos)  { +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private;  	char buf[64];  	int ret; @@ -2969,7 +3276,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,  	buf[cnt] = 0; -	ret = trace_set_options(buf); +	ret = trace_set_options(tr, buf);  	if (ret < 0)  		return ret; @@ -2982,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)  {  	if (tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_trace_options_show, NULL); + +	return single_open(file, tracing_trace_options_show, inode->i_private);  }  static const struct file_operations tracing_iter_fops = { @@ -2995,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = {  static const char readme_msg[] =  	"tracing mini-HOWTO:\n\n" -	"# mount -t debugfs nodev /sys/kernel/debug\n\n" -	"# cat /sys/kernel/debug/tracing/available_tracers\n" -	"wakeup wakeup_rt 
preemptirqsoff preemptoff irqsoff function nop\n\n" -	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"nop\n" -	"# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" -	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"wakeup\n" -	"# cat /sys/kernel/debug/tracing/trace_options\n" -	"noprint-parent nosym-offset nosym-addr noverbose\n" -	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" -	"# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" -	"# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" -	"# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" +	"# echo 0 > tracing_on : quick way to disable tracing\n" +	"# echo 1 > tracing_on : quick way to re-enable tracing\n\n" +	" Important files:\n" +	"  trace\t\t\t- The static contents of the buffer\n" +	"\t\t\t  To clear the buffer write into this file: echo > trace\n" +	"  trace_pipe\t\t- A consuming read to see the contents of the buffer\n" +	"  current_tracer\t- function and latency tracers\n" +	"  available_tracers\t- list of configured tracers for current_tracer\n" +	"  buffer_size_kb\t- view and modify size of per cpu buffer\n" +	"  buffer_total_size_kb  - view total size of all cpu buffers\n\n" +	"  trace_clock\t\t-change the clock used to order events\n" +	"       local:   Per cpu clock but may not be synced across CPUs\n" +	"      global:   Synced across CPUs but slows tracing down.\n" +	"     counter:   Not a clock, but just an increment\n" +	"      uptime:   Jiffy counter from time of boot\n" +	"        perf:   Same clock that perf events use\n" +#ifdef CONFIG_X86_64 +	"     x86-tsc:   TSC cycle counter\n" +#endif +	"\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n" +	"  tracing_cpumask\t- Limit which CPUs to trace\n" +	"  instances\t\t- Make sub-buffers with: mkdir instances/foo\n" +	"\t\t\t  Remove sub-buffer with rmdir\n" +	"  trace_options\t\t- Set format or modify how tracing happens\n" +	"\t\t\t  Disable an option by adding a suffix 'no' to the option name\n" +#ifdef CONFIG_DYNAMIC_FTRACE +	"\n  available_filter_functions - list of functions that can be filtered on\n" +	"  set_ftrace_filter\t- echo function name in here to only trace these functions\n" +	"            accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" +	"            modules: Can select a group via module\n" +	"             Format: :mod:<module-name>\n" +	"             example: echo :mod:ext3 > set_ftrace_filter\n" +	"            triggers: a command to perform when function is hit\n" +	"              Format: <function>:<trigger>[:count]\n" +	"             trigger: traceon, traceoff\n" +	"                      enable_event:<system>:<event>\n" +	"                      disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE +	"                      stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"                      snapshot\n" +#endif +	"             example: echo do_fault:traceoff > set_ftrace_filter\n" +	"                      echo do_trap:traceoff:3 > set_ftrace_filter\n" +	"             The first one will disable tracing every time do_fault is hit\n" +	"             The second will disable tracing at most 3 times when do_trap is hit\n" +	"               The first time do trap is hit and it disables tracing, the counter\n" +	"               will decrement to 2. If tracing is already disabled, the counter\n" +	"               will not decrement. 
It only decrements when the trigger did work\n" +	"             To remove trigger without count:\n" +	"               echo '!<function>:<trigger> > set_ftrace_filter\n" +	"             To remove trigger with a count:\n" +	"               echo '!<function>:<trigger>:0 > set_ftrace_filter\n" +	"  set_ftrace_notrace\t- echo function name in here to never trace.\n" +	"            accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" +	"            modules: Can select a group via module command :mod:\n" +	"            Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER +	"  set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	"  set_graph_function\t- Trace the nested calls of a function (function_graph)\n" +	"  max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"\n  snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" +	"\t\t\t  Read the contents for more information\n" +#endif +#ifdef CONFIG_STACKTRACE +	"  stack_trace\t\t- Shows the max stack trace when active\n" +	"  stack_max_size\t- Shows current max stack size that was traced\n" +	"\t\t\t  Write into this file to reset the max size (trigger a new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE +	"  stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" +#endif +#endif /* CONFIG_STACKTRACE */  ;  static ssize_t @@ -3080,11 +3452,12 @@ static ssize_t  tracing_set_trace_read(struct file *filp, char __user *ubuf,  		       size_t cnt, loff_t *ppos)  { +	struct trace_array *tr = filp->private_data;  	char buf[MAX_TRACER_SIZE+2];  	int r;  	mutex_lock(&trace_types_lock); -	r = sprintf(buf, "%s\n", current_trace->name); +	r = sprintf(buf, "%s\n", tr->current_trace->name);  	mutex_unlock(&trace_types_lock);  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3092,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,  int tracer_init(struct tracer *t, struct trace_array *tr)  { -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	return t->init(tr);  } -static void set_buffer_entries(struct trace_array *tr, unsigned long val) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)  {  	int cpu; +  	for_each_tracing_cpu(cpu) -		tr->data[cpu]->entries = val; +		per_cpu_ptr(buf->data, cpu)->entries = val;  } +#ifdef CONFIG_TRACER_MAX_TRACE  /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_array *tr, -					struct trace_array *size_tr, int cpu_id) +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, +					struct trace_buffer *size_buf, int cpu_id)  {  	int cpu, ret = 0;  	if (cpu_id == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) { -			ret = ring_buffer_resize(tr->buffer, -					size_tr->data[cpu]->entries, cpu); +			ret = ring_buffer_resize(trace_buf->buffer, +				 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);  			if (ret < 0)  				break; -			tr->data[cpu]->entries = size_tr->data[cpu]->entries; +			per_cpu_ptr(trace_buf->data, cpu)->entries = +				per_cpu_ptr(size_buf->data, cpu)->entries;  		}  	} else { -		ret = ring_buffer_resize(tr->buffer, -					size_tr->data[cpu_id]->entries, cpu_id); +		ret = ring_buffer_resize(trace_buf->buffer, +				 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);  		if (ret == 0) -			
tr->data[cpu_id]->entries = -				size_tr->data[cpu_id]->entries; +			per_cpu_ptr(trace_buf->data, cpu_id)->entries = +				per_cpu_ptr(size_buf->data, cpu_id)->entries;  	}  	return ret;  } +#endif /* CONFIG_TRACER_MAX_TRACE */ -static int __tracing_resize_ring_buffer(unsigned long size, int cpu) +static int __tracing_resize_ring_buffer(struct trace_array *tr, +					unsigned long size, int cpu)  {  	int ret; @@ -3137,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)  	 * we use the size that was given, and we can forget about  	 * expanding it later.  	 */ -	ring_buffer_expanded = 1; +	ring_buffer_expanded = true;  	/* May be called before buffers are initialized */ -	if (!global_trace.buffer) +	if (!tr->trace_buffer.buffer)  		return 0; -	ret = ring_buffer_resize(global_trace.buffer, size, cpu); +	ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);  	if (ret < 0)  		return ret; -	if (!current_trace->use_max_tr) +#ifdef CONFIG_TRACER_MAX_TRACE +	if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || +	    !tr->current_trace->use_max_tr)  		goto out; -	ret = ring_buffer_resize(max_tr.buffer, size, cpu); +	ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);  	if (ret < 0) { -		int r = resize_buffer_duplicate_size(&global_trace, -						     &global_trace, cpu); +		int r = resize_buffer_duplicate_size(&tr->trace_buffer, +						     &tr->trace_buffer, cpu);  		if (r < 0) {  			/*  			 * AARGH! We are left with different @@ -3176,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)  	}  	if (cpu == RING_BUFFER_ALL_CPUS) -		set_buffer_entries(&max_tr, size); +		set_buffer_entries(&tr->max_buffer, size);  	else -		max_tr.data[cpu]->entries = size; +		per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;   out: +#endif /* CONFIG_TRACER_MAX_TRACE */ +  	if (cpu == RING_BUFFER_ALL_CPUS) -		set_buffer_entries(&global_trace, size); +		set_buffer_entries(&tr->trace_buffer, size);  	else -		global_trace.data[cpu]->entries = size; +		per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;  	return ret;  } -static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, +					  unsigned long size, int cpu_id)  {  	int ret = size; @@ -3203,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)  		}  	} -	ret = __tracing_resize_ring_buffer(size, cpu_id); +	ret = __tracing_resize_ring_buffer(tr, size, cpu_id);  	if (ret < 0)  		ret = -ENOMEM; @@ -3230,7 +3613,7 @@ int tracing_update_buffers(void)  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) -		ret = __tracing_resize_ring_buffer(trace_buf_size, +		ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,  						RING_BUFFER_ALL_CPUS);  	mutex_unlock(&trace_types_lock); @@ -3240,7 +3623,7 @@ int tracing_update_buffers(void)  struct trace_option_dentry;  static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer);  static void  destroy_trace_option_files(struct trace_option_dentry *topts); @@ -3250,13 +3633,15 @@ static int tracing_set_tracer(const char *buf)  	static struct trace_option_dentry *topts;  	struct trace_array *tr = &global_trace;  	struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE  	bool had_max_tr; +#endif  	int ret = 0;  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) { -		ret = __tracing_resize_ring_buffer(trace_buf_size, +		ret = 
__tracing_resize_ring_buffer(tr, trace_buf_size,  						RING_BUFFER_ALL_CPUS);  		if (ret < 0)  			goto out; @@ -3271,18 +3656,21 @@ static int tracing_set_tracer(const char *buf)  		ret = -EINVAL;  		goto out;  	} -	if (t == current_trace) +	if (t == tr->current_trace)  		goto out;  	trace_branch_disable(); -	current_trace->enabled = false; +	tr->current_trace->enabled = false; + +	if (tr->current_trace->reset) +		tr->current_trace->reset(tr); -	if (current_trace->reset) -		current_trace->reset(tr); +	/* Current trace needs to be nop_trace before synchronize_sched */ +	tr->current_trace = &nop_trace; -	had_max_tr = current_trace->allocated_snapshot; -	current_trace = &nop_trace; +#ifdef CONFIG_TRACER_MAX_TRACE +	had_max_tr = tr->allocated_snapshot;  	if (had_max_tr && !t->use_max_tr) {  		/* @@ -3293,27 +3681,20 @@ static int tracing_set_tracer(const char *buf)  		 * so a synchronized_sched() is sufficient.  		 */  		synchronize_sched(); -		/* -		 * We don't free the ring buffer. instead, resize it because -		 * The max_tr ring buffer has some state (e.g. ring->clock) and -		 * we want preserve it. -		 */ -		ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); -		set_buffer_entries(&max_tr, 1); -		tracing_reset_online_cpus(&max_tr); -		current_trace->allocated_snapshot = false; +		free_snapshot(tr);  	} +#endif  	destroy_trace_option_files(topts); -	topts = create_trace_option_files(t); +	topts = create_trace_option_files(tr, t); + +#ifdef CONFIG_TRACER_MAX_TRACE  	if (t->use_max_tr && !had_max_tr) { -		/* we need to make per cpu buffer sizes equivalent */ -		ret = resize_buffer_duplicate_size(&max_tr, &global_trace, -						   RING_BUFFER_ALL_CPUS); +		ret = alloc_snapshot(tr);  		if (ret < 0)  			goto out; -		t->allocated_snapshot = true;  	} +#endif  	if (t->init) {  		ret = tracer_init(t, tr); @@ -3321,8 +3702,8 @@ static int tracing_set_tracer(const char *buf)  			goto out;  	} -	current_trace = t; -	current_trace->enabled = true; +	tr->current_trace = t; +	tr->current_trace->enabled = true;  	trace_branch_enable(tr);   out:  	mutex_unlock(&trace_types_lock); @@ -3396,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,  static int tracing_open_pipe(struct inode *inode, struct file *filp)  { -	long cpu_file = (long) inode->i_private; +	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = tc->tr;  	struct trace_iterator *iter;  	int ret = 0; @@ -3421,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  		ret = -ENOMEM;  		goto fail;  	} -	*iter->trace = *current_trace; +	*iter->trace = *tr->current_trace;  	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {  		ret = -ENOMEM; @@ -3438,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  	if (trace_clocks[trace_clock_id].in_ns)  		iter->iter_flags |= TRACE_FILE_TIME_IN_NS; -	iter->cpu_file = cpu_file; -	iter->tr = &global_trace; +	iter->cpu_file = tc->cpu; +	iter->tr = tc->tr; +	iter->trace_buffer = &tc->tr->trace_buffer;  	mutex_init(&iter->mutex);  	filp->private_data = iter; @@ -3478,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)  }  static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)  { -	struct trace_iterator *iter = filp->private_data; +	/* Iterators are static, they should be filled or empty */ +	if (trace_buffer_iter(iter, iter->cpu_file)) +		return POLLIN | POLLRDNORM; 
-	if (trace_flags & TRACE_ITER_BLOCK) { +	if (trace_flags & TRACE_ITER_BLOCK)  		/*  		 * Always select as readable when in blocking mode  		 */  		return POLLIN | POLLRDNORM; -	} else { -		if (!trace_empty(iter)) -			return POLLIN | POLLRDNORM; -		poll_wait(filp, &trace_wait, poll_table); -		if (!trace_empty(iter)) -			return POLLIN | POLLRDNORM; +	else +		return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, +					     filp, poll_table); +} -		return 0; -	} +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ +	struct trace_iterator *iter = filp->private_data; + +	return trace_poll(iter, filp, poll_table);  }  /* @@ -3561,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  		  size_t cnt, loff_t *ppos)  {  	struct trace_iterator *iter = filp->private_data; +	struct trace_array *tr = iter->tr;  	ssize_t sret;  	/* return any leftover data */ @@ -3572,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(iter->trace->name != current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock);  	/* @@ -3729,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  		.ops		= &tracing_pipe_buf_ops,  		.spd_release	= tracing_spd_release_pipe,  	}; +	struct trace_array *tr = iter->tr;  	ssize_t ret;  	size_t rem;  	unsigned int i; @@ -3738,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(iter->trace->name != current_trace->name)) -		*iter->trace = *current_trace; +	if (unlikely(iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock);  	mutex_lock(&iter->mutex); @@ -3801,43 +4190,19 @@ out_err:  	goto out;  } -struct ftrace_entries_info { -	struct trace_array	*tr; -	int			cpu; -}; - -static int tracing_entries_open(struct inode *inode, struct file *filp) -{ -	struct ftrace_entries_info *info; - -	if (tracing_disabled) -		return -ENODEV; - -	info = kzalloc(sizeof(*info), GFP_KERNEL); -	if (!info) -		return -ENOMEM; - -	info->tr = &global_trace; -	info->cpu = (unsigned long)inode->i_private; - -	filp->private_data = info; - -	return 0; -} -  static ssize_t  tracing_entries_read(struct file *filp, char __user *ubuf,  		     size_t cnt, loff_t *ppos)  { -	struct ftrace_entries_info *info = filp->private_data; -	struct trace_array *tr = info->tr; +	struct trace_cpu *tc = filp->private_data; +	struct trace_array *tr = tc->tr;  	char buf[64];  	int r = 0;  	ssize_t ret;  	mutex_lock(&trace_types_lock); -	if (info->cpu == RING_BUFFER_ALL_CPUS) { +	if (tc->cpu == RING_BUFFER_ALL_CPUS) {  		int cpu, buf_size_same;  		unsigned long size; @@ -3847,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,  		for_each_tracing_cpu(cpu) {  			/* fill in the size from first enabled cpu */  			if (size == 0) -				size = tr->data[cpu]->entries; -			if (size != tr->data[cpu]->entries) { +				size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; +			if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {  				buf_size_same = 0;  				break;  			} @@ -3864,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,  		} else  			r = sprintf(buf, "X\n");  	} 
else -		r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); +		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);  	mutex_unlock(&trace_types_lock); @@ -3876,7 +4241,7 @@ static ssize_t  tracing_entries_write(struct file *filp, const char __user *ubuf,  		      size_t cnt, loff_t *ppos)  { -	struct ftrace_entries_info *info = filp->private_data; +	struct trace_cpu *tc = filp->private_data;  	unsigned long val;  	int ret; @@ -3891,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	/* value is in KB */  	val <<= 10; -	ret = tracing_resize_ring_buffer(val, info->cpu); +	ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);  	if (ret < 0)  		return ret; @@ -3900,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	return cnt;  } -static int -tracing_entries_release(struct inode *inode, struct file *filp) -{ -	struct ftrace_entries_info *info = filp->private_data; - -	kfree(info); - -	return 0; -} -  static ssize_t  tracing_total_entries_read(struct file *filp, char __user *ubuf,  				size_t cnt, loff_t *ppos) @@ -3921,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,  	mutex_lock(&trace_types_lock);  	for_each_tracing_cpu(cpu) { -		size += tr->data[cpu]->entries >> 10; +		size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;  		if (!ring_buffer_expanded)  			expanded_size += trace_buf_size >> 10;  	} @@ -3951,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,  static int  tracing_free_buffer_release(struct inode *inode, struct file *filp)  { +	struct trace_array *tr = inode->i_private; +  	/* disable tracing ? */  	if (trace_flags & TRACE_ITER_STOP_ON_FREE)  		tracing_off();  	/* resize the ring buffer to 0 */ -	tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); +	tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);  	return 0;  } @@ -4024,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	local_save_flags(irq_flags);  	size = sizeof(*entry) + cnt + 2; /* possible \n added */ -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,  					  irq_flags, preempt_count());  	if (!event) { @@ -4066,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  static int tracing_clock_show(struct seq_file *m, void *v)  { +	struct trace_array *tr = m->private;  	int i;  	for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)  		seq_printf(m,  			"%s%s%s%s", i ? " " : "", -			i == trace_clock_id ? "[" : "", trace_clocks[i].name, -			i == trace_clock_id ? "]" : ""); +			i == tr->clock_id ? "[" : "", trace_clocks[i].name, +			i == tr->clock_id ? 
"]" : "");  	seq_putc(m, '\n');  	return 0; @@ -4081,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)  static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  				   size_t cnt, loff_t *fpos)  { +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private;  	char buf[64];  	const char *clockstr;  	int i; @@ -4102,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  	if (i == ARRAY_SIZE(trace_clocks))  		return -EINVAL; -	trace_clock_id = i; -  	mutex_lock(&trace_types_lock); -	ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); -	if (max_tr.buffer) -		ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); +	tr->clock_id = i; + +	ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);  	/*  	 * New clock may not be consistent with the previous clock.  	 * Reset the buffer so that it doesn't have incomparable timestamps.  	 */ -	tracing_reset_online_cpus(&global_trace); -	tracing_reset_online_cpus(&max_tr); +	tracing_reset_online_cpus(&global_trace.trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) +		ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); +	tracing_reset_online_cpus(&global_trace.max_buffer); +#endif  	mutex_unlock(&trace_types_lock); @@ -4128,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)  {  	if (tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_clock_show, NULL); + +	return single_open(file, tracing_clock_show, inode->i_private);  } +struct ftrace_buffer_info { +	struct trace_iterator	iter; +	void			*spare; +	unsigned int		read; +}; +  #ifdef CONFIG_TRACER_SNAPSHOT  static int tracing_snapshot_open(struct inode *inode, struct file *file)  { +	struct trace_cpu *tc = inode->i_private;  	struct trace_iterator *iter; +	struct seq_file *m;  	int ret = 0;  	if (file->f_mode & FMODE_READ) {  		iter = __tracing_open(inode, file, true);  		if (IS_ERR(iter))  			ret = PTR_ERR(iter); +	} else { +		/* Writes still need the seq_file to hold the private data */ +		m = kzalloc(sizeof(*m), GFP_KERNEL); +		if (!m) +			return -ENOMEM; +		iter = kzalloc(sizeof(*iter), GFP_KERNEL); +		if (!iter) { +			kfree(m); +			return -ENOMEM; +		} +		iter->tr = tc->tr; +		iter->trace_buffer = &tc->tr->max_buffer; +		iter->cpu_file = tc->cpu; +		m->private = iter; +		file->private_data = m;  	} +  	return ret;  } @@ -4149,6 +4537,9 @@ static ssize_t  tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,  		       loff_t *ppos)  { +	struct seq_file *m = filp->private_data; +	struct trace_iterator *iter = m->private; +	struct trace_array *tr = iter->tr;  	unsigned long val;  	int ret; @@ -4162,40 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,  	mutex_lock(&trace_types_lock); -	if (current_trace->use_max_tr) { +	if (tr->current_trace->use_max_tr) {  		ret = -EBUSY;  		goto out;  	}  	switch (val) {  	case 0: -		if (current_trace->allocated_snapshot) { -			/* free spare buffer */ -			ring_buffer_resize(max_tr.buffer, 1, -					   RING_BUFFER_ALL_CPUS); -			set_buffer_entries(&max_tr, 1); -			tracing_reset_online_cpus(&max_tr); -			current_trace->allocated_snapshot = false; +		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { +			ret = -EINVAL; +			break;  		} +		if (tr->allocated_snapshot) +			free_snapshot(tr);  		break;  	case 1: -		if 
(!current_trace->allocated_snapshot) { -			/* allocate spare buffer */ -			ret = resize_buffer_duplicate_size(&max_tr, -					&global_trace, RING_BUFFER_ALL_CPUS); +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP +		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { +			ret = -EINVAL; +			break; +		} +#endif +		if (!tr->allocated_snapshot) { +			ret = alloc_snapshot(tr);  			if (ret < 0)  				break; -			current_trace->allocated_snapshot = true;  		} -  		local_irq_disable();  		/* Now, we're going to swap */ -		update_max_tr(&global_trace, current, smp_processor_id()); +		if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +			update_max_tr(tr, current, smp_processor_id()); +		else +			update_max_tr_single(tr, current, iter->cpu_file);  		local_irq_enable();  		break;  	default: -		if (current_trace->allocated_snapshot) -			tracing_reset_online_cpus(&max_tr); +		if (tr->allocated_snapshot) { +			if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +				tracing_reset_online_cpus(&tr->max_buffer); +			else +				tracing_reset(&tr->max_buffer, iter->cpu_file); +		}  		break;  	} @@ -4207,6 +4606,51 @@ out:  	mutex_unlock(&trace_types_lock);  	return ret;  } + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ +	struct seq_file *m = file->private_data; + +	if (file->f_mode & FMODE_READ) +		return tracing_release(inode, file); + +	/* If write only, the seq_file is just a stub */ +	if (m) +		kfree(m->private); +	kfree(m); + +	return 0; +} + +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, +				    size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, +		   struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ +	struct ftrace_buffer_info *info; +	int ret; + +	ret = tracing_buffers_open(inode, filp); +	if (ret < 0) +		return ret; + +	info = filp->private_data; + +	if (info->iter.trace->use_max_tr) { +		tracing_buffers_release(inode, filp); +		return -EBUSY; +	} + +	info->iter.snapshot = true; +	info->iter.trace_buffer = &info->iter.tr->max_buffer; + +	return ret; +} +  #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -4234,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = {  };  static const struct file_operations tracing_entries_fops = { -	.open		= tracing_entries_open, +	.open		= tracing_open_generic,  	.read		= tracing_entries_read,  	.write		= tracing_entries_write, -	.release	= tracing_entries_release,  	.llseek		= generic_file_llseek,  }; @@ -4272,20 +4715,23 @@ static const struct file_operations snapshot_fops = {  	.read		= seq_read,  	.write		= tracing_snapshot_write,  	.llseek		= tracing_seek, -	.release	= tracing_release, +	.release	= tracing_snapshot_release,  }; -#endif /* CONFIG_TRACER_SNAPSHOT */ -struct ftrace_buffer_info { -	struct trace_array	*tr; -	void			*spare; -	int			cpu; -	unsigned int		read; +static const struct file_operations snapshot_raw_fops = { +	.open		= snapshot_raw_open, +	.read		= tracing_buffers_read, +	.release	= tracing_buffers_release, +	.splice_read	= tracing_buffers_splice_read, +	.llseek		= no_llseek,  }; +#endif /* CONFIG_TRACER_SNAPSHOT */ +  static int tracing_buffers_open(struct inode *inode, struct file *filp)  { -	int cpu = (int)(long)inode->i_private; +	struct trace_cpu *tc = 
inode->i_private; +	struct trace_array *tr = tc->tr;  	struct ftrace_buffer_info *info;  	if (tracing_disabled) @@ -4295,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)  	if (!info)  		return -ENOMEM; -	info->tr	= &global_trace; -	info->cpu	= cpu; -	info->spare	= NULL; +	mutex_lock(&trace_types_lock); + +	tr->ref++; + +	info->iter.tr		= tr; +	info->iter.cpu_file	= tc->cpu; +	info->iter.trace	= tr->current_trace; +	info->iter.trace_buffer = &tr->trace_buffer; +	info->spare		= NULL;  	/* Force reading ring buffer for first read */ -	info->read	= (unsigned int)-1; +	info->read		= (unsigned int)-1;  	filp->private_data = info; +	mutex_unlock(&trace_types_lock); +  	return nonseekable_open(inode, filp);  } +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ +	struct ftrace_buffer_info *info = filp->private_data; +	struct trace_iterator *iter = &info->iter; + +	return trace_poll(iter, filp, poll_table); +} +  static ssize_t  tracing_buffers_read(struct file *filp, char __user *ubuf,  		     size_t count, loff_t *ppos)  {  	struct ftrace_buffer_info *info = filp->private_data; +	struct trace_iterator *iter = &info->iter;  	ssize_t ret; -	size_t size; +	ssize_t size;  	if (!count)  		return 0; +	mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->tr->current_trace->use_max_tr) { +		size = -EBUSY; +		goto out_unlock; +	} +#endif +  	if (!info->spare) -		info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); +		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, +							  iter->cpu_file); +	size = -ENOMEM;  	if (!info->spare) -		return -ENOMEM; +		goto out_unlock;  	/* Do we have previous read data to read? 
*/  	if (info->read < PAGE_SIZE)  		goto read; -	trace_access_lock(info->cpu); -	ret = ring_buffer_read_page(info->tr->buffer, + again: +	trace_access_lock(iter->cpu_file); +	ret = ring_buffer_read_page(iter->trace_buffer->buffer,  				    &info->spare,  				    count, -				    info->cpu, 0); -	trace_access_unlock(info->cpu); -	if (ret < 0) -		return 0; +				    iter->cpu_file, 0); +	trace_access_unlock(iter->cpu_file); -	info->read = 0; +	if (ret < 0) { +		if (trace_empty(iter)) { +			if ((filp->f_flags & O_NONBLOCK)) { +				size = -EAGAIN; +				goto out_unlock; +			} +			mutex_unlock(&trace_types_lock); +			iter->trace->wait_pipe(iter); +			mutex_lock(&trace_types_lock); +			if (signal_pending(current)) { +				size = -EINTR; +				goto out_unlock; +			} +			goto again; +		} +		size = 0; +		goto out_unlock; +	} -read: +	info->read = 0; + read:  	size = PAGE_SIZE - info->read;  	if (size > count)  		size = count;  	ret = copy_to_user(ubuf, info->spare + info->read, size); -	if (ret == size) -		return -EFAULT; +	if (ret == size) { +		size = -EFAULT; +		goto out_unlock; +	}  	size -= ret;  	*ppos += size;  	info->read += size; + out_unlock: +	mutex_unlock(&trace_types_lock); +  	return size;  }  static int tracing_buffers_release(struct inode *inode, struct file *file)  {  	struct ftrace_buffer_info *info = file->private_data; +	struct trace_iterator *iter = &info->iter; + +	mutex_lock(&trace_types_lock); + +	WARN_ON(!iter->tr->ref); +	iter->tr->ref--;  	if (info->spare) -		ring_buffer_free_read_page(info->tr->buffer, info->spare); +		ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);  	kfree(info); +	mutex_unlock(&trace_types_lock); +  	return 0;  } @@ -4425,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			    unsigned int flags)  {  	struct ftrace_buffer_info *info = file->private_data; +	struct trace_iterator *iter = &info->iter;  	struct partial_page partial_def[PIPE_DEF_BUFFERS];  	struct page *pages_def[PIPE_DEF_BUFFERS];  	struct splice_pipe_desc spd = { @@ -4437,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  	};  	struct buffer_ref *ref;  	int entries, size, i; -	size_t ret; +	ssize_t ret; -	if (splice_grow_spd(pipe, &spd)) -		return -ENOMEM; +	mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->tr->current_trace->use_max_tr) { +		ret = -EBUSY; +		goto out; +	} +#endif + +	if (splice_grow_spd(pipe, &spd)) { +		ret = -ENOMEM; +		goto out; +	}  	if (*ppos & (PAGE_SIZE - 1)) {  		ret = -EINVAL; @@ -4455,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		len &= PAGE_MASK;  	} -	trace_access_lock(info->cpu); -	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + again: +	trace_access_lock(iter->cpu_file); +	entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);  	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {  		struct page *page; @@ -4467,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			break;  		ref->ref = 1; -		ref->buffer = info->tr->buffer; -		ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); +		ref->buffer = iter->trace_buffer->buffer; +		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);  		if (!ref->page) {  			kfree(ref);  			break;  		}  		r = ring_buffer_read_page(ref->buffer, &ref->page, -					  len, info->cpu, 1); +					  len, iter->cpu_file, 1);  		if (r < 0) {  			ring_buffer_free_read_page(ref->buffer, 
ref->page);  			kfree(ref); @@ -4499,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		spd.nr_pages++;  		*ppos += PAGE_SIZE; -		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); +		entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);  	} -	trace_access_unlock(info->cpu); +	trace_access_unlock(iter->cpu_file);  	spd.nr_pages = i;  	/* did we read anything? */  	if (!spd.nr_pages) { -		if (flags & SPLICE_F_NONBLOCK) +		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {  			ret = -EAGAIN; -		else -			ret = 0; -		/* TODO: block */ -		goto out; +			goto out; +		} +		mutex_unlock(&trace_types_lock); +		iter->trace->wait_pipe(iter); +		mutex_lock(&trace_types_lock); +		if (signal_pending(current)) { +			ret = -EINTR; +			goto out; +		} +		goto again;  	}  	ret = splice_to_pipe(pipe, &spd);  	splice_shrink_spd(&spd);  out: +	mutex_unlock(&trace_types_lock); +  	return ret;  }  static const struct file_operations tracing_buffers_fops = {  	.open		= tracing_buffers_open,  	.read		= tracing_buffers_read, +	.poll		= tracing_buffers_poll,  	.release	= tracing_buffers_release,  	.splice_read	= tracing_buffers_splice_read,  	.llseek		= no_llseek, @@ -4533,12 +5060,14 @@ static ssize_t  tracing_stats_read(struct file *filp, char __user *ubuf,  		   size_t count, loff_t *ppos)  { -	unsigned long cpu = (unsigned long)filp->private_data; -	struct trace_array *tr = &global_trace; +	struct trace_cpu *tc = filp->private_data; +	struct trace_array *tr = tc->tr; +	struct trace_buffer *trace_buf = &tr->trace_buffer;  	struct trace_seq *s;  	unsigned long cnt;  	unsigned long long t;  	unsigned long usec_rem; +	int cpu = tc->cpu;  	s = kmalloc(sizeof(*s), GFP_KERNEL);  	if (!s) @@ -4546,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,  	trace_seq_init(s); -	cnt = ring_buffer_entries_cpu(tr->buffer, cpu); +	cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "entries: %ld\n", cnt); -	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); +	cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "overrun: %ld\n", cnt); -	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); +	cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "commit overrun: %ld\n", cnt); -	cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); +	cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "bytes: %ld\n", cnt);  	if (trace_clocks[trace_clock_id].in_ns) {  		/* local or global for trace_clock */ -		t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); +		t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));  		usec_rem = do_div(t, USEC_PER_SEC);  		trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",  								t, usec_rem); -		t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); +		t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));  		usec_rem = do_div(t, USEC_PER_SEC);  		trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);  	} else {  		/* counter or tsc mode for trace_clock */  		trace_seq_printf(s, "oldest event ts: %llu\n", -				ring_buffer_oldest_event_ts(tr->buffer, cpu)); +				ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));  		trace_seq_printf(s, "now ts: %llu\n", -				ring_buffer_time_stamp(tr->buffer, cpu)); +				ring_buffer_time_stamp(trace_buf->buffer, cpu));  	} -	cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); +	cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);  	
trace_seq_printf(s, "dropped events: %ld\n", cnt); -	cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); +	cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "read events: %ld\n", cnt);  	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4632,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = {  	.read		= tracing_read_dyn_info,  	.llseek		= generic_file_llseek,  }; -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ -static struct dentry *d_tracer; +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ +	tracing_snapshot(); +} -struct dentry *tracing_init_dentry(void) +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ +	unsigned long *count = (long *)data; + +	if (!*count) +		return; + +	if (*count != -1) +		(*count)--; + +	tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, +		      struct ftrace_probe_ops *ops, void *data) +{ +	long count = (long)data; + +	seq_printf(m, "%ps:", (void *)ip); + +	seq_printf(m, "snapshot"); + +	if (count == -1) +		seq_printf(m, ":unlimited\n"); +	else +		seq_printf(m, ":count=%ld\n", count); + +	return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { +	.func			= ftrace_snapshot, +	.print			= ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { +	.func			= ftrace_count_snapshot, +	.print			= ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +			       char *glob, char *cmd, char *param, int enable)  { -	static int once; +	struct ftrace_probe_ops *ops; +	void *count = (void *)-1; +	char *number; +	int ret; -	if (d_tracer) -		return d_tracer; +	/* hash funcs only work with set_ftrace_filter */ +	if (!enable) +		return -EINVAL; + +	ops = param ? &snapshot_count_probe_ops :  &snapshot_probe_ops; + +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		return 0; +	} + +	if (!param) +		goto out_reg; + +	number = strsep(¶m, ":"); + +	if (!strlen(number)) +		goto out_reg; + +	/* +	 * We use the callback data field (which is a pointer) +	 * as our counter. +	 */ +	ret = kstrtoul(number, 0, (unsigned long *)&count); +	if (ret) +		return ret; + + out_reg: +	ret = register_ftrace_function_probe(glob, ops, count); + +	if (ret >= 0) +		alloc_snapshot(&global_trace); + +	return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { +	.name			= "snapshot", +	.func			= ftrace_trace_snapshot_callback, +}; + +static int register_snapshot_cmd(void) +{ +	return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ + +struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +{ +	if (tr->dir) +		return tr->dir;  	if (!debugfs_initialized())  		return NULL; -	d_tracer = debugfs_create_dir("tracing", NULL); +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		tr->dir = debugfs_create_dir("tracing", NULL); -	if (!d_tracer && !once) { -		once = 1; -		pr_warning("Could not create debugfs directory 'tracing'\n"); -		return NULL; -	} +	if (!tr->dir) +		pr_warn_once("Could not create debugfs directory 'tracing'\n"); -	return d_tracer; +	return tr->dir;  } -static struct dentry *d_percpu; +struct dentry *tracing_init_dentry(void) +{ +	return tracing_init_dentry_tr(&global_trace); +} -static struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)  { -	static int once;  	struct dentry *d_tracer; -	if (d_percpu) -		return d_percpu; - -	d_tracer = tracing_init_dentry(); +	if (tr->percpu_dir) +		return tr->percpu_dir; +	d_tracer = tracing_init_dentry_tr(tr);  	if (!d_tracer)  		return NULL; -	d_percpu = debugfs_create_dir("per_cpu", d_tracer); +	tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); -	if (!d_percpu && !once) { -		once = 1; -		pr_warning("Could not create debugfs directory 'per_cpu'\n"); -		return NULL; -	} +	WARN_ONCE(!tr->percpu_dir, +		  "Could not create debugfs directory 'per_cpu/%d'\n", cpu); -	return d_percpu; +	return tr->percpu_dir;  } -static void tracing_init_debugfs_percpu(long cpu) +static void +tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)  { -	struct dentry *d_percpu = tracing_dentry_percpu(); +	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); +	struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);  	struct dentry *d_cpu;  	char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -4701,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu)  	/* per cpu trace_pipe */  	trace_create_file("trace_pipe", 0444, d_cpu, -			(void *) cpu, &tracing_pipe_fops); +			(void *)&data->trace_cpu, &tracing_pipe_fops);  	/* per cpu trace */  	trace_create_file("trace", 0644, d_cpu, -			(void *) cpu, &tracing_fops); +			(void *)&data->trace_cpu, &tracing_fops);  	trace_create_file("trace_pipe_raw", 0444, d_cpu, -			(void *) cpu, &tracing_buffers_fops); +			(void *)&data->trace_cpu, &tracing_buffers_fops);  	trace_create_file("stats", 0444, d_cpu, -			(void *) cpu, &tracing_stats_fops); +			(void *)&data->trace_cpu, &tracing_stats_fops);  	trace_create_file("buffer_size_kb", 0444, d_cpu, -			(void *) cpu, &tracing_entries_fops); +			(void *)&data->trace_cpu, &tracing_entries_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT +	trace_create_file("snapshot", 0644, d_cpu, +			  (void *)&data->trace_cpu, &snapshot_fops); + +	trace_create_file("snapshot_raw", 0444, d_cpu, +			(void *)&data->trace_cpu, &snapshot_raw_fops); +#endif  }  #ifdef CONFIG_FTRACE_SELFTEST @@ -4725,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu)  struct trace_option_dentry {  	struct tracer_opt		*opt;  	struct tracer_flags		*flags; +	struct trace_array		*tr;  	struct dentry			*entry;  }; @@ -4760,7 +5399,7 @@ 
trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (!!(topt->flags->val & topt->opt->bit) != val) {  		mutex_lock(&trace_types_lock); -		ret = __set_tracer_option(current_trace, topt->flags, +		ret = __set_tracer_option(topt->tr->current_trace, topt->flags,  					  topt->opt, !val);  		mutex_unlock(&trace_types_lock);  		if (ret) @@ -4799,6 +5438,7 @@ static ssize_t  trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,  			 loff_t *ppos)  { +	struct trace_array *tr = &global_trace;  	long index = (long)filp->private_data;  	unsigned long val;  	int ret; @@ -4811,7 +5451,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,  		return -EINVAL;  	mutex_lock(&trace_types_lock); -	ret = set_tracer_flag(1 << index, val); +	ret = set_tracer_flag(tr, 1 << index, val);  	mutex_unlock(&trace_types_lock);  	if (ret < 0) @@ -4845,40 +5485,41 @@ struct dentry *trace_create_file(const char *name,  } -static struct dentry *trace_options_init_dentry(void) +static struct dentry *trace_options_init_dentry(struct trace_array *tr)  {  	struct dentry *d_tracer; -	static struct dentry *t_options; -	if (t_options) -		return t_options; +	if (tr->options) +		return tr->options; -	d_tracer = tracing_init_dentry(); +	d_tracer = tracing_init_dentry_tr(tr);  	if (!d_tracer)  		return NULL; -	t_options = debugfs_create_dir("options", d_tracer); -	if (!t_options) { +	tr->options = debugfs_create_dir("options", d_tracer); +	if (!tr->options) {  		pr_warning("Could not create debugfs directory 'options'\n");  		return NULL;  	} -	return t_options; +	return tr->options;  }  static void -create_trace_option_file(struct trace_option_dentry *topt, +create_trace_option_file(struct trace_array *tr, +			 struct trace_option_dentry *topt,  			 struct tracer_flags *flags,  			 struct tracer_opt *opt)  {  	struct dentry *t_options; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return;  	topt->flags = flags;  	topt->opt = opt; +	topt->tr = tr;  	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,  				    &trace_options_fops); @@ -4886,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt,  }  static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer)  {  	struct trace_option_dentry *topts;  	struct tracer_flags *flags; @@ -4911,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer)  		return NULL;  	for (cnt = 0; opts[cnt].name; cnt++) -		create_trace_option_file(&topts[cnt], flags, +		create_trace_option_file(tr, &topts[cnt], flags,  					 &opts[cnt]);  	return topts; @@ -4934,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)  }  static struct dentry * -create_trace_option_core_file(const char *option, long index) +create_trace_option_core_file(struct trace_array *tr, +			      const char *option, long index)  {  	struct dentry *t_options; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return NULL; @@ -4946,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index)  				    &trace_options_core_fops);  } -static __init void create_trace_options_dir(void) +static __init void create_trace_options_dir(struct trace_array *tr)  {  	struct dentry *t_options;  	int i; -	t_options = trace_options_init_dentry(); +	t_options = 
trace_options_init_dentry(tr);  	if (!t_options)  		return;  	for (i = 0; trace_options[i]; i++) -		create_trace_option_core_file(trace_options[i], i); +		create_trace_option_core_file(tr, trace_options[i], i);  }  static ssize_t @@ -4964,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,  	       size_t cnt, loff_t *ppos)  {  	struct trace_array *tr = filp->private_data; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	char buf[64];  	int r; @@ -4983,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  		size_t cnt, loff_t *ppos)  {  	struct trace_array *tr = filp->private_data; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	unsigned long val;  	int ret; @@ -4995,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  		mutex_lock(&trace_types_lock);  		if (val) {  			ring_buffer_record_on(buffer); -			if (current_trace->start) -				current_trace->start(tr); +			if (tr->current_trace->start) +				tr->current_trace->start(tr);  		} else {  			ring_buffer_record_off(buffer); -			if (current_trace->stop) -				current_trace->stop(tr); +			if (tr->current_trace->stop) +				tr->current_trace->stop(tr);  		}  		mutex_unlock(&trace_types_lock);  	} @@ -5017,23 +5659,310 @@ static const struct file_operations rb_simple_fops = {  	.llseek		= default_llseek,  }; +struct dentry *trace_instance_dir; + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); + +static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) +{ +	int cpu; + +	for_each_tracing_cpu(cpu) { +		memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); +		per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; +		per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; +	} +} + +static int +allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +{ +	enum ring_buffer_flags rb_flags; + +	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + +	buf->buffer = ring_buffer_alloc(size, rb_flags); +	if (!buf->buffer) +		return -ENOMEM; + +	buf->data = alloc_percpu(struct trace_array_cpu); +	if (!buf->data) { +		ring_buffer_free(buf->buffer); +		return -ENOMEM; +	} + +	init_trace_buffers(tr, buf); + +	/* Allocate the first page for all buffers */ +	set_buffer_entries(&tr->trace_buffer, +			   ring_buffer_size(tr->trace_buffer.buffer, 0)); + +	return 0; +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ +	int ret; + +	ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); +	if (ret) +		return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE +	ret = allocate_trace_buffer(tr, &tr->max_buffer, +				    allocate_snapshot ? size : 1); +	if (WARN_ON(ret)) { +		ring_buffer_free(tr->trace_buffer.buffer); +		free_percpu(tr->trace_buffer.data); +		return -ENOMEM; +	} +	tr->allocated_snapshot = allocate_snapshot; + +	/* +	 * Only the top level trace array gets its snapshot allocated +	 * from the kernel command line. 
+	 */ +	allocate_snapshot = false; +#endif +	return 0; +} + +static int new_instance_create(const char *name) +{ +	struct trace_array *tr; +	int ret; + +	mutex_lock(&trace_types_lock); + +	ret = -EEXIST; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) +			goto out_unlock; +	} + +	ret = -ENOMEM; +	tr = kzalloc(sizeof(*tr), GFP_KERNEL); +	if (!tr) +		goto out_unlock; + +	tr->name = kstrdup(name, GFP_KERNEL); +	if (!tr->name) +		goto out_free_tr; + +	raw_spin_lock_init(&tr->start_lock); + +	tr->current_trace = &nop_trace; + +	INIT_LIST_HEAD(&tr->systems); +	INIT_LIST_HEAD(&tr->events); + +	if (allocate_trace_buffers(tr, trace_buf_size) < 0) +		goto out_free_tr; + +	/* Holder for file callbacks */ +	tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; +	tr->trace_cpu.tr = tr; + +	tr->dir = debugfs_create_dir(name, trace_instance_dir); +	if (!tr->dir) +		goto out_free_tr; + +	ret = event_trace_add_tracer(tr->dir, tr); +	if (ret) +		goto out_free_tr; + +	init_tracer_debugfs(tr, tr->dir); + +	list_add(&tr->list, &ftrace_trace_arrays); + +	mutex_unlock(&trace_types_lock); + +	return 0; + + out_free_tr: +	if (tr->trace_buffer.buffer) +		ring_buffer_free(tr->trace_buffer.buffer); +	kfree(tr->name); +	kfree(tr); + + out_unlock: +	mutex_unlock(&trace_types_lock); + +	return ret; + +} + +static int instance_delete(const char *name) +{ +	struct trace_array *tr; +	int found = 0; +	int ret; + +	mutex_lock(&trace_types_lock); + +	ret = -ENODEV; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) { +			found = 1; +			break; +		} +	} +	if (!found) +		goto out_unlock; + +	ret = -EBUSY; +	if (tr->ref) +		goto out_unlock; + +	list_del(&tr->list); + +	event_trace_del_tracer(tr); +	debugfs_remove_recursive(tr->dir); +	free_percpu(tr->trace_buffer.data); +	ring_buffer_free(tr->trace_buffer.buffer); + +	kfree(tr->name); +	kfree(tr); + +	ret = 0; + + out_unlock: +	mutex_unlock(&trace_types_lock); + +	return ret; +} + +static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +{ +	struct dentry *parent; +	int ret; + +	/* Paranoid: Make sure the parent is the "instances" directory */ +	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); +	if (WARN_ON_ONCE(parent != trace_instance_dir)) +		return -ENOENT; + +	/* +	 * The inode mutex is locked, but debugfs_create_dir() will also +	 * take the mutex. As the instances directory can not be destroyed +	 * or changed in any other way, it is safe to unlock it, and +	 * let the dentry try. If two users try to make the same dir at +	 * the same time, then the new_instance_create() will determine the +	 * winner. +	 */ +	mutex_unlock(&inode->i_mutex); + +	ret = new_instance_create(dentry->d_iname); + +	mutex_lock(&inode->i_mutex); + +	return ret; +} + +static int instance_rmdir(struct inode *inode, struct dentry *dentry) +{ +	struct dentry *parent; +	int ret; + +	/* Paranoid: Make sure the parent is the "instances" directory */ +	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); +	if (WARN_ON_ONCE(parent != trace_instance_dir)) +		return -ENOENT; + +	/* The caller did a dget() on dentry */ +	mutex_unlock(&dentry->d_inode->i_mutex); + +	/* +	 * The inode mutex is locked, but debugfs_create_dir() will also +	 * take the mutex. As the instances directory can not be destroyed +	 * or changed in any other way, it is safe to unlock it, and +	 * let the dentry try. 
If two users try to make the same dir at +	 * the same time, then the instance_delete() will determine the +	 * winner. +	 */ +	mutex_unlock(&inode->i_mutex); + +	ret = instance_delete(dentry->d_iname); + +	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); +	mutex_lock(&dentry->d_inode->i_mutex); + +	return ret; +} + +static const struct inode_operations instance_dir_inode_operations = { +	.lookup		= simple_lookup, +	.mkdir		= instance_mkdir, +	.rmdir		= instance_rmdir, +}; + +static __init void create_trace_instances(struct dentry *d_tracer) +{ +	trace_instance_dir = debugfs_create_dir("instances", d_tracer); +	if (WARN_ON(!trace_instance_dir)) +		return; + +	/* Hijack the dir inode operations, to allow mkdir */ +	trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; +} + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +{ +	int cpu; + +	trace_create_file("trace_options", 0644, d_tracer, +			  tr, &tracing_iter_fops); + +	trace_create_file("trace", 0644, d_tracer, +			(void *)&tr->trace_cpu, &tracing_fops); + +	trace_create_file("trace_pipe", 0444, d_tracer, +			(void *)&tr->trace_cpu, &tracing_pipe_fops); + +	trace_create_file("buffer_size_kb", 0644, d_tracer, +			(void *)&tr->trace_cpu, &tracing_entries_fops); + +	trace_create_file("buffer_total_size_kb", 0444, d_tracer, +			  tr, &tracing_total_entries_fops); + +	trace_create_file("free_buffer", 0644, d_tracer, +			  tr, &tracing_free_buffer_fops); + +	trace_create_file("trace_marker", 0220, d_tracer, +			  tr, &tracing_mark_fops); + +	trace_create_file("trace_clock", 0644, d_tracer, tr, +			  &trace_clock_fops); + +	trace_create_file("tracing_on", 0644, d_tracer, +			    tr, &rb_simple_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT +	trace_create_file("snapshot", 0644, d_tracer, +			  (void *)&tr->trace_cpu, &snapshot_fops); +#endif + +	for_each_tracing_cpu(cpu) +		tracing_init_debugfs_percpu(tr, cpu); + +} +  static __init int tracer_init_debugfs(void)  {  	struct dentry *d_tracer; -	int cpu;  	trace_access_lock_init();  	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0; -	trace_create_file("trace_options", 0644, d_tracer, -			NULL, &tracing_iter_fops); +	init_tracer_debugfs(&global_trace, d_tracer);  	trace_create_file("tracing_cpumask", 0644, d_tracer, -			NULL, &tracing_cpumask_fops); - -	trace_create_file("trace", 0644, d_tracer, -			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops); +			&global_trace, &tracing_cpumask_fops);  	trace_create_file("available_tracers", 0444, d_tracer,  			&global_trace, &show_traces_fops); @@ -5052,44 +5981,17 @@ static __init int tracer_init_debugfs(void)  	trace_create_file("README", 0444, d_tracer,  			NULL, &tracing_readme_fops); -	trace_create_file("trace_pipe", 0444, d_tracer, -			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - -	trace_create_file("buffer_size_kb", 0644, d_tracer, -			(void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); - -	trace_create_file("buffer_total_size_kb", 0444, d_tracer, -			&global_trace, &tracing_total_entries_fops); - -	trace_create_file("free_buffer", 0644, d_tracer, -			&global_trace, &tracing_free_buffer_fops); - -	trace_create_file("trace_marker", 0220, d_tracer, -			NULL, &tracing_mark_fops); -  	trace_create_file("saved_cmdlines", 0444, d_tracer,  			NULL, &tracing_saved_cmdlines_fops); -	trace_create_file("trace_clock", 0644, d_tracer, NULL, -			  &trace_clock_fops); - -	trace_create_file("tracing_on", 0644, d_tracer, -			    &global_trace, &rb_simple_fops); -  #ifdef CONFIG_DYNAMIC_FTRACE  	
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,  			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);  #endif -#ifdef CONFIG_TRACER_SNAPSHOT -	trace_create_file("snapshot", 0644, d_tracer, -			  (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); -#endif +	create_trace_instances(d_tracer); -	create_trace_options_dir(); - -	for_each_tracing_cpu(cpu) -		tracing_init_debugfs_percpu(cpu); +	create_trace_options_dir(&global_trace);  	return 0;  } @@ -5145,8 +6047,8 @@ void  trace_printk_seq(struct trace_seq *s)  {  	/* Probably should print a warning here. */ -	if (s->len >= 1000) -		s->len = 1000; +	if (s->len >= TRACE_MAX_PRINT) +		s->len = TRACE_MAX_PRINT;  	/* should be zero ended, but we are paranoid. */  	s->buffer[s->len] = 0; @@ -5159,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s)  void trace_init_global_iter(struct trace_iterator *iter)  {  	iter->tr = &global_trace; -	iter->trace = current_trace; -	iter->cpu_file = TRACE_PIPE_ALL_CPU; +	iter->trace = iter->tr->current_trace; +	iter->cpu_file = RING_BUFFER_ALL_CPUS; +	iter->trace_buffer = &global_trace.trace_buffer;  } -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)  { -	static arch_spinlock_t ftrace_dump_lock = -		(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  	/* use static because iter can be a bit big for the stack */  	static struct trace_iterator iter; +	static atomic_t dump_running;  	unsigned int old_userobj; -	static int dump_ran;  	unsigned long flags;  	int cnt = 0, cpu; -	/* only one dump */ -	local_irq_save(flags); -	arch_spin_lock(&ftrace_dump_lock); -	if (dump_ran) -		goto out; - -	dump_ran = 1; +	/* Only allow one dump user at a time. */ +	if (atomic_inc_return(&dump_running) != 1) { +		atomic_dec(&dump_running); +		return; +	} +	/* +	 * Always turn off tracing when we dump. +	 * We don't need to show trace output of what happens +	 * between multiple crashes. +	 * +	 * If the user does a sysrq-z, then they can re-enable +	 * tracing with echo 1 > tracing_on. +	 */  	tracing_off(); -	/* Did function tracer already get disabled? */ -	if (ftrace_is_dead()) { -		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); -		printk("#          MAY BE MISSING FUNCTION EVENTS\n"); -	} - -	if (disable_tracing) -		ftrace_kill(); +	local_irq_save(flags);  	/* Simulate the iterator */  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { -		atomic_inc(&iter.tr->data[cpu]->disabled); +		atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);  	}  	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -5208,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  	switch (oops_dump_mode) {  	case DUMP_ALL: -		iter.cpu_file = TRACE_PIPE_ALL_CPU; +		iter.cpu_file = RING_BUFFER_ALL_CPUS;  		break;  	case DUMP_ORIG:  		iter.cpu_file = raw_smp_processor_id(); @@ -5217,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  		goto out_enable;  	default:  		printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); -		iter.cpu_file = TRACE_PIPE_ALL_CPU; +		iter.cpu_file = RING_BUFFER_ALL_CPUS;  	}  	printk(KERN_TRACE "Dumping ftrace buffer:\n"); +	/* Did function tracer already get disabled? */ +	if (ftrace_is_dead()) { +		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); +		printk("#          MAY BE MISSING FUNCTION EVENTS\n"); +	} +  	/*  	 * We need to stop all tracing on all CPUS to read the  	 * the next buffer. 
This is a bit expensive, but is @@ -5261,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  		printk(KERN_TRACE "---------------------------------\n");   out_enable: -	/* Re-enable tracing if requested */ -	if (!disable_tracing) { -		trace_flags |= old_userobj; +	trace_flags |= old_userobj; -		for_each_tracing_cpu(cpu) { -			atomic_dec(&iter.tr->data[cpu]->disabled); -		} -		tracing_on(); +	for_each_tracing_cpu(cpu) { +		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	} - - out: -	arch_spin_unlock(&ftrace_dump_lock); + 	atomic_dec(&dump_running);  	local_irq_restore(flags);  } - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ -	__ftrace_dump(true, oops_dump_mode); -}  EXPORT_SYMBOL_GPL(ftrace_dump);  __init static int tracer_alloc_buffers(void)  {  	int ring_buf_size; -	enum ring_buffer_flags rb_flags; -	int i;  	int ret = -ENOMEM; @@ -5308,49 +6199,27 @@ __init static int tracer_alloc_buffers(void)  	else  		ring_buf_size = 1; -	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; -  	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);  	cpumask_copy(tracing_cpumask, cpu_all_mask); +	raw_spin_lock_init(&global_trace.start_lock); +  	/* TODO: make the number of buffers hot pluggable with CPUS */ -	global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); -	if (!global_trace.buffer) { +	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {  		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");  		WARN_ON(1);  		goto out_free_cpumask;  	} +  	if (global_trace.buffer_disabled)  		tracing_off(); - -#ifdef CONFIG_TRACER_MAX_TRACE -	max_tr.buffer = ring_buffer_alloc(1, rb_flags); -	if (!max_tr.buffer) { -		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); -		WARN_ON(1); -		ring_buffer_free(global_trace.buffer); -		goto out_free_cpumask; -	} -#endif - -	/* Allocate the first page for all buffers */ -	for_each_tracing_cpu(i) { -		global_trace.data[i] = &per_cpu(global_trace_cpu, i); -		max_tr.data[i] = &per_cpu(max_tr_data, i); -	} - -	set_buffer_entries(&global_trace, -			   ring_buffer_size(global_trace.buffer, 0)); -#ifdef CONFIG_TRACER_MAX_TRACE -	set_buffer_entries(&max_tr, 1); -#endif -  	trace_init_cmdlines(); -	init_irq_work(&trace_work_wakeup, trace_wake_up);  	register_tracer(&nop_trace); +	global_trace.current_trace = &nop_trace; +  	/* All seems OK, enable tracing */  	tracing_disabled = 0; @@ -5359,16 +6228,32 @@ __init static int tracer_alloc_buffers(void)  	register_die_notifier(&trace_die_notifier); +	global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + +	/* Holder for file callbacks */ +	global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; +	global_trace.trace_cpu.tr = &global_trace; + +	INIT_LIST_HEAD(&global_trace.systems); +	INIT_LIST_HEAD(&global_trace.events); +	list_add(&global_trace.list, &ftrace_trace_arrays); +  	while (trace_boot_options) {  		char *option;  		option = strsep(&trace_boot_options, ","); -		trace_set_options(option); +		trace_set_options(&global_trace, option);  	} +	register_snapshot_cmd(); +  	return 0;  out_free_cpumask: +	free_percpu(global_trace.trace_buffer.data); +#ifdef CONFIG_TRACER_MAX_TRACE +	free_percpu(global_trace.max_buffer.data); +#endif  	free_cpumask_var(tracing_cpumask);  out_free_buffer_mask:  	free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2081971367e..711ca7d3e7f 100644 --- a/kernel/trace/trace.h +++ 
b/kernel/trace/trace.h @@ -13,6 +13,11 @@  #include <linux/trace_seq.h>  #include <linux/ftrace_event.h> +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h>		/* For NR_SYSCALLS	     */ +#include <asm/syscall.h>	/* some archs define it here */ +#endif +  enum trace_type {  	__TRACE_FIRST_TYPE = 0, @@ -29,6 +34,7 @@ enum trace_type {  	TRACE_GRAPH_ENT,  	TRACE_USER_STACK,  	TRACE_BLK, +	TRACE_BPUTS,  	__TRACE_LAST_TYPE,  }; @@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head {  	unsigned long		ret_ip;  }; -struct uprobe_trace_entry_head { -	struct trace_entry	ent; -	unsigned long		ip; -}; -  /*   * trace_flag_type is an enumeration that holds different   * states when a trace occurs. These are: @@ -127,12 +128,21 @@ enum trace_flag_type {  #define TRACE_BUF_SIZE		1024 +struct trace_array; + +struct trace_cpu { +	struct trace_array	*tr; +	struct dentry		*dir; +	int			cpu; +}; +  /*   * The CPU trace array - it consists of thousands of trace entries   * plus some other descriptor data: (for example which task started   * the trace, etc.)   */  struct trace_array_cpu { +	struct trace_cpu	trace_cpu;  	atomic_t		disabled;  	void			*buffer_page;	/* ring buffer spare */ @@ -151,20 +161,83 @@ struct trace_array_cpu {  	char			comm[TASK_COMM_LEN];  }; +struct tracer; + +struct trace_buffer { +	struct trace_array		*tr; +	struct ring_buffer		*buffer; +	struct trace_array_cpu __percpu	*data; +	cycle_t				time_start; +	int				cpu; +}; +  /*   * The trace array - an array of per-CPU trace arrays. This is the   * highest level data structure that individual tracers deal with.   * They have on/off state as well:   */  struct trace_array { -	struct ring_buffer	*buffer; -	int			cpu; +	struct list_head	list; +	char			*name; +	struct trace_buffer	trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	/* +	 * The max_buffer is used to snapshot the trace when a maximum +	 * latency is reached, or when the user initiates a snapshot. +	 * Some tracers will use this to store a maximum trace while +	 * it continues examining live traces. +	 * +	 * The buffers for the max_buffer are set up the same as the trace_buffer +	 * When a snapshot is taken, the buffer of the max_buffer is swapped +	 * with the buffer of the trace_buffer and the buffers are reset for +	 * the trace_buffer so the tracing can continue. +	 */ +	struct trace_buffer	max_buffer; +	bool			allocated_snapshot; +#endif  	int			buffer_disabled; -	cycle_t			time_start; +	struct trace_cpu	trace_cpu;	/* place holder */ +#ifdef CONFIG_FTRACE_SYSCALLS +	int			sys_refcount_enter; +	int			sys_refcount_exit; +	DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +	DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +#endif +	int			stop_count; +	int			clock_id; +	struct tracer		*current_trace; +	unsigned int		flags; +	raw_spinlock_t		start_lock; +	struct dentry		*dir; +	struct dentry		*options; +	struct dentry		*percpu_dir; +	struct dentry		*event_dir; +	struct list_head	systems; +	struct list_head	events;  	struct task_struct	*waiter; -	struct trace_array_cpu	*data[NR_CPUS]; +	int			ref; +}; + +enum { +	TRACE_ARRAY_FL_GLOBAL	= (1 << 0)  }; +extern struct list_head ftrace_trace_arrays; + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. 
+ */ +static inline struct trace_array *top_trace_array(void) +{ +	struct trace_array *tr; + +	tr = list_entry(ftrace_trace_arrays.prev, +			typeof(*tr), list); +	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); +	return tr; +} +  #define FTRACE_CMP_TYPE(var, type) \  	__builtin_types_compatible_p(typeof(var), type *) @@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void);  		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\  		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\  		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\ +		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\  			  TRACE_MMIO_RW);				\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\ @@ -289,9 +363,10 @@ struct tracer {  	struct tracer		*next;  	struct tracer_flags	*flags;  	bool			print_max; -	bool			use_max_tr; -	bool			allocated_snapshot;  	bool			enabled; +#ifdef CONFIG_TRACER_MAX_TRACE +	bool			use_max_tr; +#endif  }; @@ -427,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit)  	current->trace_recursion = val;  } -#define TRACE_PIPE_ALL_CPU	-1 -  static inline struct ring_buffer_iter *  trace_buffer_iter(struct trace_iterator *iter, int cpu)  { @@ -439,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)  int tracer_init(struct tracer *t, struct trace_array *tr);  int tracing_is_enabled(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf);  void tracing_reset_current(int cpu); -void tracing_reset_current_online_cpus(void); +void tracing_reset_all_online_cpus(void);  int tracing_open_generic(struct inode *inode, struct file *filp);  struct dentry *trace_create_file(const char *name,  				 umode_t mode, @@ -450,6 +523,7 @@ struct dentry *trace_create_file(const char *name,  				 void *data,  				 const struct file_operations *fops); +struct dentry *tracing_init_dentry_tr(struct trace_array *tr);  struct dentry *tracing_init_dentry(void);  struct ring_buffer_event; @@ -583,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void);  #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2  extern int DYN_FTRACE_TEST_NAME2(void); -extern int ring_buffer_expanded; +extern bool ring_buffer_expanded;  extern bool tracing_selftest_disabled;  DECLARE_PER_CPU(int, ftrace_cpu_disabled); @@ -619,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr,  		    unsigned long ip, const char *fmt, va_list args);  int trace_array_printk(struct trace_array *tr,  		       unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, +			   unsigned long ip, const char *fmt, ...);  void trace_printk_seq(struct trace_seq *s);  enum print_line_t print_trace_line(struct trace_iterator *iter); @@ -786,6 +862,7 @@ enum trace_iterator_flags {  	TRACE_ITER_STOP_ON_FREE		= 0x400000,  	TRACE_ITER_IRQ_INFO		= 0x800000,  	TRACE_ITER_MARKERS		= 0x1000000, +	TRACE_ITER_FUNCTION		= 0x2000000,  };  /* @@ -832,8 +909,8 @@ enum {  struct ftrace_event_field {  	struct list_head	link; -	char			*name; -	char			*type; +	const char		*name; +	const char		*type;  	int			filter_type;  	int			offset;  	int			size; @@ -851,12 +928,19 @@ struct event_filter {  struct event_subsystem {  	struct list_head	list;  	const char		*name; -	struct dentry		*entry;  	struct event_filter	*filter; -	int			nr_events;  	int			ref_count;  }; 
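One detail worth spelling out about the new trace_array list above: new_instance_create() and tracer_alloc_buffers() both add arrays with list_add(), i.e. at the head of ftrace_trace_arrays, so the global array registered at boot ends up at the tail; that is why top_trace_array() reads ftrace_trace_arrays.prev rather than .next. A minimal sketch, not part of this patch, of walking the list with the declarations above, assuming the caller holds trace_types_lock as the code in trace.c does:

	/* Hypothetical sketch only: iterates newest instance first, global array last. */
	static void sketch_list_trace_arrays(void)
	{
		struct trace_array *tr;

		list_for_each_entry(tr, &ftrace_trace_arrays, list)
			pr_info("trace array: %s\n", tr->name ? tr->name : "(top)");
	}

The pr_info() line is only there to show that instances are told apart by tr->name, which is NULL for the top-level array.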
+struct ftrace_subsystem_dir { +	struct list_head		list; +	struct event_subsystem		*subsystem; +	struct trace_array		*tr; +	struct dentry			*entry; +	int				ref_count; +	int				nr_events; +}; +  #define FILTER_PRED_INVALID	((unsigned short)-1)  #define FILTER_PRED_IS_RIGHT	(1 << 15)  #define FILTER_PRED_FOLD	(1 << 15) @@ -906,22 +990,20 @@ struct filter_pred {  	unsigned short		right;  }; -extern struct list_head ftrace_common_fields; -  extern enum regex_type  filter_parse_regex(char *buff, int len, char **search, int *not);  extern void print_event_filter(struct ftrace_event_call *call,  			       struct trace_seq *s);  extern int apply_event_filter(struct ftrace_event_call *call,  			      char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  					char *filter_string);  extern void print_subsystem_event_filter(struct event_subsystem *system,  					 struct trace_seq *s);  extern int filter_assign_type(const char *type); -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call); +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name);  static inline int  filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -938,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,  }  extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr);  extern struct mutex event_mutex;  extern struct list_head ftrace_events; @@ -948,7 +1032,18 @@ extern const char *__stop___trace_bprintk_fmt[];  void trace_printk_init_buffers(void);  void trace_printk_start_comm(void);  int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); -int set_tracer_flag(unsigned int mask, int enabled); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. 
+ */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))  #undef FTRACE_ENTRY  #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 95e96842ed2..d594da0dc03 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  {  	struct ftrace_event_call *call = &event_branch;  	struct trace_array *tr = branch_tracer; +	struct trace_array_cpu *data;  	struct ring_buffer_event *event;  	struct trace_branch *entry;  	struct ring_buffer *buffer; @@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) +	data = per_cpu_ptr(tr->trace_buffer.data, cpu); +	if (atomic_inc_return(&data->disabled) != 1)  		goto out;  	pc = preempt_count(); -	buffer = tr->buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,  					  sizeof(*entry), flags, pc);  	if (!event) @@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  		__buffer_unlock_commit(buffer, event);   out: -	atomic_dec(&tr->data[cpu]->disabled); +	atomic_dec(&data->disabled);  	local_irq_restore(flags);  } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aa8f5f48dae..26dc348332b 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -57,6 +57,16 @@ u64 notrace trace_clock(void)  	return local_clock();  } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + */ +u64 notrace trace_clock_jiffies(void) +{ +	u64 jiffy = jiffies - INITIAL_JIFFIES; + +	/* Return nsecs */ +	return (u64)jiffies_to_usecs(jiffy) * 1000ULL; +}  /*   * trace_clock_global(): special globally coherent trace clock diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4108e1250ca..e2d027ac66a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,  		__dynamic_array(	u32,	buf	)  	), -	F_printk("%08lx fmt:%p", -		 __entry->ip, __entry->fmt), +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->fmt),  	FILTER_OTHER  ); @@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,  		__dynamic_array(	char,	buf	)  	), -	F_printk("%08lx %s", -		 __entry->ip, __entry->buf), +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->buf), + +	FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + +	TRACE_BPUTS, + +	F_STRUCT( +		__field(	unsigned long,	ip	) +		__field(	const char *,	str	) +	), + +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->str),  	FILTER_OTHER  ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e9b284250..7a0cf68027c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];  EXPORT_SYMBOL_GPL(event_storage);  LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); +static LIST_HEAD(ftrace_common_fields); -struct list_head * +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file)			\ +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ +		list_for_each_entry(file, &tr->events, list) 
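The do_for_each_event_file() macro just above (its _safe variant and the while_for_each_event_file() terminator follow immediately below) opens a double loop over every registered trace array and each array's event files; the braces are intentionally unbalanced across the pair, which is why its comment forbids break. A hypothetical example, not taken from this patch, of the intended usage pattern (the same shape trace_event_enable_cmd_record() uses further down):

	/* Hypothetical sketch only: disable every event file in every instance. */
	static void sketch_disable_every_event_file(void)
	{
		struct trace_array *tr;
		struct ftrace_event_file *file;

		mutex_lock(&event_mutex);
		do_for_each_event_file(tr, file) {
			/* file belongs to the instance tr */
			ftrace_event_enable_disable(file, 0);
		} while_for_each_event_file();
		mutex_unlock(&event_mutex);
	}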
+ +#define do_for_each_event_file_safe(tr, file)			\ +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ +		struct ftrace_event_file *___n;				\ +		list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file()		\ +	} + +static struct list_head *  trace_get_fields(struct ftrace_event_call *event_call)  {  	if (!event_call->class->get_fields) @@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)  	return event_call->class->get_fields(event_call);  } +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ +	struct ftrace_event_field *field; + +	list_for_each_entry(field, head, link) { +		if (!strcmp(field->name, name)) +			return field; +	} + +	return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ +	struct ftrace_event_field *field; +	struct list_head *head; + +	field = __find_event_field(&ftrace_common_fields, name); +	if (field) +		return field; + +	head = trace_get_fields(call); +	return __find_event_field(head, name); +} +  static int __trace_define_field(struct list_head *head, const char *type,  				const char *name, int offset, int size,  				int is_signed, int filter_type)  {  	struct ftrace_event_field *field; -	field = kzalloc(sizeof(*field), GFP_KERNEL); +	field = kmem_cache_alloc(field_cachep, GFP_TRACE);  	if (!field)  		goto err; -	field->name = kstrdup(name, GFP_KERNEL); -	if (!field->name) -		goto err; - -	field->type = kstrdup(type, GFP_KERNEL); -	if (!field->type) -		goto err; +	field->name = name; +	field->type = type;  	if (filter_type == FILTER_OTHER)  		field->filter_type = filter_assign_type(type); @@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,  	return 0;  err: -	if (field) -		kfree(field->name); -	kfree(field); +	kmem_cache_free(field_cachep, field);  	return -ENOMEM;  } @@ -120,7 +158,7 @@ static int trace_define_common_fields(void)  	return ret;  } -void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct ftrace_event_call *call)  {  	struct ftrace_event_field *field, *next;  	struct list_head *head; @@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)  	head = trace_get_fields(call);  	list_for_each_entry_safe(field, next, head, link) {  		list_del(&field->link); -		kfree(field->type); -		kfree(field->name); -		kfree(field); +		kmem_cache_free(field_cachep, field);  	}  } @@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);  int ftrace_event_reg(struct ftrace_event_call *call,  		     enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER:  		return tracepoint_probe_register(call->name,  						 call->class->probe, -						 call); +						 file);  	case TRACE_REG_UNREGISTER:  		tracepoint_probe_unregister(call->name,  					    call->class->probe, -					    call); +					    file);  		return 0;  #ifdef CONFIG_PERF_EVENTS @@ -183,54 +221,106 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);  void trace_event_enable_cmd_record(bool enable)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) +	do_for_each_event_file(tr, file) { + +		if (!(file->flags & FTRACE_EVENT_FL_ENABLED))  			continue;  		if (enable) {  			tracing_start_cmdline_record(); -			call->flags |= 
TRACE_EVENT_FL_RECORDED_CMD; +			set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  		} else {  			tracing_stop_cmdline_record(); -			call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; +			clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  		} -	} +	} while_for_each_event_file();  	mutex_unlock(&event_mutex);  } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, -					int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, +					 int enable, int soft_disable)  { +	struct ftrace_event_call *call = file->event_call;  	int ret = 0; +	int disable;  	switch (enable) {  	case 0: -		if (call->flags & TRACE_EVENT_FL_ENABLED) { -			call->flags &= ~TRACE_EVENT_FL_ENABLED; -			if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { +		/* +		 * When soft_disable is set and enable is cleared, the sm_ref +		 * reference counter is decremented. If it reaches 0, we want +		 * to clear the SOFT_DISABLED flag but leave the event in the +		 * state that it was. That is, if the event was enabled and +		 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED +		 * is set we do not want the event to be enabled before we +		 * clear the bit. +		 * +		 * When soft_disable is not set but the SOFT_MODE flag is, +		 * we do nothing. Do not disable the tracepoint, otherwise +		 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. +		 */ +		if (soft_disable) { +			if (atomic_dec_return(&file->sm_ref) > 0) +				break; +			disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; +			clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); +		} else +			disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + +		if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { +			clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); +			if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {  				tracing_stop_cmdline_record(); -				call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; +				clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  			} -			call->class->reg(call, TRACE_REG_UNREGISTER, NULL); +			call->class->reg(call, TRACE_REG_UNREGISTER, file);  		} +		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ +		if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) +			set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);  		break;  	case 1: -		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { +		/* +		 * When soft_disable is set and enable is set, we want to +		 * register the tracepoint for the event, but leave the event +		 * as is. That means, if the event was already enabled, we do +		 * nothing (but set SOFT_MODE). If the event is disabled, we +		 * set SOFT_DISABLED before enabling the event tracepoint, so +		 * it still seems to be disabled. +		 */ +		if (!soft_disable) +			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +		else { +			if (atomic_inc_return(&file->sm_ref) > 1) +				break; +			set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); +		} + +		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + +			/* Keep the event disabled, when going to SOFT_MODE. 
*/ +			if (soft_disable) +				set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +  			if (trace_flags & TRACE_ITER_RECORD_CMD) {  				tracing_start_cmdline_record(); -				call->flags |= TRACE_EVENT_FL_RECORDED_CMD; +				set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  			} -			ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); +			ret = call->class->reg(call, TRACE_REG_REGISTER, file);  			if (ret) {  				tracing_stop_cmdline_record();  				pr_info("event trace: Could not enable event "  					"%s\n", call->name);  				break;  			} -			call->flags |= TRACE_EVENT_FL_ENABLED; +			set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + +			/* WAS_ENABLED gets set but never cleared. */ +			call->flags |= TRACE_EVENT_FL_WAS_ENABLED;  		}  		break;  	} @@ -238,13 +328,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,  	return ret;  } -static void ftrace_clear_events(void) +static int ftrace_event_enable_disable(struct ftrace_event_file *file, +				       int enable)  { -	struct ftrace_event_call *call; +	return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ +	struct ftrace_event_file *file;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		ftrace_event_enable_disable(call, 0); +	list_for_each_entry(file, &tr->events, list) { +		ftrace_event_enable_disable(file, 0);  	}  	mutex_unlock(&event_mutex);  } @@ -257,11 +353,12 @@ static void __put_system(struct event_subsystem *system)  	if (--system->ref_count)  		return; +	list_del(&system->list); +  	if (filter) {  		kfree(filter->filter_string);  		kfree(filter);  	} -	kfree(system->name);  	kfree(system);  } @@ -271,24 +368,45 @@ static void __get_system(struct event_subsystem *system)  	system->ref_count++;  } -static void put_system(struct event_subsystem *system) +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ +	WARN_ON_ONCE(dir->ref_count == 0); +	dir->ref_count++; +	__get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ +	WARN_ON_ONCE(dir->ref_count == 0); +	/* If the subsystem is about to be freed, the dir must be too */ +	WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + +	__put_system(dir->subsystem); +	if (!--dir->ref_count) +		kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir)  {  	mutex_lock(&event_mutex); -	__put_system(system); +	__put_system_dir(dir);  	mutex_unlock(&event_mutex);  }  /*   * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.   
*/ -static int __ftrace_set_clr_event(const char *match, const char *sub, -				  const char *event, int set) +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, +				  const char *sub, const char *event, int set)  { +	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	int ret = -EINVAL;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call;  		if (!call->name || !call->class || !call->class->reg)  			continue; @@ -307,7 +425,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,  		if (event && strcmp(event, call->name) != 0)  			continue; -		ftrace_event_enable_disable(call, set); +		ftrace_event_enable_disable(file, set);  		ret = 0;  	} @@ -316,7 +434,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,  	return ret;  } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)  {  	char *event = NULL, *sub = NULL, *match; @@ -344,7 +462,7 @@ static int ftrace_set_clr_event(char *buf, int set)  			event = NULL;  	} -	return __ftrace_set_clr_event(match, sub, event, set); +	return __ftrace_set_clr_event(tr, match, sub, event, set);  }  /** @@ -361,7 +479,9 @@ static int ftrace_set_clr_event(char *buf, int set)   */  int trace_set_clr_event(const char *system, const char *event, int set)  { -	return __ftrace_set_clr_event(NULL, system, event, set); +	struct trace_array *tr = top_trace_array(); + +	return __ftrace_set_clr_event(tr, NULL, system, event, set);  }  EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -373,6 +493,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  {  	struct trace_parser parser; +	struct seq_file *m = file->private_data; +	struct trace_array *tr = m->private;  	ssize_t read, ret;  	if (!cnt) @@ -395,7 +517,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  		parser.buffer[parser.idx] = 0; -		ret = ftrace_set_clr_event(parser.buffer + !set, set); +		ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);  		if (ret)  			goto out_put;  	} @@ -411,17 +533,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  static void *  t_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct ftrace_event_call *call; +	struct trace_array *tr = m->private;  	(*pos)++; -	list_for_each_entry_continue(call, &ftrace_events, list) { +	list_for_each_entry_continue(file, &tr->events, list) { +		call = file->event_call;  		/*  		 * The ftrace subsystem is for showing formats only.  		 * They can not be enabled or disabled via the event files.  		 
*/  		if (call->class && call->class->reg) -			return call; +			return file;  	}  	return NULL; @@ -429,30 +554,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  static void *t_start(struct seq_file *m, loff_t *pos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = m->private;  	loff_t l;  	mutex_lock(&event_mutex); -	call = list_entry(&ftrace_events, struct ftrace_event_call, list); +	file = list_entry(&tr->events, struct ftrace_event_file, list);  	for (l = 0; l <= *pos; ) { -		call = t_next(m, call, &l); -		if (!call) +		file = t_next(m, file, &l); +		if (!file)  			break;  	} -	return call; +	return file;  }  static void *  s_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct trace_array *tr = m->private;  	(*pos)++; -	list_for_each_entry_continue(call, &ftrace_events, list) { -		if (call->flags & TRACE_EVENT_FL_ENABLED) -			return call; +	list_for_each_entry_continue(file, &tr->events, list) { +		if (file->flags & FTRACE_EVENT_FL_ENABLED) +			return file;  	}  	return NULL; @@ -460,23 +587,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)  static void *s_start(struct seq_file *m, loff_t *pos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = m->private;  	loff_t l;  	mutex_lock(&event_mutex); -	call = list_entry(&ftrace_events, struct ftrace_event_call, list); +	file = list_entry(&tr->events, struct ftrace_event_file, list);  	for (l = 0; l <= *pos; ) { -		call = s_next(m, call, &l); -		if (!call) +		file = s_next(m, file, &l); +		if (!file)  			break;  	} -	return call; +	return file;  }  static int t_show(struct seq_file *m, void *v)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct ftrace_event_call *call = file->event_call;  	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)  		seq_printf(m, "%s:", call->class->system); @@ -494,25 +623,33 @@ static ssize_t  event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_file *file = filp->private_data;  	char *buf; -	if (call->flags & TRACE_EVENT_FL_ENABLED) -		buf = "1\n"; -	else +	if (file->flags & FTRACE_EVENT_FL_ENABLED) { +		if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) +			buf = "0*\n"; +		else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) +			buf = "1*\n"; +		else +			buf = "1\n"; +	} else  		buf = "0\n"; -	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));  }  static ssize_t  event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_file *file = filp->private_data;  	unsigned long val;  	int ret; +	if (!file) +		return -EINVAL; +  	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);  	if (ret)  		return ret; @@ -525,7 +662,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	case 0:  	case 1:  		mutex_lock(&event_mutex); -		ret = ftrace_event_enable_disable(call, val); +		ret = ftrace_event_enable_disable(file, val);  		mutex_unlock(&event_mutex);  		break; @@ -543,14 +680,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		   loff_t *ppos)  {  	const char set_to_char[4] = { '?', '0', '1', 'X' }; -	struct event_subsystem *system = filp->private_data; +	
struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = dir->tr;  	char buf[2];  	int set = 0;  	int ret;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (!call->name || !call->class || !call->class->reg)  			continue; @@ -562,7 +703,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		 * or if all events or cleared, or if we have  		 * a mixture.  		 */ -		set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); +		set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));  		/*  		 * If we have a mixture, no need to look further. @@ -584,7 +725,8 @@ static ssize_t  system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		    loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	const char *name = NULL;  	unsigned long val;  	ssize_t ret; @@ -607,7 +749,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (system)  		name = system->name; -	ret = __ftrace_set_clr_event(NULL, name, NULL, val); +	ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);  	if (ret)  		goto out; @@ -845,43 +987,75 @@ static LIST_HEAD(event_subsystems);  static int subsystem_open(struct inode *inode, struct file *filp)  {  	struct event_subsystem *system = NULL; +	struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ +	struct trace_array *tr;  	int ret; -	if (!inode->i_private) -		goto skip_search; -  	/* Make sure the system still exists */  	mutex_lock(&event_mutex); -	list_for_each_entry(system, &event_subsystems, list) { -		if (system == inode->i_private) { -			/* Don't open systems with no events */ -			if (!system->nr_events) { -				system = NULL; -				break; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		list_for_each_entry(dir, &tr->systems, list) { +			if (dir == inode->i_private) { +				/* Don't open systems with no events */ +				if (dir->nr_events) { +					__get_system_dir(dir); +					system = dir->subsystem; +				} +				goto exit_loop;  			} -			__get_system(system); -			break;  		}  	} + exit_loop:  	mutex_unlock(&event_mutex); -	if (system != inode->i_private) +	if (!system)  		return -ENODEV; - skip_search: +	/* Some versions of gcc think dir can be uninitialized here */ +	WARN_ON(!dir); + +	ret = tracing_open_generic(inode, filp); +	if (ret < 0) +		put_system(dir); + +	return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ +	struct ftrace_subsystem_dir *dir; +	struct trace_array *tr = inode->i_private; +	int ret; + +	/* Make a temporary dir that has no system but points to tr */ +	dir = kzalloc(sizeof(*dir), GFP_KERNEL); +	if (!dir) +		return -ENOMEM; + +	dir->tr = tr; +  	ret = tracing_open_generic(inode, filp); -	if (ret < 0 && system) -		put_system(system); +	if (ret < 0) +		kfree(dir); + +	filp->private_data = dir;  	return ret;  }  static int subsystem_release(struct inode *inode, struct file *file)  { -	struct event_subsystem *system = inode->i_private; +	struct ftrace_subsystem_dir *dir = file->private_data; -	if (system) -		put_system(system); +	/* +	 * If dir->subsystem is NULL, then this is a temporary +	 * descriptor that was made for a trace_array to enable +	 * all 
subsystems. +	 */ +	if (dir->subsystem) +		put_system(dir); +	else +		kfree(dir);  	return 0;  } @@ -890,7 +1064,8 @@ static ssize_t  subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  		      loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	struct trace_seq *s;  	int r; @@ -915,7 +1090,7 @@ static ssize_t  subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		       loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data;  	char *buf;  	int err; @@ -932,7 +1107,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  	}  	buf[cnt] = '\0'; -	err = apply_subsystem_event_filter(system, buf); +	err = apply_subsystem_event_filter(dir, buf);  	free_page((unsigned long) buf);  	if (err < 0)  		return err; @@ -1041,30 +1216,35 @@ static const struct file_operations ftrace_system_enable_fops = {  	.release = subsystem_release,  }; +static const struct file_operations ftrace_tr_enable_fops = { +	.open = system_tr_open, +	.read = system_enable_read, +	.write = system_enable_write, +	.llseek = default_llseek, +	.release = subsystem_release, +}; +  static const struct file_operations ftrace_show_header_fops = {  	.open = tracing_open_generic,  	.read = show_header,  	.llseek = default_llseek,  }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, +		  const struct seq_operations *seq_ops)  { -	static struct dentry *d_tracer; -	static struct dentry *d_events; - -	if (d_events) -		return d_events; - -	d_tracer = tracing_init_dentry(); -	if (!d_tracer) -		return NULL; +	struct seq_file *m; +	int ret; -	d_events = debugfs_create_dir("events", d_tracer); -	if (!d_events) -		pr_warning("Could not create debugfs " -			   "'events' directory\n"); +	ret = seq_open(file, seq_ops); +	if (ret < 0) +		return ret; +	m = file->private_data; +	/* copy tr over to seq ops */ +	m->private = inode->i_private; -	return d_events; +	return ret;  }  static int @@ -1072,117 +1252,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)  {  	const struct seq_operations *seq_ops = &show_event_seq_ops; -	return seq_open(file, seq_ops); +	return ftrace_event_open(inode, file, seq_ops);  }  static int  ftrace_event_set_open(struct inode *inode, struct file *file)  {  	const struct seq_operations *seq_ops = &show_set_event_seq_ops; +	struct trace_array *tr = inode->i_private;  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) -		ftrace_clear_events(); +		ftrace_clear_events(tr); + +	return ftrace_event_open(inode, file, seq_ops); +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ +	struct event_subsystem *system; + +	/* need to create new entry */ +	system = kmalloc(sizeof(*system), GFP_KERNEL); +	if (!system) +		return NULL; + +	system->ref_count = 1; +	system->name = name; + +	system->filter = NULL; + +	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); +	if (!system->filter) +		goto out_free; -	return seq_open(file, seq_ops); +	list_add(&system->list, &event_subsystems); + +	return system; + + out_free: +	kfree(system); +	return NULL;  }  static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, +		    struct 
ftrace_event_file *file, struct dentry *parent)  { +	struct ftrace_subsystem_dir *dir;  	struct event_subsystem *system;  	struct dentry *entry;  	/* First see if we did not already create this dir */ -	list_for_each_entry(system, &event_subsystems, list) { +	list_for_each_entry(dir, &tr->systems, list) { +		system = dir->subsystem;  		if (strcmp(system->name, name) == 0) { -			system->nr_events++; -			return system->entry; +			dir->nr_events++; +			file->system = dir; +			return dir->entry;  		}  	} -	/* need to create new entry */ -	system = kmalloc(sizeof(*system), GFP_KERNEL); -	if (!system) { -		pr_warning("No memory to create event subsystem %s\n", -			   name); -		return d_events; +	/* Now see if the system itself exists. */ +	list_for_each_entry(system, &event_subsystems, list) { +		if (strcmp(system->name, name) == 0) +			break;  	} +	/* Reset system variable when not found */ +	if (&system->list == &event_subsystems) +		system = NULL; -	system->entry = debugfs_create_dir(name, d_events); -	if (!system->entry) { -		pr_warning("Could not create event subsystem %s\n", -			   name); -		kfree(system); -		return d_events; -	} +	dir = kmalloc(sizeof(*dir), GFP_KERNEL); +	if (!dir) +		goto out_fail; -	system->nr_events = 1; -	system->ref_count = 1; -	system->name = kstrdup(name, GFP_KERNEL); -	if (!system->name) { -		debugfs_remove(system->entry); -		kfree(system); -		return d_events; +	if (!system) { +		system = create_new_subsystem(name); +		if (!system) +			goto out_free; +	} else +		__get_system(system); + +	dir->entry = debugfs_create_dir(name, parent); +	if (!dir->entry) { +		pr_warning("Failed to create system directory %s\n", name); +		__put_system(system); +		goto out_free;  	} -	list_add(&system->list, &event_subsystems); - -	system->filter = NULL; - -	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); -	if (!system->filter) { -		pr_warning("Could not allocate filter for subsystem " -			   "'%s'\n", name); -		return system->entry; -	} +	dir->tr = tr; +	dir->ref_count = 1; +	dir->nr_events = 1; +	dir->subsystem = system; +	file->system = dir; -	entry = debugfs_create_file("filter", 0644, system->entry, system, +	entry = debugfs_create_file("filter", 0644, dir->entry, dir,  				    &ftrace_subsystem_filter_fops);  	if (!entry) {  		kfree(system->filter);  		system->filter = NULL; -		pr_warning("Could not create debugfs " -			   "'%s/filter' entry\n", name); +		pr_warning("Could not create debugfs '%s/filter' entry\n", name);  	} -	trace_create_file("enable", 0644, system->entry, system, +	trace_create_file("enable", 0644, dir->entry, dir,  			  &ftrace_system_enable_fops); -	return system->entry; +	list_add(&dir->list, &tr->systems); + +	return dir->entry; + + out_free: +	kfree(dir); + out_fail: +	/* Only print this message if failed on memory allocation */ +	if (!dir || !system) +		pr_warning("No memory to create event subsystem %s\n", +			   name); +	return NULL;  }  static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, +event_create_dir(struct dentry *parent, +		 struct ftrace_event_file *file,  		 const struct file_operations *id,  		 const struct file_operations *enable,  		 const struct file_operations *filter,  		 const struct file_operations *format)  { +	struct ftrace_event_call *call = file->event_call; +	struct trace_array *tr = file->tr;  	struct list_head *head; +	struct dentry *d_events;  	int ret;  	/*  	 * If the trace point header did not define TRACE_SYSTEM  	 * then the system would be called "TRACE_SYSTEM".  	 
*/ -	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) -		d_events = event_subsystem_dir(call->class->system, d_events); - -	call->dir = debugfs_create_dir(call->name, d_events); -	if (!call->dir) { -		pr_warning("Could not create debugfs " -			   "'%s' directory\n", call->name); +	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { +		d_events = event_subsystem_dir(tr, call->class->system, file, parent); +		if (!d_events) +			return -ENOMEM; +	} else +		d_events = parent; + +	file->dir = debugfs_create_dir(call->name, d_events); +	if (!file->dir) { +		pr_warning("Could not create debugfs '%s' directory\n", +			   call->name);  		return -1;  	}  	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) -		trace_create_file("enable", 0644, call->dir, call, +		trace_create_file("enable", 0644, file->dir, file,  				  enable);  #ifdef CONFIG_PERF_EVENTS  	if (call->event.type && call->class->reg) -		trace_create_file("id", 0444, call->dir, call, +		trace_create_file("id", 0444, file->dir, call,  		 		  id);  #endif @@ -1196,23 +1424,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,  		if (ret < 0) {  			pr_warning("Could not initialize trace point"  				   " events/%s\n", call->name); -			return ret; +			return -1;  		}  	} -	trace_create_file("filter", 0644, call->dir, call, +	trace_create_file("filter", 0644, file->dir, call,  			  filter); -	trace_create_file("format", 0444, call->dir, call, +	trace_create_file("format", 0444, file->dir, call,  			  format);  	return 0;  } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ +	if (!dir) +		return; + +	if (!--dir->nr_events) { +		debugfs_remove_recursive(dir->entry); +		list_del(&dir->list); +		__put_system_dir(dir); +	} +} + +static void remove_event_from_tracers(struct ftrace_event_call *call) +{ +	struct ftrace_event_file *file; +	struct trace_array *tr; + +	do_for_each_event_file_safe(tr, file) { + +		if (file->event_call != call) +			continue; + +		list_del(&file->list); +		debugfs_remove_recursive(file->dir); +		remove_subsystem(file->system); +		kmem_cache_free(file_cachep, file); + +		/* +		 * The do_for_each_event_file_safe() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. +		 */ +		break; +	} while_for_each_event_file(); +} +  static void event_remove(struct ftrace_event_call *call)  { -	ftrace_event_enable_disable(call, 0); +	struct trace_array *tr; +	struct ftrace_event_file *file; + +	do_for_each_event_file(tr, file) { +		if (file->event_call != call) +			continue; +		ftrace_event_enable_disable(file, 0); +		/* +		 * The do_for_each_event_file() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. 
+		 */ +		break; +	} while_for_each_event_file(); +  	if (call->event.funcs)  		__unregister_ftrace_event(&call->event); +	remove_event_from_tracers(call);  	list_del(&call->list);  } @@ -1234,82 +1515,109 @@ static int event_init(struct ftrace_event_call *call)  }  static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, -		       const struct file_operations *id, -		       const struct file_operations *enable, -		       const struct file_operations *filter, -		       const struct file_operations *format) +__register_event(struct ftrace_event_call *call, struct module *mod)  { -	struct dentry *d_events;  	int ret;  	ret = event_init(call);  	if (ret < 0)  		return ret; -	d_events = event_trace_events_dir(); -	if (!d_events) -		return -ENOENT; - -	ret = event_create_dir(call, d_events, id, enable, filter, format); -	if (!ret) -		list_add(&call->list, &ftrace_events); +	list_add(&call->list, &ftrace_events);  	call->mod = mod; -	return ret; +	return 0; +} + +static struct ftrace_event_file * +trace_create_new_event(struct ftrace_event_call *call, +		       struct trace_array *tr) +{ +	struct ftrace_event_file *file; + +	file = kmem_cache_alloc(file_cachep, GFP_TRACE); +	if (!file) +		return NULL; + +	file->event_call = call; +	file->tr = tr; +	atomic_set(&file->sm_ref, 0); +	list_add(&file->list, &tr->events); + +	return file;  } +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, +		      struct trace_array *tr, +		      const struct file_operations *id, +		      const struct file_operations *enable, +		      const struct file_operations *filter, +		      const struct file_operations *format) +{ +	struct ftrace_event_file *file; + +	file = trace_create_new_event(call, tr); +	if (!file) +		return -ENOMEM; + +	return event_create_dir(tr->event_dir, file, id, enable, filter, format); +} + +/* + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. + */ +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, +			    struct trace_array *tr) +{ +	struct ftrace_event_file *file; + +	file = trace_create_new_event(call, tr); +	if (!file) +		return -ENOMEM; + +	return 0; +} + +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call, +				   struct ftrace_module_file_ops *file_ops); +  /* Add an additional event_call dynamically */  int trace_add_event_call(struct ftrace_event_call *call)  {  	int ret;  	mutex_lock(&event_mutex); -	ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, -				     &ftrace_enable_fops, -				     &ftrace_event_filter_fops, -				     &ftrace_event_format_fops); -	mutex_unlock(&event_mutex); -	return ret; -} - -static void remove_subsystem_dir(const char *name) -{ -	struct event_subsystem *system; -	if (strcmp(name, TRACE_SYSTEM) == 0) -		return; +	ret = __register_event(call, NULL); +	if (ret >= 0) +		__add_event_to_tracers(call, NULL); -	list_for_each_entry(system, &event_subsystems, list) { -		if (strcmp(system->name, name) == 0) { -			if (!--system->nr_events) { -				debugfs_remove_recursive(system->entry); -				list_del(&system->list); -				__put_system(system); -			} -			break; -		} -	} +	mutex_unlock(&event_mutex); +	return ret;  }  /* - * Must be called under locking both of event_mutex and trace_event_mutex. + * Must be called under locking both of event_mutex and trace_event_sem.   
*/  static void __trace_remove_event_call(struct ftrace_event_call *call)  {  	event_remove(call);  	trace_destroy_fields(call);  	destroy_preds(call); -	debugfs_remove_recursive(call->dir); -	remove_subsystem_dir(call->class->system);  }  /* Remove an event_call */  void trace_remove_event_call(struct ftrace_event_call *call)  {  	mutex_lock(&event_mutex); -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	__trace_remove_event_call(call); -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	mutex_unlock(&event_mutex);  } @@ -1336,6 +1644,26 @@ struct ftrace_module_file_ops {  };  static struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ +	/* +	 * As event_calls are added in groups by module, +	 * when we find one file_ops, we don't need to search for +	 * each call in that module, as the rest should be the +	 * same. Only search for a new one if the last one did +	 * not match. +	 */ +	if (file_ops && mod == file_ops->mod) +		return file_ops; + +	list_for_each_entry(file_ops, &ftrace_module_file_list, list) { +		if (file_ops->mod == mod) +			return file_ops; +	} +	return NULL; +} + +static struct ftrace_module_file_ops *  trace_create_file_ops(struct module *mod)  {  	struct ftrace_module_file_ops *file_ops; @@ -1386,9 +1714,8 @@ static void trace_module_add_events(struct module *mod)  		return;  	for_each_event(call, start, end) { -		__trace_add_event_call(*call, mod, -				       &file_ops->id, &file_ops->enable, -				       &file_ops->filter, &file_ops->format); +		__register_event(*call, mod); +		__add_event_to_tracers(*call, file_ops);  	}  } @@ -1396,12 +1723,13 @@ static void trace_module_remove_events(struct module *mod)  {  	struct ftrace_module_file_ops *file_ops;  	struct ftrace_event_call *call, *p; -	bool found = false; +	bool clear_trace = false; -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	list_for_each_entry_safe(call, p, &ftrace_events, list) {  		if (call->mod == mod) { -			found = true; +			if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) +				clear_trace = true;  			__trace_remove_event_call(call);  		}  	} @@ -1415,14 +1743,18 @@ static void trace_module_remove_events(struct module *mod)  		list_del(&file_ops->list);  		kfree(file_ops);  	} +	up_write(&trace_event_sem);  	/*  	 * It is safest to reset the ring buffer if the module being unloaded -	 * registered any events. +	 * registered any events that were used. The only worry is if +	 * a new module gets loaded, and takes on the same id as the events +	 * of this module. When printing out the buffer, traced events left +	 * over from this module may be passed to the new module events and +	 * unexpected results may occur.  	 
*/ -	if (found) -		tracing_reset_current_online_cpus(); -	up_write(&trace_event_mutex); +	if (clear_trace) +		tracing_reset_all_online_cpus();  }  static int trace_module_notify(struct notifier_block *self, @@ -1443,14 +1775,443 @@ static int trace_module_notify(struct notifier_block *self,  	return 0;  } + +static int +__trace_add_new_mod_event(struct ftrace_event_call *call, +			  struct trace_array *tr, +			  struct ftrace_module_file_ops *file_ops) +{ +	return __trace_add_new_event(call, tr, +				     &file_ops->id, &file_ops->enable, +				     &file_ops->filter, &file_ops->format); +} +  #else -static int trace_module_notify(struct notifier_block *self, -			       unsigned long val, void *data) +static inline struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ +	return NULL; +} +static inline int trace_module_notify(struct notifier_block *self, +				      unsigned long val, void *data)  {  	return 0;  } +static inline int +__trace_add_new_mod_event(struct ftrace_event_call *call, +			  struct trace_array *tr, +			  struct ftrace_module_file_ops *file_ops) +{ +	return -ENODEV; +}  #endif /* CONFIG_MODULES */ +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ +	struct ftrace_module_file_ops *file_ops = NULL; +	struct ftrace_event_call *call; +	int ret; + +	list_for_each_entry(call, &ftrace_events, list) { +		if (call->mod) { +			/* +			 * Directories for events by modules need to +			 * keep module ref counts when opened (as we don't +			 * want the module to disappear when reading one +			 * of these files). The file_ops keep account of +			 * the module ref count. +			 */ +			file_ops = find_ftrace_file_ops(file_ops, call->mod); +			if (!file_ops) +				continue; /* Warn? 
*/ +			ret = __trace_add_new_mod_event(call, tr, file_ops); +			if (ret < 0) +				pr_warning("Could not create directory for event %s\n", +					   call->name); +			continue; +		} +		ret = __trace_add_new_event(call, tr, +					    &ftrace_event_id_fops, +					    &ftrace_enable_fops, +					    &ftrace_event_filter_fops, +					    &ftrace_event_format_fops); +		if (ret < 0) +			pr_warning("Could not create directory for event %s\n", +				   call->name); +	} +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR	"enable_event" +#define DISABLE_EVENT_STR	"disable_event" + +struct event_probe_data { +	struct ftrace_event_file	*file; +	unsigned long			count; +	int				ref; +	bool				enable; +}; + +static struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system,  const char *event) +{ +	struct ftrace_event_file *file; +	struct ftrace_event_call *call; + +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call; + +		if (!call->name || !call->class || !call->class->reg) +			continue; + +		if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) +			continue; + +		if (strcmp(event, call->name) == 0 && +		    strcmp(system, call->class->system) == 0) +			return file; +	} +	return NULL; +} + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (!data) +		return; + +	if (data->enable) +		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +	else +		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (!data) +		return; + +	if (!data->count) +		return; + +	/* Skip if the event is in a state we want to switch to */ +	if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) +		return; + +	if (data->count != -1) +		(data->count)--; + +	event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, +		      struct ftrace_probe_ops *ops, void *_data) +{ +	struct event_probe_data *data = _data; + +	seq_printf(m, "%ps:", (void *)ip); + +	seq_printf(m, "%s:%s:%s", +		   data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, +		   data->file->event_call->class->system, +		   data->file->event_call->name); + +	if (data->count == -1) +		seq_printf(m, ":unlimited\n"); +	else +		seq_printf(m, ":count=%ld\n", data->count); + +	return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, +		  void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	data->ref++; +	return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, +		  void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (WARN_ON_ONCE(data->ref <= 0)) +		return; + +	data->ref--; +	if (!data->ref) { +		/* Remove the SOFT_MODE flag */ +		__ftrace_event_enable_disable(data->file, 0, 1); +		module_put(data->file->event_call->mod); +		kfree(data); +	} +	*pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { +	.func			= event_enable_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { +	.func			= event_enable_count_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { +	.func			= event_enable_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { +	.func			= event_enable_count_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, +		  char *glob, char *cmd, char *param, int enabled) +{ +	struct trace_array *tr = top_trace_array(); +	struct ftrace_event_file *file; +	struct ftrace_probe_ops *ops; +	struct event_probe_data *data; +	const char *system; +	const char *event; +	char *number; +	bool enable; +	int ret; + +	/* hash funcs only work with set_ftrace_filter */ +	if (!enabled) +		return -EINVAL; + +	if (!param) +		return -EINVAL; + +	system = strsep(&param, ":"); +	if (!param) +		return -EINVAL; + +	event = strsep(&param, ":"); + +	mutex_lock(&event_mutex); + +	ret = -EINVAL; +	file = find_event_file(tr, system, event); +	if (!file) +		goto out; + +	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + +	if (enable) +		ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; +	else +		ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; + +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		ret = 0; +		goto out; +	} + +	ret = -ENOMEM; +	data = kzalloc(sizeof(*data), GFP_KERNEL); +	if (!data) +		goto out; + +	data->enable = enable; +	data->count = -1; +	data->file = file; + +	if (!param) +		goto out_reg; + +	number = strsep(&param, ":"); + +	ret = -EINVAL; +	if (!strlen(number)) +		goto out_free; + +	/* +	 * We use the callback data field (which is a pointer) +	 * as our counter. 
+	 */ +	ret = kstrtoul(number, 0, &data->count); +	if (ret) +		goto out_free; + + out_reg: +	/* Don't let event modules unload while probe registered */ +	ret = try_module_get(file->event_call->mod); +	if (!ret) +		goto out_free; + +	ret = __ftrace_event_enable_disable(file, 1, 1); +	if (ret < 0) +		goto out_put; +	ret = register_ftrace_function_probe(glob, ops, data); +	/* +	 * The above returns on success the # of functions enabled, +	 * but if it didn't find any functions it returns zero. +	 * Consider no functions a failure too. +	 */ +	if (!ret) { +		ret = -ENOENT; +		goto out_disable; +	} else if (ret < 0) +		goto out_disable; +	/* Just return zero, not the number of enabled functions */ +	ret = 0; + out: +	mutex_unlock(&event_mutex); +	return ret; + + out_disable: +	__ftrace_event_enable_disable(file, 0, 1); + out_put: +	module_put(file->event_call->mod); + out_free: +	kfree(data); +	goto out; +} + +static struct ftrace_func_command event_enable_cmd = { +	.name			= ENABLE_EVENT_STR, +	.func			= event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { +	.name			= DISABLE_EVENT_STR, +	.func			= event_enable_func, +}; + +static __init int register_event_cmds(void) +{ +	int ret; + +	ret = register_ftrace_command(&event_enable_cmd); +	if (WARN_ON(ret < 0)) +		return ret; +	ret = register_ftrace_command(&event_disable_cmd); +	if (WARN_ON(ret < 0)) +		unregister_ftrace_command(&event_enable_cmd); +	return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the debugfs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_file *file; +	int ret; + + +	list_for_each_entry(file, &tr->events, list) { +		ret = event_create_dir(tr->event_dir, file, +				       &ftrace_event_id_fops, +				       &ftrace_enable_fops, +				       &ftrace_event_filter_fops, +				       &ftrace_event_format_fops); +		if (ret < 0) +			pr_warning("Could not create directory for event %s\n", +				   file->event_call->name); +	} +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ +	struct ftrace_event_call *call; +	int ret; + +	list_for_each_entry(call, &ftrace_events, list) { +		/* Early boot up should not have any modules loaded */ +		if (WARN_ON_ONCE(call->mod)) +			continue; + +		ret = __trace_early_add_new_event(call, tr); +		if (ret < 0) +			pr_warning("Could not create early event %s\n", +				   call->name); +	} +} + +/* Remove the event directory structure for a trace directory. 
*/ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_file *file, *next; + +	list_for_each_entry_safe(file, next, &tr->events, list) { +		list_del(&file->list); +		debugfs_remove_recursive(file->dir); +		remove_subsystem(file->system); +		kmem_cache_free(file_cachep, file); +	} +} + +static void +__add_event_to_tracers(struct ftrace_event_call *call, +		       struct ftrace_module_file_ops *file_ops) +{ +	struct trace_array *tr; + +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (file_ops) +			__trace_add_new_mod_event(call, tr, file_ops); +		else +			__trace_add_new_event(call, tr, +					      &ftrace_event_id_fops, +					      &ftrace_enable_fops, +					      &ftrace_event_filter_fops, +					      &ftrace_event_format_fops); +	} +} +  static struct notifier_block trace_module_nb = {  	.notifier_call = trace_module_notify,  	.priority = 0, @@ -1464,15 +2225,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;  static __init int setup_trace_event(char *str)  {  	strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); -	ring_buffer_expanded = 1; -	tracing_selftest_disabled = 1; +	ring_buffer_expanded = true; +	tracing_selftest_disabled = true;  	return 1;  }  __setup("trace_event=", setup_trace_event); +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) +{ +	struct dentry *d_events; +	struct dentry *entry; + +	entry = debugfs_create_file("set_event", 0644, parent, +				    tr, &ftrace_set_event_fops); +	if (!entry) { +		pr_warning("Could not create debugfs 'set_event' entry\n"); +		return -ENOMEM; +	} + +	d_events = debugfs_create_dir("events", parent); +	if (!d_events) { +		pr_warning("Could not create debugfs 'events' directory\n"); +		return -ENOMEM; +	} + +	/* ring buffer internal formats */ +	trace_create_file("header_page", 0444, d_events, +			  ring_buffer_print_page_header, +			  &ftrace_show_header_fops); + +	trace_create_file("header_event", 0444, d_events, +			  ring_buffer_print_entry_header, +			  &ftrace_show_header_fops); + +	trace_create_file("enable", 0644, d_events, +			  tr, &ftrace_tr_enable_fops); + +	tr->event_dir = d_events; + +	return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ +	int ret; + +	mutex_lock(&event_mutex); + +	ret = create_event_toplevel_files(parent, tr); +	if (ret) +		goto out_unlock; + +	down_write(&trace_event_sem); +	__trace_add_event_dirs(tr); +	up_write(&trace_event_sem); + + out_unlock: +	mutex_unlock(&event_mutex); + +	return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. 
+ */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ +	int ret; + +	mutex_lock(&event_mutex); + +	ret = create_event_toplevel_files(parent, tr); +	if (ret) +		goto out_unlock; + +	down_write(&trace_event_sem); +	__trace_early_add_event_dirs(tr); +	up_write(&trace_event_sem); + + out_unlock: +	mutex_unlock(&event_mutex); + +	return ret; +} + +int event_trace_del_tracer(struct trace_array *tr) +{ +	/* Disable any running events */ +	__ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + +	mutex_lock(&event_mutex); + +	down_write(&trace_event_sem); +	__trace_remove_event_dirs(tr); +	debugfs_remove_recursive(tr->event_dir); +	up_write(&trace_event_sem); + +	tr->event_dir = NULL; + +	mutex_unlock(&event_mutex); + +	return 0; +} + +static __init int event_trace_memsetup(void) +{ +	field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); +	file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); +	return 0; +} +  static __init int event_trace_enable(void)  { +	struct trace_array *tr = top_trace_array();  	struct ftrace_event_call **iter, *call;  	char *buf = bootup_event_buf;  	char *token; @@ -1486,6 +2367,14 @@ static __init int event_trace_enable(void)  			list_add(&call->list, &ftrace_events);  	} +	/* +	 * We need the top trace array to have a working set of trace +	 * points at early init, before the debug files and directories +	 * are created. Create the file entries now, and attach them +	 * to the actual file dentries later. +	 */ +	__trace_early_add_events(tr); +  	while (true) {  		token = strsep(&buf, ","); @@ -1494,73 +2383,43 @@ static __init int event_trace_enable(void)  		if (!*token)  			continue; -		ret = ftrace_set_clr_event(token, 1); +		ret = ftrace_set_clr_event(tr, token, 1);  		if (ret)  			pr_warn("Failed to enable trace event: %s\n", token);  	}  	trace_printk_start_comm(); +	register_event_cmds(); +  	return 0;  }  static __init int event_trace_init(void)  { -	struct ftrace_event_call *call; +	struct trace_array *tr;  	struct dentry *d_tracer;  	struct dentry *entry; -	struct dentry *d_events;  	int ret; +	tr = top_trace_array(); +  	d_tracer = tracing_init_dentry();  	if (!d_tracer)  		return 0;  	entry = debugfs_create_file("available_events", 0444, d_tracer, -				    NULL, &ftrace_avail_fops); +				    tr, &ftrace_avail_fops);  	if (!entry)  		pr_warning("Could not create debugfs "  			   "'available_events' entry\n"); -	entry = debugfs_create_file("set_event", 0644, d_tracer, -				    NULL, &ftrace_set_event_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'set_event' entry\n"); - -	d_events = event_trace_events_dir(); -	if (!d_events) -		return 0; - -	/* ring buffer internal formats */ -	trace_create_file("header_page", 0444, d_events, -			  ring_buffer_print_page_header, -			  &ftrace_show_header_fops); - -	trace_create_file("header_event", 0444, d_events, -			  ring_buffer_print_entry_header, -			  &ftrace_show_header_fops); - -	trace_create_file("enable", 0644, d_events, -			  NULL, &ftrace_system_enable_fops); -  	if (trace_define_common_fields())  		pr_warning("tracing: Failed to allocate common fields"); -	/* -	 * Early initialization already enabled ftrace event. -	 * Now it's only necessary to create the event directory. 
-	 */ -	list_for_each_entry(call, &ftrace_events, list) { - -		ret = event_create_dir(call, d_events, -				       &ftrace_event_id_fops, -				       &ftrace_enable_fops, -				       &ftrace_event_filter_fops, -				       &ftrace_event_format_fops); -		if (ret < 0) -			event_remove(call); -	} +	ret = early_event_add_tracer(d_tracer, tr); +	if (ret) +		return ret;  	ret = register_module_notifier(&trace_module_nb);  	if (ret) @@ -1568,6 +2427,7 @@ static __init int event_trace_init(void)  	return 0;  } +early_initcall(event_trace_memsetup);  core_initcall(event_trace_enable);  fs_initcall(event_trace_init); @@ -1627,13 +2487,20 @@ static __init void event_test_stuff(void)   */  static __init void event_trace_self_tests(void)  { +	struct ftrace_subsystem_dir *dir; +	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	struct event_subsystem *system; +	struct trace_array *tr;  	int ret; +	tr = top_trace_array(); +  	pr_info("Running tests on trace events:\n"); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call;  		/* Only test those that have a probe */  		if (!call->class || !call->class->probe) @@ -1657,15 +2524,15 @@ static __init void event_trace_self_tests(void)  		 * If an event is already enabled, someone is using  		 * it and the self test should not be on.  		 */ -		if (call->flags & TRACE_EVENT_FL_ENABLED) { +		if (file->flags & FTRACE_EVENT_FL_ENABLED) {  			pr_warning("Enabled event during self test!\n");  			WARN_ON_ONCE(1);  			continue;  		} -		ftrace_event_enable_disable(call, 1); +		ftrace_event_enable_disable(file, 1);  		event_test_stuff(); -		ftrace_event_enable_disable(call, 0); +		ftrace_event_enable_disable(file, 0);  		pr_cont("OK\n");  	} @@ -1674,7 +2541,9 @@ static __init void event_trace_self_tests(void)  	pr_info("Running tests on trace event systems:\n"); -	list_for_each_entry(system, &event_subsystems, list) { +	list_for_each_entry(dir, &tr->systems, list) { + +		system = dir->subsystem;  		/* the ftrace system is special, skip it */  		if (strcmp(system->name, "ftrace") == 0) @@ -1682,7 +2551,7 @@ static __init void event_trace_self_tests(void)  		pr_info("Testing event system %s: ", system->name); -		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); +		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);  		if (WARN_ON_ONCE(ret)) {  			pr_warning("error enabling system %s\n",  				   system->name); @@ -1691,7 +2560,7 @@ static __init void event_trace_self_tests(void)  		event_test_stuff(); -		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); +		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);  		if (WARN_ON_ONCE(ret)) {  			pr_warning("error disabling system %s\n",  				   system->name); @@ -1706,7 +2575,7 @@ static __init void event_trace_self_tests(void)  	pr_info("Running tests on all trace events:\n");  	pr_info("Testing all events: "); -	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); +	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);  	if (WARN_ON_ONCE(ret)) {  		pr_warning("error enabling all events\n");  		return; @@ -1715,7 +2584,7 @@ static __init void event_trace_self_tests(void)  	event_test_stuff();  	/* reset sysname */ -	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); +	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);  	if (WARN_ON_ONCE(ret)) {  		pr_warning("error disabling all events\n");  		return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 
e5b0ca8b8d4..a6361178de5 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,  	mutex_unlock(&event_mutex);  } -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) -{ -	struct ftrace_event_field *field; - -	list_for_each_entry(field, head, link) { -		if (!strcmp(field->name, name)) -			return field; -	} - -	return NULL; -} - -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) -{ -	struct ftrace_event_field *field; -	struct list_head *head; - -	field = __find_event_field(&ftrace_common_fields, name); -	if (field) -		return field; - -	head = trace_get_fields(call); -	return __find_event_field(head, name); -} -  static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)  {  	stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); @@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,  		return NULL;  	} -	field = find_event_field(call, operand1); +	field = trace_find_event_field(call, operand1);  	if (!field) {  		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);  		return NULL; @@ -1907,16 +1880,17 @@ out_unlock:  	return err;  } -int apply_subsystem_event_filter(struct event_subsystem *system, +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  				 char *filter_string)  { +	struct event_subsystem *system = dir->subsystem;  	struct event_filter *filter;  	int err = 0;  	mutex_lock(&event_mutex);  	/* Make sure the system still has events */ -	if (!system->nr_events) { +	if (!dir->nr_events) {  		err = -ENODEV;  		goto out_unlock;  	} diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e039906b037..d21a7467008 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void)		\  #undef FTRACE_ENTRY  #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\ -int									\ +static int __init							\  ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  {									\  	struct struct_name field;					\ @@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\  			 regfn)						\  									\ -struct ftrace_event_class event_class_ftrace_##call = {			\ +struct ftrace_event_class __refdata event_class_ftrace_##call = {	\  	.system			= __stringify(TRACE_SYSTEM),		\  	.define_fields		= ftrace_define_fields_##call,		\  	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 60115252332..c4d6d719198 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);  static int function_trace_init(struct trace_array *tr)  {  	func_trace = tr; -	tr->cpu = get_cpu(); +	tr->trace_buffer.cpu = get_cpu();  	put_cpu();  	tracing_start_cmdline_record(); @@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)  static void function_trace_start(struct trace_array *tr)  { -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  }  /* Our option */ @@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,  		goto out;  	cpu = smp_processor_id(); -	data = 
tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (!atomic_read(&data->disabled)) {  		local_save_flags(flags);  		trace_function(tr, ip, parent_ip, flags, pc); @@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,  	 */  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) { @@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =  };  #ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +static int update_count(void **data)  { -	long *count = (long *)data; - -	if (tracing_is_on()) -		return; +	unsigned long *count = (long *)data;  	if (!*count) -		return; +		return 0;  	if (*count != -1)  		(*count)--; -	tracing_on(); +	return 1;  }  static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)  { -	long *count = (long *)data; +	if (tracing_is_on()) +		return; + +	if (update_count(data)) +		tracing_on(); +} +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +{  	if (!tracing_is_on())  		return; -	if (!*count) +	if (update_count(data)) +		tracing_off(); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (tracing_is_on())  		return; -	if (*count != -1) -		(*count)--; +	tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (!tracing_is_on()) +		return;  	tracing_off();  } -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, -			 struct ftrace_probe_ops *ops, void *data); +/* + * Skip 4: + *   ftrace_stacktrace() + *   function_trace_probe_call() + *   ftrace_ops_list_func() + *   ftrace_call() + */ +#define STACK_SKIP 4 -static struct ftrace_probe_ops traceon_probe_ops = { -	.func			= ftrace_traceon, -	.print			= ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ +	trace_dump_stack(STACK_SKIP); +} -static struct ftrace_probe_ops traceoff_probe_ops = { -	.func			= ftrace_traceoff, -	.print			= ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (!tracing_is_on()) +		return; + +	if (update_count(data)) +		trace_dump_stack(STACK_SKIP); +}  static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, -			 struct ftrace_probe_ops *ops, void *data) +ftrace_probe_print(const char *name, struct seq_file *m, +		   unsigned long ip, void *data)  {  	long count = (long)data; -	seq_printf(m, "%ps:", (void *)ip); - -	if (ops == &traceon_probe_ops) -		seq_printf(m, "traceon"); -	else -		seq_printf(m, "traceoff"); +	seq_printf(m, "%ps:%s", (void *)ip, name);  	if (count == -1)  		seq_printf(m, ":unlimited\n"); @@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,  }  static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +ftrace_traceon_print(struct seq_file *m, unsigned long ip, +			 struct ftrace_probe_ops *ops, void *data)  { -	struct ftrace_probe_ops *ops; - -	/* we register both traceon and traceoff to this callback */ -	if (strcmp(cmd, "traceon") == 0) -		ops = &traceon_probe_ops; -	else -		ops = &traceoff_probe_ops; +	return 
ftrace_probe_print("traceon", m, ip, data); +} -	unregister_ftrace_function_probe_func(glob, ops); +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, +			 struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("traceoff", m, ip, data); +} -	return 0; +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, +			struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("stacktrace", m, ip, data);  } +static struct ftrace_probe_ops traceon_count_probe_ops = { +	.func			= ftrace_traceon_count, +	.print			= ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { +	.func			= ftrace_traceoff_count, +	.print			= ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { +	.func			= ftrace_stacktrace_count, +	.print			= ftrace_stacktrace_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { +	.func			= ftrace_traceon, +	.print			= ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { +	.func			= ftrace_traceoff, +	.print			= ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { +	.func			= ftrace_stacktrace, +	.print			= ftrace_stacktrace_print, +}; +  static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, -			    char *glob, char *cmd, char *param, int enable) +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, +			    struct ftrace_hash *hash, char *glob, +			    char *cmd, char *param, int enable)  { -	struct ftrace_probe_ops *ops;  	void *count = (void *)-1;  	char *number;  	int ret; @@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,  	if (!enable)  		return -EINVAL; -	if (glob[0] == '!') -		return ftrace_trace_onoff_unreg(glob+1, cmd, param); - -	/* we register both traceon and traceoff to this callback */ -	if (strcmp(cmd, "traceon") == 0) -		ops = &traceon_probe_ops; -	else -		ops = &traceoff_probe_ops; +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		return 0; +	}  	if (!param)  		goto out_reg; @@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,  	return ret < 0 ? ret : 0;  } +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, +			    char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	/* we register both traceon and traceoff to this callback */ +	if (strcmp(cmd, "traceon") == 0) +		ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; +	else +		ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, +			   char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	ops = param ? 
&stacktrace_count_probe_ops : &stacktrace_probe_ops; + +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   param, enable); +} +  static struct ftrace_func_command ftrace_traceon_cmd = {  	.name			= "traceon",  	.func			= ftrace_trace_onoff_callback, @@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {  	.func			= ftrace_trace_onoff_callback,  }; +static struct ftrace_func_command ftrace_stacktrace_cmd = { +	.name			= "stacktrace", +	.func			= ftrace_stacktrace_callback, +}; +  static int __init init_func_cmd_traceon(void)  {  	int ret; @@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)  	ret = register_ftrace_command(&ftrace_traceon_cmd);  	if (ret)  		unregister_ftrace_command(&ftrace_traceoff_cmd); + +	ret = register_ftrace_command(&ftrace_stacktrace_cmd); +	if (ret) { +		unregister_ftrace_command(&ftrace_traceoff_cmd); +		unregister_ftrace_command(&ftrace_traceon_cmd); +	}  	return ret;  }  #else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 39ada66389c..8388bc99f2e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,  {  	struct ftrace_event_call *call = &event_funcgraph_entry;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ftrace_graph_ent_entry *entry;  	if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) {  		pc = preempt_count(); @@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,  {  	struct ftrace_event_call *call = &event_funcgraph_exit;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ftrace_graph_ret_entry *entry;  	if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) {  		pc = preempt_count(); @@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,  			 * We need to consume the current entry to see  			 * the next one.  			 
*/ -			ring_buffer_consume(iter->tr->buffer, iter->cpu, +			ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,  					    NULL, NULL); -			event = ring_buffer_peek(iter->tr->buffer, iter->cpu, +			event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,  						 NULL, NULL);  		} diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 443b25b43b4..b19d065a28c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -33,6 +33,7 @@ enum {  static int trace_type __read_mostly;  static int save_flags; +static bool function_enabled;  static void stop_irqsoff_tracer(struct trace_array *tr, int graph);  static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,  	if (!irqs_disabled_flags(*flags))  		return 0; -	*data = tr->data[cpu]; +	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&(*data)->disabled);  	if (likely(disabled == 1)) @@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)  		per_cpu(tracing_cpu, cpu) = 0;  	tracing_max_latency = 0; -	tracing_reset_online_cpus(irqsoff_trace); +	tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);  	return start_irqsoff_tracer(irqsoff_trace, set);  } @@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)  	if (per_cpu(tracing_cpu, cpu))  		return; -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (unlikely(!data) || atomic_read(&data->disabled))  		return; @@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)  	if (!tracer_enabled)  		return; -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (unlikely(!data) ||  	    !data->critical_start || atomic_read(&data->disabled)) @@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)  }  #endif /* CONFIG_PREEMPT_TRACER */ -static int start_irqsoff_tracer(struct trace_array *tr, int graph) +static int register_irqsoff_function(int graph, int set)  { -	int ret = 0; +	int ret; -	if (!graph) -		ret = register_ftrace_function(&trace_ops); -	else +	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ +	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) +		return 0; + +	if (graph)  		ret = register_ftrace_graph(&irqsoff_graph_return,  					    &irqsoff_graph_entry); +	else +		ret = register_ftrace_function(&trace_ops); + +	if (!ret) +		function_enabled = true; + +	return ret; +} + +static void unregister_irqsoff_function(int graph) +{ +	if (!function_enabled) +		return; + +	if (graph) +		unregister_ftrace_graph(); +	else +		unregister_ftrace_function(&trace_ops); + +	function_enabled = false; +} + +static void irqsoff_function_set(int set) +{ +	if (set) +		register_irqsoff_function(is_graph(), 1); +	else +		unregister_irqsoff_function(is_graph()); +} + +static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +{ +	if (mask & TRACE_ITER_FUNCTION) +		irqsoff_function_set(set); + +	return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ +	int ret; + +	ret = register_irqsoff_function(graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -550,10 +596,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)  {  	tracer_enabled = 0; -	if (!graph) -		unregister_ftrace_function(&trace_ops); -	else -		unregister_ftrace_graph(); +	
unregister_irqsoff_function(graph);  }  static void __irqsoff_tracer_init(struct trace_array *tr) @@ -561,14 +604,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)  	save_flags = trace_flags;  	/* non overwrite screws up the latency tracers */ -	set_tracer_flag(TRACE_ITER_OVERWRITE, 1); -	set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);  	tracing_max_latency = 0;  	irqsoff_trace = tr;  	/* make sure that the tracer is visible */  	smp_wmb(); -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	if (start_irqsoff_tracer(tr, is_graph()))  		printk(KERN_ERR "failed to start irqsoff tracer\n"); @@ -581,8 +624,8 @@ static void irqsoff_tracer_reset(struct trace_array *tr)  	stop_irqsoff_tracer(tr, is_graph()); -	set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); -	set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);  }  static void irqsoff_tracer_start(struct trace_array *tr) @@ -615,7 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, -	.flag_changed	= trace_keep_overwrite, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_irqsoff,  #endif @@ -649,7 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, -	.flag_changed	= trace_keep_overwrite, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_preemptoff,  #endif @@ -685,7 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, -	.flag_changed	= trace_keep_overwrite, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_preemptirqsoff,  #endif diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b..bd90e1b0608 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { -		atomic_inc(&iter.tr->data[cpu]->disabled); +		atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	}  	old_userobj = trace_flags; @@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  	iter.iter_flags |= TRACE_FILE_LAT_FMT;  	iter.pos = -1; -	if (cpu_file == TRACE_PIPE_ALL_CPU) { +	if (cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) {  			iter.buffer_iter[cpu] = -			ring_buffer_read_prepare(iter.tr->buffer, cpu); +			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);  			ring_buffer_read_start(iter.buffer_iter[cpu]);  			tracing_iter_reset(&iter, cpu);  		}  	} else {  		iter.cpu_file = cpu_file;  		iter.buffer_iter[cpu_file] = -			ring_buffer_read_prepare(iter.tr->buffer, cpu_file); +			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);  		ring_buffer_read_start(iter.buffer_iter[cpu_file]);  		tracing_iter_reset(&iter, cpu_file);  	} @@ -83,7 +83,7 @@ out:  	trace_flags = old_userobj;  	for_each_tracing_cpu(cpu) { -		atomic_dec(&iter.tr->data[cpu]->disabled); +		
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	}  	for_each_tracing_cpu(cpu) @@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)  		    !cpu_online(cpu_file))  			return KDB_BADINT;  	} else { -		cpu_file = TRACE_PIPE_ALL_CPU; +		cpu_file = RING_BUFFER_ALL_CPUS;  	}  	kdb_trap_printk++; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1865d5f7653..636d45fe69b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -27,7 +27,6 @@  /**   * Kprobe event core functions   */ -  struct trace_probe {  	struct list_head	list;  	struct kretprobe	rp;	/* Use rp.kp for kprobe use */ @@ -36,6 +35,7 @@ struct trace_probe {  	const char		*symbol;	/* symbol name */  	struct ftrace_event_class	class;  	struct ftrace_event_call	call; +	struct ftrace_event_file	**files;  	ssize_t			size;		/* trace entry size */  	unsigned int		nr_args;  	struct probe_arg	args[]; @@ -46,7 +46,7 @@ struct trace_probe {  	(sizeof(struct probe_arg) * (n))) -static __kprobes int trace_probe_is_return(struct trace_probe *tp) +static __kprobes bool trace_probe_is_return(struct trace_probe *tp)  {  	return tp->rp.handler != NULL;  } @@ -183,12 +183,57 @@ static struct trace_probe *find_trace_probe(const char *event,  	return NULL;  } -/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static int enable_trace_probe(struct trace_probe *tp, int flag) +static int trace_probe_nr_files(struct trace_probe *tp) +{ +	struct ftrace_event_file **file = tp->files; +	int ret = 0; + +	if (file) +		while (*(file++)) +			ret++; + +	return ret; +} + +static DEFINE_MUTEX(probe_enable_lock); + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int +enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)  {  	int ret = 0; -	tp->flags |= flag; +	mutex_lock(&probe_enable_lock); + +	if (file) { +		struct ftrace_event_file **new, **old = tp->files; +		int n = trace_probe_nr_files(tp); + +		/* 1 is for new one and 1 is for stopper */ +		new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), +			      GFP_KERNEL); +		if (!new) { +			ret = -ENOMEM; +			goto out_unlock; +		} +		memcpy(new, old, n * sizeof(struct ftrace_event_file *)); +		new[n] = file; +		/* The last one keeps a NULL */ + +		rcu_assign_pointer(tp->files, new); +		tp->flags |= TP_FLAG_TRACE; + +		if (old) { +			/* Make sure the probe is done with old files */ +			synchronize_sched(); +			kfree(old); +		} +	} else +		tp->flags |= TP_FLAG_PROFILE; +  	if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&  	    !trace_probe_has_gone(tp)) {  		if (trace_probe_is_return(tp)) @@ -197,19 +242,83 @@ static int enable_trace_probe(struct trace_probe *tp, int flag)  			ret = enable_kprobe(&tp->rp.kp);  	} + out_unlock: +	mutex_unlock(&probe_enable_lock); +  	return ret;  } -/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static void disable_trace_probe(struct trace_probe *tp, int flag) +static int +trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) +{ +	int i; + +	if (tp->files) { +		for (i = 0; tp->files[i]; i++) +			if (tp->files[i] == file) +				return i; +	} + +	return -1; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. 
+ */ +static int +disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)  { -	tp->flags &= ~flag; +	int ret = 0; + +	mutex_lock(&probe_enable_lock); + +	if (file) { +		struct ftrace_event_file **new, **old = tp->files; +		int n = trace_probe_nr_files(tp); +		int i, j; + +		if (n == 0 || trace_probe_file_index(tp, file) < 0) { +			ret = -EINVAL; +			goto out_unlock; +		} + +		if (n == 1) {	/* Remove the last file */ +			tp->flags &= ~TP_FLAG_TRACE; +			new = NULL; +		} else { +			new = kzalloc(n * sizeof(struct ftrace_event_file *), +				      GFP_KERNEL); +			if (!new) { +				ret = -ENOMEM; +				goto out_unlock; +			} + +			/* This copy & check loop copies the NULL stopper too */ +			for (i = 0, j = 0; j < n && i < n + 1; i++) +				if (old[i] != file) +					new[j++] = old[i]; +		} + +		rcu_assign_pointer(tp->files, new); + +		/* Make sure the probe is done with old files */ +		synchronize_sched(); +		kfree(old); +	} else +		tp->flags &= ~TP_FLAG_PROFILE; +  	if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {  		if (trace_probe_is_return(tp))  			disable_kretprobe(&tp->rp);  		else  			disable_kprobe(&tp->rp.kp);  	} + + out_unlock: +	mutex_unlock(&probe_enable_lock); + +	return ret;  }  /* Internal register function - just handle k*probes and flags */ @@ -723,9 +832,10 @@ static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,  }  /* Kprobe handler */ -static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +static __kprobes void +__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, +		    struct ftrace_event_file *ftrace_file)  { -	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);  	struct kprobe_trace_entry_head *entry;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; @@ -733,7 +843,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)  	unsigned long irq_flags;  	struct ftrace_event_call *call = &tp->call; -	tp->nhit++; +	WARN_ON(call != ftrace_file->event_call); + +	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) +		return;  	local_save_flags(irq_flags);  	pc = preempt_count(); @@ -741,13 +854,14 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)  	dsize = __get_data_size(tp, regs);  	size = sizeof(*entry) + tp->size + dsize; -	event = trace_current_buffer_lock_reserve(&buffer, call->event.type, -						  size, irq_flags, pc); +	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, +						call->event.type, +						size, irq_flags, pc);  	if (!event)  		return;  	entry = ring_buffer_event_data(event); -	entry->ip = (unsigned long)kp->addr; +	entry->ip = (unsigned long)tp->rp.kp.addr;  	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);  	if (!filter_current_check_discard(buffer, call, entry, event)) @@ -755,11 +869,24 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)  						irq_flags, pc, regs);  } +static __kprobes void +kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) +{ +	struct ftrace_event_file **file = tp->files; + +	/* Note: preempt is already disabled around the kprobe handler */ +	while (*file) { +		__kprobe_trace_func(tp, regs, *file); +		file++; +	} +} +  /* Kretprobe handler */ -static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, -					  struct pt_regs *regs) +static __kprobes void +__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, +		    
   struct pt_regs *regs, +		       struct ftrace_event_file *ftrace_file)  { -	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);  	struct kretprobe_trace_entry_head *entry;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; @@ -767,14 +894,20 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,  	unsigned long irq_flags;  	struct ftrace_event_call *call = &tp->call; +	WARN_ON(call != ftrace_file->event_call); + +	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) +		return; +  	local_save_flags(irq_flags);  	pc = preempt_count();  	dsize = __get_data_size(tp, regs);  	size = sizeof(*entry) + tp->size + dsize; -	event = trace_current_buffer_lock_reserve(&buffer, call->event.type, -						  size, irq_flags, pc); +	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, +						call->event.type, +						size, irq_flags, pc);  	if (!event)  		return; @@ -788,6 +921,19 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,  						irq_flags, pc, regs);  } +static __kprobes void +kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, +		     struct pt_regs *regs) +{ +	struct ftrace_event_file **file = tp->files; + +	/* Note: preempt is already disabled around the kprobe handler */ +	while (*file) { +		__kretprobe_trace_func(tp, ri, regs, *file); +		file++; +	} +} +  /* Event entry printers */  enum print_line_t  print_kprobe_event(struct trace_iterator *iter, int flags, @@ -975,10 +1121,9 @@ static int set_print_fmt(struct trace_probe *tp)  #ifdef CONFIG_PERF_EVENTS  /* Kprobe profile handler */ -static __kprobes void kprobe_perf_func(struct kprobe *kp, -					 struct pt_regs *regs) +static __kprobes void +kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)  { -	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);  	struct ftrace_event_call *call = &tp->call;  	struct kprobe_trace_entry_head *entry;  	struct hlist_head *head; @@ -997,7 +1142,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,  	if (!entry)  		return; -	entry->ip = (unsigned long)kp->addr; +	entry->ip = (unsigned long)tp->rp.kp.addr;  	memset(&entry[1], 0, dsize);  	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); @@ -1007,10 +1152,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,  }  /* Kretprobe profile handler */ -static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, -					    struct pt_regs *regs) +static __kprobes void +kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, +		    struct pt_regs *regs)  { -	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);  	struct ftrace_event_call *call = &tp->call;  	struct kretprobe_trace_entry_head *entry;  	struct hlist_head *head; @@ -1044,20 +1189,19 @@ int kprobe_register(struct ftrace_event_call *event,  		    enum trace_reg type, void *data)  {  	struct trace_probe *tp = (struct trace_probe *)event->data; +	struct ftrace_event_file *file = data;  	switch (type) {  	case TRACE_REG_REGISTER: -		return enable_trace_probe(tp, TP_FLAG_TRACE); +		return enable_trace_probe(tp, file);  	case TRACE_REG_UNREGISTER: -		disable_trace_probe(tp, TP_FLAG_TRACE); -		return 0; +		return disable_trace_probe(tp, file);  #ifdef CONFIG_PERF_EVENTS  	case TRACE_REG_PERF_REGISTER: -		return enable_trace_probe(tp, TP_FLAG_PROFILE); +		return enable_trace_probe(tp, NULL);  	case TRACE_REG_PERF_UNREGISTER: -		disable_trace_probe(tp, TP_FLAG_PROFILE); -		
return 0; +		return disable_trace_probe(tp, NULL);  	case TRACE_REG_PERF_OPEN:  	case TRACE_REG_PERF_CLOSE:  	case TRACE_REG_PERF_ADD: @@ -1073,11 +1217,13 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)  {  	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); +	tp->nhit++; +  	if (tp->flags & TP_FLAG_TRACE) -		kprobe_trace_func(kp, regs); +		kprobe_trace_func(tp, regs);  #ifdef CONFIG_PERF_EVENTS  	if (tp->flags & TP_FLAG_PROFILE) -		kprobe_perf_func(kp, regs); +		kprobe_perf_func(tp, regs);  #endif  	return 0;	/* We don't tweek kernel, so just return 0 */  } @@ -1087,11 +1233,13 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)  {  	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); +	tp->nhit++; +  	if (tp->flags & TP_FLAG_TRACE) -		kretprobe_trace_func(ri, regs); +		kretprobe_trace_func(tp, ri, regs);  #ifdef CONFIG_PERF_EVENTS  	if (tp->flags & TP_FLAG_PROFILE) -		kretprobe_perf_func(ri, regs); +		kretprobe_perf_func(tp, ri, regs);  #endif  	return 0;	/* We don't tweek kernel, so just return 0 */  } @@ -1189,11 +1337,24 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,  	return a1 + a2 + a3 + a4 + a5 + a6;  } +static struct ftrace_event_file * +find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) +{ +	struct ftrace_event_file *file; + +	list_for_each_entry(file, &tr->events, list) +		if (file->event_call == &tp->call) +			return file; + +	return NULL; +} +  static __init int kprobe_trace_self_tests_init(void)  {  	int ret, warn = 0;  	int (*target)(int, int, int, int, int, int);  	struct trace_probe *tp; +	struct ftrace_event_file *file;  	target = kprobe_trace_selftest_target; @@ -1203,31 +1364,43 @@ static __init int kprobe_trace_self_tests_init(void)  				  "$stack $stack0 +0($stack)",  				  create_trace_probe);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error on probing function entry.\n"); +		pr_warn("error on probing function entry.\n");  		warn++;  	} else {  		/* Enable trace point */  		tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);  		if (WARN_ON_ONCE(tp == NULL)) { -			pr_warning("error on getting new probe.\n"); +			pr_warn("error on getting new probe.\n");  			warn++; -		} else -			enable_trace_probe(tp, TP_FLAG_TRACE); +		} else { +			file = find_trace_probe_file(tp, top_trace_array()); +			if (WARN_ON_ONCE(file == NULL)) { +				pr_warn("error on getting probe file.\n"); +				warn++; +			} else +				enable_trace_probe(tp, file); +		}  	}  	ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "  				  "$retval", create_trace_probe);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error on probing function return.\n"); +		pr_warn("error on probing function return.\n");  		warn++;  	} else {  		/* Enable trace point */  		tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);  		if (WARN_ON_ONCE(tp == NULL)) { -			pr_warning("error on getting new probe.\n"); +			pr_warn("error on getting 2nd new probe.\n");  			warn++; -		} else -			enable_trace_probe(tp, TP_FLAG_TRACE); +		} else { +			file = find_trace_probe_file(tp, top_trace_array()); +			if (WARN_ON_ONCE(file == NULL)) { +				pr_warn("error on getting probe file.\n"); +				warn++; +			} else +				enable_trace_probe(tp, file); +		}  	}  	if (warn) @@ -1238,27 +1411,39 @@ static __init int kprobe_trace_self_tests_init(void)  	/* Disable trace points before removing it */  	tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);  	if (WARN_ON_ONCE(tp == NULL)) { -		
pr_warning("error on getting test probe.\n"); +		pr_warn("error on getting test probe.\n");  		warn++; -	} else -		disable_trace_probe(tp, TP_FLAG_TRACE); +	} else { +		file = find_trace_probe_file(tp, top_trace_array()); +		if (WARN_ON_ONCE(file == NULL)) { +			pr_warn("error on getting probe file.\n"); +			warn++; +		} else +			disable_trace_probe(tp, file); +	}  	tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);  	if (WARN_ON_ONCE(tp == NULL)) { -		pr_warning("error on getting 2nd test probe.\n"); +		pr_warn("error on getting 2nd test probe.\n");  		warn++; -	} else -		disable_trace_probe(tp, TP_FLAG_TRACE); +	} else { +		file = find_trace_probe_file(tp, top_trace_array()); +		if (WARN_ON_ONCE(file == NULL)) { +			pr_warn("error on getting probe file.\n"); +			warn++; +		} else +			disable_trace_probe(tp, file); +	}  	ret = traceprobe_command("-:testprobe", create_trace_probe);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error on deleting a probe.\n"); +		pr_warn("error on deleting a probe.\n");  		warn++;  	}  	ret = traceprobe_command("-:testprobe2", create_trace_probe);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error on deleting a probe.\n"); +		pr_warn("error on deleting a probe.\n");  		warn++;  	} diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fd3c8aae55e..a5e8f4878bf 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)  	overrun_detected = false;  	prev_overruns = 0; -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  }  static int mmio_trace_init(struct trace_array *tr) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)  static unsigned long count_overruns(struct trace_iterator *iter)  {  	unsigned long cnt = atomic_xchg(&dropped_count, 0); -	unsigned long over = ring_buffer_overruns(iter->tr->buffer); +	unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);  	if (over > prev_overruns)  		cnt += over - prev_overruns; @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  				struct mmiotrace_rw *rw)  {  	struct ftrace_event_call *call = &event_mmiotrace_rw; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct trace_mmiotrace_rw *entry;  	int pc = preempt_count(); @@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  void mmio_trace_rw(struct mmiotrace_rw *rw)  {  	struct trace_array *tr = mmio_trace_array; -	struct trace_array_cpu *data = tr->data[smp_processor_id()]; +	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());  	__trace_mmiotrace_rw(tr, data, rw);  } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,  				struct mmiotrace_map *map)  {  	struct ftrace_event_call *call = &event_mmiotrace_map; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct trace_mmiotrace_map *entry;  	int pc = preempt_count(); @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)  	struct trace_array_cpu *data;  	preempt_disable(); -	data = tr->data[smp_processor_id()]; +	data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());  	__trace_mmiotrace_map(tr, data, map);  	preempt_enable();  } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 
697e88d1390..bb922d9ee51 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@  /* must be a power of 2 */  #define EVENT_HASHSIZE	128 -DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_sem);  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)  	return ret;  } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ +	struct trace_seq *s = &iter->seq; +	struct trace_entry *entry = iter->ent; +	struct bputs_entry *field; +	int ret; + +	trace_assign_type(field, entry); + +	ret = trace_seq_puts(s, field->str); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return TRACE_TYPE_HANDLED; +} +  enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq; @@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)  }  EXPORT_SYMBOL(ftrace_print_hex_seq); +int ftrace_raw_output_prep(struct trace_iterator *iter, +			   struct trace_event *trace_event) +{ +	struct ftrace_event_call *event; +	struct trace_seq *s = &iter->seq; +	struct trace_seq *p = &iter->tmp_seq; +	struct trace_entry *entry; +	int ret; + +	event = container_of(trace_event, struct ftrace_event_call, event); +	entry = iter->ent; + +	if (entry->type != event->event.type) { +		WARN_ON_ONCE(1); +		return TRACE_TYPE_UNHANDLED; +	} + +	trace_seq_init(p); +	ret = trace_seq_printf(s, "%s: ", event->name); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return 0; +} +EXPORT_SYMBOL(ftrace_raw_output_prep); +  #ifdef CONFIG_KRETPROBES  static inline const char *kretprobed(const char *name)  { @@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)  {  	unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;  	unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; -	unsigned long long abs_ts = iter->ts - iter->tr->time_start; +	unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;  	unsigned long long rel_ts = next_ts - iter->ts;  	struct trace_seq *s = &iter->seq; @@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)  void trace_event_read_lock(void)  { -	down_read(&trace_event_mutex); +	down_read(&trace_event_sem);  }  void trace_event_read_unlock(void)  { -	up_read(&trace_event_mutex); +	up_read(&trace_event_sem);  }  /** @@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)  	unsigned key;  	int ret = 0; -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	if (WARN_ON(!event))  		goto out; @@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)  	ret = event->type;   out: -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	return ret;  }  EXPORT_SYMBOL_GPL(register_ftrace_event);  /* - * Used by module code with the trace_event_mutex held for write. + * Used by module code with the trace_event_sem held for write.   
*/  int __unregister_ftrace_event(struct trace_event *event)  { @@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)   */  int unregister_ftrace_event(struct trace_event *event)  { -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	__unregister_ftrace_event(event); -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	return 0;  } @@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {  	.funcs		= &trace_user_stack_funcs,  }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, +		   struct trace_event *event) +{ +	struct trace_entry *entry = iter->ent; +	struct trace_seq *s = &iter->seq; +	struct bputs_entry *field; + +	trace_assign_type(field, entry); + +	if (!seq_print_ip_sym(s, field->ip, flags)) +		goto partial; + +	if (!trace_seq_puts(s, ": ")) +		goto partial; + +	if (!trace_seq_puts(s, field->str)) +		goto partial; + +	return TRACE_TYPE_HANDLED; + + partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, +		struct trace_event *event) +{ +	struct bputs_entry *field; +	struct trace_seq *s = &iter->seq; + +	trace_assign_type(field, iter->ent); + +	if (!trace_seq_printf(s, ": %lx : ", field->ip)) +		goto partial; + +	if (!trace_seq_puts(s, field->str)) +		goto partial; + +	return TRACE_TYPE_HANDLED; + + partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { +	.trace		= trace_bputs_print, +	.raw		= trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { +	.type		= TRACE_BPUTS, +	.funcs		= &trace_bputs_funcs, +}; +  /* TRACE_BPRINT */  static enum print_line_t  trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {  	&trace_wake_event,  	&trace_stack_event,  	&trace_user_stack_event, +	&trace_bputs_event,  	&trace_bprint_event,  	&trace_print_event,  	NULL diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492..127a9d8c835 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -5,6 +5,8 @@  #include "trace.h"  extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t  trace_print_bprintk_msg_only(struct trace_iterator *iter);  extern enum print_line_t  trace_print_printk_msg_only(struct trace_iterator *iter); @@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);  /* used by module unregistering */  extern int __unregister_ftrace_event(struct trace_event *event); -extern struct rw_semaphore trace_event_mutex; +extern struct rw_semaphore trace_event_sem;  #define MAX_MEMHEX_BYTES	8  #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3374c792ccd..4e98e3b257a 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,  			   unsigned long flags, int pc)  {  	struct ftrace_event_call *call = &event_context_switch; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n  	pc = preempt_count();  	local_irq_save(flags);  	cpu = 
raw_smp_processor_id(); -	data = ctx_trace->data[cpu]; +	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);  	if (likely(!atomic_read(&data->disabled)))  		tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	struct ftrace_event_call *call = &event_wakeup;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,  					  sizeof(*entry), flags, pc); @@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)  	pc = preempt_count();  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = ctx_trace->data[cpu]; +	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);  	if (likely(!atomic_read(&data->disabled)))  		tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fde652c9a51..fee77e15d81 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -37,6 +37,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace);  static void wakeup_graph_return(struct ftrace_graph_ret *trace);  static int save_flags; +static bool function_enabled;  #define TRACE_DISPLAY_GRAPH     1 @@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,  	if (cpu != wakeup_current_cpu)  		goto out_enable; -	*data = tr->data[cpu]; +	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&(*data)->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =  };  #endif /* CONFIG_FUNCTION_TRACER */ -static int start_func_tracer(int graph) +static int register_wakeup_function(int graph, int set)  {  	int ret; -	if (!graph) -		ret = register_ftrace_function(&trace_ops); -	else +	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ +	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) +		return 0; + +	if (graph)  		ret = register_ftrace_graph(&wakeup_graph_return,  					    &wakeup_graph_entry); +	else +		ret = register_ftrace_function(&trace_ops); + +	if (!ret) +		function_enabled = true; + +	return ret; +} + +static void unregister_wakeup_function(int graph) +{ +	if (!function_enabled) +		return; + +	if (graph) +		unregister_ftrace_graph(); +	else +		unregister_ftrace_function(&trace_ops); + +	function_enabled = false; +} + +static void wakeup_function_set(int set) +{ +	if (set) +		register_wakeup_function(is_graph(), 1); +	else +		unregister_wakeup_function(is_graph()); +} + +static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +{ +	if (mask & TRACE_ITER_FUNCTION) +		wakeup_function_set(set); + +	return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(int graph) +{ +	int ret; + +	ret = register_wakeup_function(graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)  {  	tracer_enabled = 0; -	if (!graph) -		unregister_ftrace_function(&trace_ops); -	else -		unregister_ftrace_graph(); +	unregister_wakeup_function(graph);  }  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,  	/* disable local data, not wakeup_cpu data */  	cpu = raw_smp_processor_id(); -	disabled = 
atomic_inc_return(&wakeup_trace->data[cpu]->disabled); +	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  	if (likely(disabled != 1))  		goto out; @@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,  		goto out_unlock;  	/* The task we are waiting for is waking up */ -	data = wakeup_trace->data[wakeup_cpu]; +	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);  	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);  	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -387,7 +430,7 @@ out_unlock:  	arch_spin_unlock(&wakeup_lock);  	local_irq_restore(flags);  out: -	atomic_dec(&wakeup_trace->data[cpu]->disabled); +	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  }  static void __wakeup_reset(struct trace_array *tr) @@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)  {  	unsigned long flags; -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	local_irq_save(flags);  	arch_spin_lock(&wakeup_lock); @@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  		return;  	pc = preempt_count(); -	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); +	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  	local_save_flags(flags); -	data = wakeup_trace->data[wakeup_cpu]; +	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);  	data->preempt_timestamp = ftrace_now(cpu);  	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  out_locked:  	arch_spin_unlock(&wakeup_lock);  out: -	atomic_dec(&wakeup_trace->data[cpu]->disabled); +	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  }  static void start_wakeup_tracer(struct trace_array *tr) @@ -543,8 +586,8 @@ static int __wakeup_tracer_init(struct trace_array *tr)  	save_flags = trace_flags;  	/* non overwrite screws up the latency tracers */ -	set_tracer_flag(TRACE_ITER_OVERWRITE, 1); -	set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);  	tracing_max_latency = 0;  	wakeup_trace = tr; @@ -573,8 +616,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)  	/* make sure we put back any tasks we are tracing */  	wakeup_reset(tr); -	set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); -	set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);  }  static void wakeup_tracer_start(struct trace_array *tr) @@ -600,7 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =  	.print_line	= wakeup_print_line,  	.flags		= &tracer_flags,  	.set_flag	= wakeup_set_flag, -	.flag_changed	= trace_keep_overwrite, +	.flag_changed	= wakeup_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_wakeup,  #endif @@ -622,7 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =  	.print_line	= wakeup_print_line,  	.flags		= &tracer_flags,  	.set_flag	= wakeup_set_flag, -	.flag_changed	= trace_keep_overwrite, +	.flag_changed	= wakeup_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_wakeup,  #endif 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 51c819c12c2..55e2cf66967 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)  	return 0;  } -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)  {  	struct ring_buffer_event *event;  	struct trace_entry *entry;  	unsigned int loops = 0; -	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { +	while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {  		entry = ring_buffer_event_data(event);  		/* @@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)   * Test the trace buffer to see if all the elements   * are still sane.   */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)  {  	unsigned long flags, cnt = 0;  	int cpu, ret = 0; @@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)  	local_irq_save(flags);  	arch_spin_lock(&ftrace_max_lock); -	cnt = ring_buffer_entries(tr->buffer); +	cnt = ring_buffer_entries(buf->buffer);  	/*  	 * The trace_test_buffer_cpu runs a while loop to consume all data. @@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)  	 */  	tracing_off();  	for_each_possible_cpu(cpu) { -		ret = trace_test_buffer_cpu(tr, cpu); +		ret = trace_test_buffer_cpu(buf, cpu);  		if (ret)  			break;  	} @@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	msleep(100);  	/* we should have nothing in the buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	if (ret)  		goto out; @@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	ftrace_enabled = 0;  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	tracing_start();  	/* we should only have one item */ @@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  	ftrace_enabled = 0;  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  /* Maximum number of functions to trace before diagnosing a hang */  #define GRAPH_MAX_FUNC_TEST	100000000 -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);  static unsigned int graph_hang_thresh;  /* Wrap the real function entry probe to avoid possible hanging */ @@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)  	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {  		ftrace_graph_stop();  		printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); -		if (ftrace_dump_on_oops) -			__ftrace_dump(false, DUMP_ALL); +		if (ftrace_dump_on_oops) { +			ftrace_dump(DUMP_ALL); +			/* ftrace_dump() disables tracing */ +			tracing_on(); +		}  		return 0;  	} @@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,  	 * Simulate the init() callback but we attach a watchdog callback  	 * to detect and recover from possible hangs  	 */ -	tracing_reset_online_cpus(tr); +	
tracing_reset_online_cpus(&tr->trace_buffer);  	set_graph_array(tr);  	ret = register_ftrace_graph(&trace_graph_return,  				    &trace_graph_entry_watchdog); @@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (ret)  		goto out; -	ret = trace_test_buffer(&max_tr, &count); +	ret = trace_test_buffer(&tr->max_buffer, &count);  	if (ret)  		goto out; @@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (ret)  		goto out; -	ret = trace_test_buffer(&max_tr, &count); +	ret = trace_test_buffer(&tr->max_buffer, &count);  	if (!ret && !count) {  		printk(KERN_CONT ".. no entries found .."); @@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	printk("ret = %d\n", ret);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr); @@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr  	/* stop the tracing. */  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc70..b20428c5efe 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -20,13 +20,24 @@  #define STACK_TRACE_ENTRIES 500 +#ifdef CC_USING_FENTRY +# define fentry		1 +#else +# define fentry		0 +#endif +  static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =  	 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };  static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. 
+ */  static struct stack_trace max_stack_trace = { -	.max_entries		= STACK_TRACE_ENTRIES, -	.entries		= stack_dump_trace, +	.max_entries		= STACK_TRACE_ENTRIES - 1, +	.entries		= &stack_dump_trace[1],  };  static unsigned long max_stack_size; @@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);  int stack_tracer_enabled;  static int last_stack_tracer_enabled; -static inline void check_stack(void) +static inline void +check_stack(unsigned long ip, unsigned long *stack)  {  	unsigned long this_size, flags;  	unsigned long *p, *top, *start; +	static int tracer_frame; +	int frame_size = ACCESS_ONCE(tracer_frame);  	int i; -	this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); +	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);  	this_size = THREAD_SIZE - this_size; +	/* Remove the frame of the tracer */ +	this_size -= frame_size;  	if (this_size <= max_stack_size)  		return;  	/* we do not handle interrupt stacks yet */ -	if (!object_is_on_stack(&this_size)) +	if (!object_is_on_stack(stack))  		return;  	local_irq_save(flags);  	arch_spin_lock(&max_stack_lock); +	/* In case another CPU set the tracer_frame on us */ +	if (unlikely(!frame_size)) +		this_size -= tracer_frame; +  	/* a race could have already updated it */  	if (this_size <= max_stack_size)  		goto out; @@ -70,10 +90,18 @@ static inline void check_stack(void)  	save_stack_trace(&max_stack_trace);  	/* +	 * Add the passed in ip from the function tracer. +	 * Searching for this on the stack will skip over +	 * most of the overhead from the stack tracer itself. +	 */ +	stack_dump_trace[0] = ip; +	max_stack_trace.nr_entries++; + +	/*  	 * Now find where in the stack these are.  	 */  	i = 0; -	start = &this_size; +	start = stack;  	top = (unsigned long *)  		(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -97,6 +125,18 @@ static inline void check_stack(void)  				found = 1;  				/* Start the search from here */  				start = p + 1; +				/* +				 * We do not want to show the overhead +				 * of the stack tracer stack in the +				 * max stack. If we haven't figured +				 * out what that is, then figure it out +				 * now. +				 */ +				if (unlikely(!tracer_frame) && i == 1) { +					tracer_frame = (p - stack) * +						sizeof(unsigned long); +					max_stack_size -= tracer_frame; +				}  			}  		} @@ -113,6 +153,7 @@ static void  stack_trace_call(unsigned long ip, unsigned long parent_ip,  		 struct ftrace_ops *op, struct pt_regs *pt_regs)  { +	unsigned long stack;  	int cpu;  	preempt_disable_notrace(); @@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,  	if (per_cpu(trace_active, cpu)++ != 0)  		goto out; -	check_stack(); +	/* +	 * When fentry is used, the traced function does not get +	 * its stack frame set up, and we lose the parent. +	 * The ip is pretty useless because the function tracer +	 * was called before that function set up its stack frame. +	 * In this case, we use the parent ip. +	 * +	 * By adding the return address of either the parent ip +	 * or the current ip we can disregard most of the stack usage +	 * caused by the stack tracer itself. +	 * +	 * The function tracer always reports the address of where the +	 * mcount call was, but the stack will hold the return address. 
+	 */ +	if (fentry) +		ip = parent_ip; +	else +		ip += MCOUNT_INSN_SIZE; + +	check_stack(ip, &stack);   out:  	per_cpu(trace_active, cpu)--; @@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {  	.open = stack_trace_filter_open,  	.read = seq_read,  	.write = ftrace_filter_write, -	.llseek = ftrace_regex_lseek, +	.llseek = ftrace_filter_lseek,  	.release = ftrace_regex_release,  }; @@ -371,6 +431,8 @@ static __init int stack_trace_init(void)  	struct dentry *d_tracer;  	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0;  	trace_create_file("stack_max_size", 0644, d_tracer,  			&max_stack_size, &stack_max_size_fops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e7..847f88a6194 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -307,6 +307,8 @@ static int tracing_stat_init(void)  	struct dentry *d_tracing;  	d_tracing = tracing_init_dentry(); +	if (!d_tracing) +		return 0;  	stat_dir = debugfs_create_dir("trace_stat", d_tracing);  	if (!stat_dir) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7a809e32105..8f2ac73c7a5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -12,10 +12,6 @@  #include "trace.h"  static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);  static int syscall_enter_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data); @@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name  	/*  	 * Only compare after the "sys" prefix. Archs that use  	 * syscall wrappers may have syscalls symbols aliases prefixed -	 * with "SyS" instead of "sys", leading to an unwanted +	 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted  	 * mismatch.  	 
*/  	return !strcmp(sym + 3, name + 3); @@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)  		kfree(call->print_fmt);  } -static int syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct ftrace_event_call *call)  {  	struct syscall_trace_enter trace;  	struct syscall_metadata *meta = call->data; @@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)  	return ret;  } -static int syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct ftrace_event_call *call)  {  	struct syscall_trace_exit trace;  	int ret; @@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)  	return ret;  } -static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  { +	struct trace_array *tr = data;  	struct syscall_trace_enter *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event; @@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, enabled_enter_syscalls)) +	if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr); @@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; -	event = trace_current_buffer_lock_reserve(&buffer, +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer,  			sys_data->enter_event->event.type, size, 0, 0);  	if (!event)  		return; @@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  		trace_current_buffer_unlock_commit(buffer, event, 0, 0);  } -static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  { +	struct trace_array *tr = data;  	struct syscall_trace_exit *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event; @@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, enabled_exit_syscalls)) +	if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr);  	if (!sys_data)  		return; -	event = trace_current_buffer_lock_reserve(&buffer, +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer,  			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);  	if (!event)  		return; @@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  		trace_current_buffer_unlock_commit(buffer, event, 0, 0);  } -static int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_file *file, +				   struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int ret = 0;  	int num; @@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return -ENOSYS;  	mutex_lock(&syscall_trace_lock); -	if (!sys_refcount_enter) -		ret = 
register_trace_sys_enter(ftrace_syscall_enter, NULL); +	if (!tr->sys_refcount_enter) +		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);  	if (!ret) { -		set_bit(num, enabled_enter_syscalls); -		sys_refcount_enter++; +		set_bit(num, tr->enabled_enter_syscalls); +		tr->sys_refcount_enter++;  	}  	mutex_unlock(&syscall_trace_lock);  	return ret;  } -static void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_file *file, +				      struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr;  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return;  	mutex_lock(&syscall_trace_lock); -	sys_refcount_enter--; -	clear_bit(num, enabled_enter_syscalls); -	if (!sys_refcount_enter) -		unregister_trace_sys_enter(ftrace_syscall_enter, NULL); +	tr->sys_refcount_enter--; +	clear_bit(num, tr->enabled_enter_syscalls); +	if (!tr->sys_refcount_enter) +		unregister_trace_sys_enter(ftrace_syscall_enter, tr);  	mutex_unlock(&syscall_trace_lock);  } -static int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_file *file, +				  struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int ret = 0;  	int num; @@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return -ENOSYS;  	mutex_lock(&syscall_trace_lock); -	if (!sys_refcount_exit) -		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); +	if (!tr->sys_refcount_exit) +		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);  	if (!ret) { -		set_bit(num, enabled_exit_syscalls); -		sys_refcount_exit++; +		set_bit(num, tr->enabled_exit_syscalls); +		tr->sys_refcount_exit++;  	}  	mutex_unlock(&syscall_trace_lock);  	return ret;  } -static void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_file *file, +				     struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr;  	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return;  	mutex_lock(&syscall_trace_lock); -	sys_refcount_exit--; -	clear_bit(num, enabled_exit_syscalls); -	if (!sys_refcount_exit) -		unregister_trace_sys_exit(ftrace_syscall_exit, NULL); +	tr->sys_refcount_exit--; +	clear_bit(num, tr->enabled_exit_syscalls); +	if (!tr->sys_refcount_exit) +		unregister_trace_sys_exit(ftrace_syscall_exit, tr);  	mutex_unlock(&syscall_trace_lock);  } @@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {  	.trace		= print_syscall_exit,  }; -struct ftrace_event_class event_class_syscall_enter = { +struct ftrace_event_class __refdata event_class_syscall_enter = {  	.system		= "syscalls",  	.reg		= syscall_enter_register,  	.define_fields	= syscall_enter_define_fields, @@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {  	.raw_init	= init_syscall_trace,  }; -struct ftrace_event_class event_class_syscall_exit = { +struct ftrace_event_class __refdata event_class_syscall_exit = {  	.system		= "syscalls",  	.reg		= syscall_exit_register,  	.define_fields	= syscall_exit_define_fields, @@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)  static int syscall_enter_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data)  { +	struct 
ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return reg_event_syscall_enter(event); +		return reg_event_syscall_enter(file, event);  	case TRACE_REG_UNREGISTER: -		unreg_event_syscall_enter(event); +		unreg_event_syscall_enter(file, event);  		return 0;  #ifdef CONFIG_PERF_EVENTS @@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,  static int syscall_exit_register(struct ftrace_event_call *event,  				 enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return reg_event_syscall_exit(event); +		return reg_event_syscall_exit(file, event);  	case TRACE_REG_UNREGISTER: -		unreg_event_syscall_exit(event); +		unreg_event_syscall_exit(file, event);  		return 0;  #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8dad2a92dee..32494fb0ee6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,18 @@  #define UPROBE_EVENT_SYSTEM	"uprobes" +struct uprobe_trace_entry_head { +	struct trace_entry	ent; +	unsigned long		vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return)			\ +	(sizeof(struct uprobe_trace_entry_head) +	\ +	 sizeof(unsigned long) * (is_return ? 2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return)		\ +	((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) +  struct trace_uprobe_filter {  	rwlock_t		rwlock;  	int			nr_systemwide; @@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);  static LIST_HEAD(uprobe_list);  static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, +				unsigned long func, struct pt_regs *regs);  static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)  { @@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)  	return !filter->nr_systemwide && list_empty(&filter->perf_events);  } +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ +	return tu->consumer.ret_handler != NULL; +} +  /*   * Allocate new trace_uprobe and initialize it (including uprobes).   
*/  static struct trace_uprobe * -alloc_trace_uprobe(const char *group, const char *event, int nargs) +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)  {  	struct trace_uprobe *tu; @@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)  	INIT_LIST_HEAD(&tu->list);  	tu->consumer.handler = uprobe_dispatcher; +	if (is_ret) +		tu->consumer.ret_handler = uretprobe_dispatcher;  	init_trace_uprobe_filter(&tu->filter);  	return tu; @@ -180,7 +201,7 @@ end:  /*   * Argument syntax: - *  - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] + *  - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]   *   *  - Remove uprobe: -:[GRP/]EVENT   */ @@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)  	char buf[MAX_EVENT_NAME_LEN];  	struct path path;  	unsigned long offset; -	bool is_delete; +	bool is_delete, is_return;  	int i, ret;  	inode = NULL;  	ret = 0;  	is_delete = false; +	is_return = false;  	event = NULL;  	group = NULL;  	/* argc must be >= 1 */  	if (argv[0][0] == '-')  		is_delete = true; +	else if (argv[0][0] == 'r') +		is_return = true;  	else if (argv[0][0] != 'p') { -		pr_info("Probe definition must be started with 'p' or '-'.\n"); +		pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");  		return -EINVAL;  	} @@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)  		kfree(tail);  	} -	tu = alloc_trace_uprobe(group, event, argc); +	tu = alloc_trace_uprobe(group, event, argc, is_return);  	if (IS_ERR(tu)) {  		pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));  		ret = PTR_ERR(tu); @@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)  static int probes_seq_show(struct seq_file *m, void *v)  {  	struct trace_uprobe *tu = v; +	char c = is_ret_probe(tu) ? 
'r' : 'p';  	int i; -	seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); +	seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);  	seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);  	for (i = 0; i < tu->nr_args; i++) @@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {  	.release	= seq_release,  }; -/* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_trace_print(struct trace_uprobe *tu, +				unsigned long func, struct pt_regs *regs)  {  	struct uprobe_trace_entry_head *entry;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; -	u8 *data; -	int size, i, pc; -	unsigned long irq_flags; +	void *data; +	int size, i;  	struct ftrace_event_call *call = &tu->call; -	local_save_flags(irq_flags); -	pc = preempt_count(); - -	size = sizeof(*entry) + tu->size; - +	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));  	event = trace_current_buffer_lock_reserve(&buffer, call->event.type, -						  size, irq_flags, pc); +						  size + tu->size, 0, 0);  	if (!event) -		return 0; +		return;  	entry = ring_buffer_event_data(event); -	entry->ip = instruction_pointer(task_pt_regs(current)); -	data = (u8 *)&entry[1]; +	if (is_ret_probe(tu)) { +		entry->vaddr[0] = func; +		entry->vaddr[1] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		entry->vaddr[0] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, false); +	} +  	for (i = 0; i < tu->nr_args; i++)  		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);  	if (!filter_current_check_discard(buffer, call, entry, event)) -		trace_buffer_unlock_commit(buffer, event, irq_flags, pc); +		trace_buffer_unlock_commit(buffer, event, 0, 0); +} +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ +	if (!is_ret_probe(tu)) +		uprobe_trace_print(tu, 0, regs);  	return 0;  } +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, +				struct pt_regs *regs) +{ +	uprobe_trace_print(tu, func, regs); +} +  /* Event entry printers */  static enum print_line_t  print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)  { -	struct uprobe_trace_entry_head *field; +	struct uprobe_trace_entry_head *entry;  	struct trace_seq *s = &iter->seq;  	struct trace_uprobe *tu;  	u8 *data;  	int i; -	field = (struct uprobe_trace_entry_head *)iter->ent; +	entry = (struct uprobe_trace_entry_head *)iter->ent;  	tu = container_of(event, struct trace_uprobe, call.event); -	if (!trace_seq_printf(s, "%s: (", tu->call.name)) -		goto partial; - -	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) -		goto partial; - -	if (!trace_seq_puts(s, ")")) -		goto partial; +	if (is_ret_probe(tu)) { +		if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, +					entry->vaddr[1], entry->vaddr[0])) +			goto partial; +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, +					entry->vaddr[0])) +			goto partial; +		data = DATAOF_TRACE_ENTRY(entry, false); +	} -	data = (u8 *)&field[1];  	for (i = 0; i < tu->nr_args; i++) {  		if (!tu->args[i].type->print(s, tu->args[i].name, -					     data + tu->args[i].offset, field)) +					     data + tu->args[i].offset, entry))  			goto partial;  	} @@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)  static int uprobe_event_define_fields(struct ftrace_event_call 
*event_call)  { -	int ret, i; +	int ret, i, size;  	struct uprobe_trace_entry_head field; -	struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; +	struct trace_uprobe *tu = event_call->data; -	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); +	if (is_ret_probe(tu)) { +		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); +		DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); +		size = SIZEOF_TRACE_ENTRY(true); +	} else { +		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); +		size = SIZEOF_TRACE_ENTRY(false); +	}  	/* Set argument names as fields */  	for (i = 0; i < tu->nr_args; i++) {  		ret = trace_define_field(event_call, tu->args[i].type->fmttype,  					 tu->args[i].name, -					 sizeof(field) + tu->args[i].offset, +					 size + tu->args[i].offset,  					 tu->args[i].type->size,  					 tu->args[i].type->is_signed,  					 FILTER_OTHER); @@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)  	int i;  	int pos = 0; -	fmt = "(%lx)"; -	arg = "REC->" FIELD_STRING_IP; +	if (is_ret_probe(tu)) { +		fmt = "(%lx <- %lx)"; +		arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; +	} else { +		fmt = "(%lx)"; +		arg = "REC->" FIELD_STRING_IP; +	}  	/* When len=0, we just calculate the needed length */ @@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,  	return ret;  } -/* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_perf_print(struct trace_uprobe *tu, +				unsigned long func, struct pt_regs *regs)  {  	struct ftrace_event_call *call = &tu->call;  	struct uprobe_trace_entry_head *entry;  	struct hlist_head *head; -	u8 *data; -	int size, __size, i; -	int rctx; +	void *data; +	int size, rctx, i; -	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) -		return UPROBE_HANDLER_REMOVE; - -	__size = sizeof(*entry) + tu->size; -	size = ALIGN(__size + sizeof(u32), sizeof(u64)); -	size -= sizeof(u32); +	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); +	size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);  	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) -		return 0; +		return;  	preempt_disable(); +	head = this_cpu_ptr(call->perf_events); +	if (hlist_empty(head)) +		goto out;  	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);  	if (!entry)  		goto out; -	entry->ip = instruction_pointer(task_pt_regs(current)); -	data = (u8 *)&entry[1]; +	if (is_ret_probe(tu)) { +		entry->vaddr[0] = func; +		entry->vaddr[1] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		entry->vaddr[0] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, false); +	} +  	for (i = 0; i < tu->nr_args; i++)  		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -	head = this_cpu_ptr(call->perf_events); -	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); - +	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);   out:  	preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ +	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) +		return UPROBE_HANDLER_REMOVE; + +	if (!is_ret_probe(tu)) +		uprobe_perf_print(tu, 0, regs);  	return 0;  } + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, +				struct pt_regs *regs) +{ +	uprobe_perf_print(tu, func, regs); +}  #endif	/* 
CONFIG_PERF_EVENTS */  static  int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)  { -	struct trace_uprobe *tu = (struct trace_uprobe *)event->data; +	struct trace_uprobe *tu = event->data;  	switch (type) {  	case TRACE_REG_REGISTER: @@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)  	return ret;  } +static int uretprobe_dispatcher(struct uprobe_consumer *con, +				unsigned long func, struct pt_regs *regs) +{ +	struct trace_uprobe *tu; + +	tu = container_of(con, struct trace_uprobe, consumer); + +	if (tu->flags & TP_FLAG_TRACE) +		uretprobe_trace_func(tu, func, regs); + +#ifdef CONFIG_PERF_EVENTS +	if (tu->flags & TP_FLAG_PROFILE) +		uretprobe_perf_func(tu, func, regs); +#endif +	return 0; +} +  static struct trace_event_functions uprobe_funcs = {  	.trace		= print_uprobe_event  }; | 
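The trace_uprobe.c hunks above replace the old fixed `ip` field with a flexible `vaddr[]` array sized by SIZEOF_TRACE_ENTRY()/DATAOF_TRACE_ENTRY(). The following is a standalone user-space sketch, not kernel code: the two macros are copied from the diff, while `struct trace_entry` is a stand-in with assumed fields (the real definition lives in kernel/trace/trace.h). It only illustrates how an entry probe ('p') reserves one address slot and a return probe ('r') reserves two, with fetched arguments packed immediately after.

```c
#include <stdio.h>

struct trace_entry {			/* stand-in; assumed layout, not the kernel's */
	unsigned short	type;
	unsigned char	flags;
	unsigned char	preempt_count;
	int		pid;
};

struct uprobe_trace_entry_head {
	struct trace_entry	ent;
	unsigned long		vaddr[];	/* 1 slot for 'p', 2 for 'r' */
};

#define SIZEOF_TRACE_ENTRY(is_return)			\
	(sizeof(struct uprobe_trace_entry_head) +	\
	 sizeof(unsigned long) * (is_return ? 2 : 1))

/* void * arithmetic is a GCC extension, used here exactly as in the diff */
#define DATAOF_TRACE_ENTRY(entry, is_return)		\
	((void *)(entry) + SIZEOF_TRACE_ENTRY(is_return))

int main(void)
{
	unsigned long buf[8] = { 0 };	/* big enough and suitably aligned */
	struct uprobe_trace_entry_head *entry = (void *)buf;

	/*
	 * An entry probe stores only the probed instruction pointer in
	 * vaddr[0]; a return probe stores the function address in vaddr[0]
	 * and the return address in vaddr[1], so its header is one
	 * unsigned long larger.  call_fetch() results start right after.
	 */
	printf("'p' header %zu bytes, args at +%td\n", SIZEOF_TRACE_ENTRY(0),
	       (char *)DATAOF_TRACE_ENTRY(entry, 0) - (char *)entry);
	printf("'r' header %zu bytes, args at +%td\n", SIZEOF_TRACE_ENTRY(1),
	       (char *)DATAOF_TRACE_ENTRY(entry, 1) - (char *)entry);
	return 0;
}
```

This is also why print_uprobe_event() above prints "(%lx <- %lx)" for return probes: vaddr[1] is the return site and vaddr[0] the called function.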
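Similarly, the trace_syscalls.c hunks earlier in this diff move the enabled-syscall bitmaps and refcounts from globals into struct trace_array, so each trace instance registers the sys_enter/sys_exit tracepoints with its own `tr` and filters on its own bitmap. The sketch below is a reduced user-space model under assumptions: `NR_SYSCALLS`, the `set_bit_ul`/`test_bit_ul` helpers, and the cut-down `trace_array` are illustrative stand-ins, and the real code additionally takes syscall_trace_lock and handles registration errors.

```c
#include <stdio.h>

#define NR_SYSCALLS	64			/* illustrative, not the real NR_syscalls */
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define BITMAP_LONGS	((NR_SYSCALLS + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct trace_array {				/* reduced stand-in for the kernel struct */
	unsigned long	enabled_enter_syscalls[BITMAP_LONGS];
	int		sys_refcount_enter;
};

static void set_bit_ul(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit_ul(int nr, const unsigned long *map)
{
	return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

/* Mirrors the shape of reg_event_syscall_enter(): the tracepoint callback is
 * registered once per instance, on its first enabled syscall event. */
static void reg_enter(struct trace_array *tr, int syscall_nr)
{
	if (!tr->sys_refcount_enter)
		printf("register_trace_sys_enter() for instance %p\n", (void *)tr);
	set_bit_ul(syscall_nr, tr->enabled_enter_syscalls);
	tr->sys_refcount_enter++;
}

int main(void)
{
	struct trace_array top = { { 0 }, 0 }, instance = { { 0 }, 0 };

	reg_enter(&top, 1);		/* enable one syscall in the top-level buffer */
	reg_enter(&instance, 2);	/* enable a different one in an instance */

	/* ftrace_syscall_enter() now tests the bitmap of the trace_array it
	 * was registered with, so each buffer records only its own set. */
	printf("top traces nr 2?      %d\n", test_bit_ul(2, top.enabled_enter_syscalls));
	printf("instance traces nr 2? %d\n", test_bit_ul(2, instance.enabled_enter_syscalls));
	return 0;
}
```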
