Diffstat (limited to 'kernel/trace')
42 files changed, 18615 insertions, 5597 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e04b8bcdef8..d4409356f40 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE  	help  	  See Documentation/trace/ftrace-design.txt +config HAVE_DYNAMIC_FTRACE_WITH_REGS +	bool +  config HAVE_FTRACE_MCOUNT_RECORD  	bool  	help @@ -49,6 +52,11 @@ config HAVE_SYSCALL_TRACEPOINTS  	help  	  See Documentation/trace/ftrace-design.txt +config HAVE_FENTRY +	bool +	help +	  Arch supports the gcc options -pg with -mfentry +  config HAVE_C_RECORDMCOUNT  	bool  	help @@ -57,8 +65,13 @@ config HAVE_C_RECORDMCOUNT  config TRACER_MAX_TRACE  	bool +config TRACE_CLOCK +	bool +  config RING_BUFFER  	bool +	select TRACE_CLOCK +	select IRQ_WORK  config FTRACE_NMI_ENTER         bool @@ -94,6 +107,7 @@ config TRACING  	select NOP_TRACER  	select BINARY_PRINTF  	select EVENT_TRACING +	select TRACE_CLOCK  config GENERIC_TRACER  	bool @@ -126,7 +140,6 @@ if FTRACE  config FUNCTION_TRACER  	bool "Kernel Function Tracer"  	depends on HAVE_FUNCTION_TRACER -	select FRAME_POINTER if (!ARM_UNWIND)  	select KALLSYMS  	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER @@ -163,6 +176,8 @@ config IRQSOFF_TRACER  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP +	select TRACER_SNAPSHOT +	select TRACER_SNAPSHOT_PER_CPU_SWAP  	help  	  This option measures the time spent in irqs-off critical  	  sections, with microsecond accuracy. @@ -185,6 +200,8 @@ config PREEMPT_TRACER  	select GENERIC_TRACER  	select TRACER_MAX_TRACE  	select RING_BUFFER_ALLOW_SWAP +	select TRACER_SNAPSHOT +	select TRACER_SNAPSHOT_PER_CPU_SWAP  	help  	  This option measures the time spent in preemption-off critical  	  sections, with microsecond accuracy. @@ -204,6 +221,7 @@ config SCHED_TRACER  	select GENERIC_TRACER  	select CONTEXT_SWITCH_TRACER  	select TRACER_MAX_TRACE +	select TRACER_SNAPSHOT  	help  	  This tracer tracks the latency of the highest priority task  	  to be scheduled in, starting from the point it has woken up. @@ -225,6 +243,37 @@ config FTRACE_SYSCALLS  	help  	  Basic tracer to catch the syscall entry and exit events. +config TRACER_SNAPSHOT +	bool "Create a snapshot trace buffer" +	select TRACER_MAX_TRACE +	help +	  Allow tracing users to take snapshot of the current buffer using the +	  ftrace interface, e.g.: + +	      echo 1 > /sys/kernel/debug/tracing/snapshot +	      cat snapshot + +config TRACER_SNAPSHOT_PER_CPU_SWAP +        bool "Allow snapshot to swap per CPU" +	depends on TRACER_SNAPSHOT +	select RING_BUFFER_ALLOW_SWAP +	help +	  Allow doing a snapshot of a single CPU buffer instead of a +	  full swap (all buffers). If this is set, then the following is +	  allowed: + +	      echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + +	  After which, only the tracing buffer for CPU 2 was swapped with +	  the main tracing buffer, and the other CPU buffers remain the same. + +	  When this is enabled, this adds a little more overhead to the +	  trace recording, as it needs to add some checks to synchronize +	  recording with swaps. But this does not affect the performance +	  of the overall system. This is enabled by default when the preempt +	  or irq latency tracers are enabled, as those need to swap as well +	  and already adds the overhead (plus a lot more). 
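The TRACER_SNAPSHOT and TRACER_SNAPSHOT_PER_CPU_SWAP help text above describes the snapshot file interface. The following is a minimal user-space sketch of that interface, not taken from this patch; it assumes debugfs is mounted at /sys/kernel/debug and that CONFIG_TRACER_SNAPSHOT is enabled.

#include <stdio.h>

int main(void)
{
	const char *snap = "/sys/kernel/debug/tracing/snapshot";
	char line[256];
	FILE *f;

	/* Writing '1' swaps the live trace buffer into the snapshot buffer. */
	f = fopen(snap, "w");
	if (!f) {
		perror("snapshot");
		return 1;
	}
	fputs("1\n", f);
	fclose(f);

	/* Reading the same file dumps the snapshot, like `cat snapshot`. */
	f = fopen(snap, "r");
	if (!f) {
		perror("snapshot");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return 0;
}

With TRACER_SNAPSHOT_PER_CPU_SWAP, the same write against /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot would swap only CPU 2's buffer, as the help text explains.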
+  config TRACE_BRANCH_PROFILING  	bool  	select GENERIC_TRACER @@ -257,10 +306,10 @@ config PROFILE_ANNOTATED_BRANCHES  	bool "Trace likely/unlikely profiler"  	select TRACE_BRANCH_PROFILING  	help -	  This tracer profiles all the the likely and unlikely macros +	  This tracer profiles all likely and unlikely macros  	  in the kernel. It will display the results in: -	  /sys/kernel/debug/tracing/profile_annotated_branch +	  /sys/kernel/debug/tracing/trace_stat/branch_annotated  	  Note: this will add a significant overhead; only turn this  	  on if you need to profile the system's use of these macros. @@ -273,7 +322,7 @@ config PROFILE_ALL_BRANCHES  	  taken in the kernel is recorded whether it hit or miss.  	  The results will be displayed in: -	  /sys/kernel/debug/tracing/profile_branch +	  /sys/kernel/debug/tracing/trace_stat/branch_all  	  This option also enables the likely/unlikely profiler. @@ -358,6 +407,7 @@ config KPROBE_EVENT  	depends on HAVE_REGS_AND_STACK_ACCESS_API  	bool "Enable kprobes-based dynamic events"  	select TRACING +	select PROBE_EVENTS  	default y  	help  	  This allows the user to add tracing events (similar to tracepoints) @@ -370,24 +420,53 @@ config KPROBE_EVENT  	  This option is also required by perf-probe subcommand of perf tools.  	  If you want to use perf tools, this option is strongly recommended. +config UPROBE_EVENT +	bool "Enable uprobes-based dynamic events" +	depends on ARCH_SUPPORTS_UPROBES +	depends on MMU +	depends on PERF_EVENTS +	select UPROBES +	select PROBE_EVENTS +	select TRACING +	default n +	help +	  This allows the user to add tracing events on top of userspace +	  dynamic events (similar to tracepoints) on the fly via the trace +	  events interface. Those events can be inserted wherever uprobes +	  can probe, and record various registers. +	  This option is required if you plan to use perf-probe subcommand +	  of perf tools on user space applications. + +config PROBE_EVENTS +	def_bool n +  config DYNAMIC_FTRACE -	bool "enable/disable ftrace tracepoints dynamically" +	bool "enable/disable function tracing dynamically"  	depends on FUNCTION_TRACER  	depends on HAVE_DYNAMIC_FTRACE  	default y  	help -          This option will modify all the calls to ftrace dynamically -	  (will patch them out of the binary image and replace them -	  with a No-Op instruction) as they are called. A table is -	  created to dynamically enable them again. +	  This option will modify all the calls to function tracing +	  dynamically (will patch them out of the binary image and +	  replace them with a No-Op instruction) on boot up. During +	  compile time, a table is made of all the locations that ftrace +	  can function trace, and this table is linked into the kernel +	  image. When this is enabled, functions can be individually +	  enabled, and the functions not enabled will not affect +	  performance of the system. + +	  See the files in /sys/kernel/debug/tracing: +	    available_filter_functions +	    set_ftrace_filter +	    set_ftrace_notrace  	  This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but  	  otherwise has native performance as long as no tracing is active. -	  The changes to the code are done by a kernel thread that -	  wakes up once a second and checks to see if any ftrace calls -	  were made. If so, it runs stop_machine (stops all CPUS) -	  and modifies the code to jump over the call to ftrace. 
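The new DYNAMIC_FTRACE help text above points at available_filter_functions, set_ftrace_filter and set_ftrace_notrace. Below is a minimal sketch of driving that filter interface from user space; it is not part of the patch, and the debugfs mount point, the "schedule" symbol and the use of current_tracer are assumptions for illustration.

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *dir = "/sys/kernel/debug/tracing";
	char path[256];

	/* Trace only schedule(); all other patched call sites stay NOPs. */
	snprintf(path, sizeof(path), "%s/set_ftrace_filter", dir);
	if (write_str(path, "schedule\n"))
		return 1;

	/* Assumed: enabling the function tracer via current_tracer. */
	snprintf(path, sizeof(path), "%s/current_tracer", dir);
	return write_str(path, "function\n") ? 1 : 0;
}

Any symbol listed in available_filter_functions can be written to set_ftrace_filter the same way; set_ftrace_notrace takes the complementary role of excluding functions.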
+config DYNAMIC_FTRACE_WITH_REGS +	def_bool y +	depends on DYNAMIC_FTRACE +	depends on HAVE_DYNAMIC_FTRACE_WITH_REGS  config FUNCTION_PROFILER  	bool "Kernel function profiler" @@ -456,6 +535,36 @@ config MMIOTRACE_TEST  	  Say N, unless you absolutely know what you are doing. +config TRACEPOINT_BENCHMARK +        bool "Add tracepoint that benchmarks tracepoints" +	help +	 This option creates the tracepoint "benchmark:benchmark_event". +	 When the tracepoint is enabled, it kicks off a kernel thread that +	 goes into an infinite loop (calling cond_sched() to let other tasks +	 run), and calls the tracepoint. Each iteration will record the time +	 it took to write to the tracepoint and the next iteration that +	 data will be passed to the tracepoint itself. That is, the tracepoint +	 will report the time it took to do the previous tracepoint. +	 The string written to the tracepoint is a static string of 128 bytes +	 to keep the time the same. The initial string is simply a write of +	 "START". The second string records the cold cache time of the first +	 write which is not added to the rest of the calculations. + +	 As it is a tight loop, it benchmarks as hot cache. That's fine because +	 we care most about hot paths that are probably in cache already. + +	 An example of the output: + +	      START +	      first=3672 [COLD CACHED] +	      last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 +	      last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 +	      last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 +	      last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 +	      last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 +	      last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + +  config RING_BUFFER_BENCHMARK  	tristate "Ring buffer benchmark stress tester"  	depends on RING_BUFFER @@ -472,6 +581,29 @@ config RING_BUFFER_BENCHMARK  	  If unsure, say N. +config RING_BUFFER_STARTUP_TEST +       bool "Ring buffer startup self test" +       depends on RING_BUFFER +       help +         Run a simple self test on the ring buffer on boot up. Late in the +	 kernel boot sequence, the test will start that kicks off +	 a thread per cpu. Each thread will write various size events +	 into the ring buffer. Another thread is created to send IPIs +	 to each of the threads, where the IPI handler will also write +	 to the ring buffer, to test/stress the nesting ability. +	 If any anomalies are discovered, a warning will be displayed +	 and all ring buffers will be disabled. + +	 The test runs for 10 seconds. This will slow your boot time +	 by at least 10 more seconds. + +	 At the end of the test, statics and more checks are done. +	 It will output the stats of each per cpu buffer. What +	 was written, the sizes, what was read, what was lost, and +	 other similar details. 
+ +	 If unsure, say N +  endif # FTRACE  endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b2..2611613f14f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -5,21 +5,22 @@ ifdef CONFIG_FUNCTION_TRACER  ORIG_CFLAGS := $(KBUILD_CFLAGS)  KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +ifdef CONFIG_FTRACE_SELFTEST  # selftest needs instrumentation  CFLAGS_trace_selftest_dynamic.o = -pg  obj-y += trace_selftest_dynamic.o  endif +endif  # If unlikely tracing is enabled, do not trace these files  ifdef CONFIG_TRACING_BRANCHES  KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING  endif -# -# Make the trace clocks available generally: it's infrastructure -# relied on by ptrace for example: -# -obj-y += trace_clock.o +CFLAGS_trace_benchmark.o := -I$(src) +CFLAGS_trace_events_filter.o := -I$(src) + +obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o  obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o  obj-$(CONFIG_RING_BUFFER) += ring_buffer.o @@ -39,7 +40,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o  obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o  obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o  obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o  ifeq ($(CONFIG_BLOCK),y)  obj-$(CONFIG_EVENT_TRACING) += blktrace.o @@ -51,10 +51,18 @@ ifeq ($(CONFIG_PERF_EVENTS),y)  obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o  endif  obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o  obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_EVENT_TRACING) += power-traces.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o +ifeq ($(CONFIG_PM_RUNTIME),y) +obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o +endif  ifeq ($(CONFIG_TRACING),y)  obj-$(CONFIG_KGDB_KDB) += trace_kdb.o  endif +obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o +obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o + +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o  libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7b8ec028154..c1bd4ada2a0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,8 +23,10 @@  #include <linux/mutex.h>  #include <linux/slab.h>  #include <linux/debugfs.h> +#include <linux/export.h>  #include <linux/time.h>  #include <linux/uaccess.h> +#include <linux/list.h>  #include <trace/events/block.h> @@ -37,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1;  static struct trace_array *blk_tr;  static bool blk_tracer_enabled __read_mostly; +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); +  /* Select an alternative, minimalistic output than the original one */  #define TRACE_BLK_OPT_CLASSIC	0x1 @@ -71,7 +76,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,  	bool blk_tracer = blk_tracer_enabled;  	if (blk_tracer) { -		buffer = blk_tr->buffer; +		buffer = blk_tr->trace_buffer.buffer;  		pc = preempt_count();  		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,  						  sizeof(*t) + len, @@ -106,10 +111,18 @@ record_it:   * Send out a notify for this process, if we haven't done so since a trace   * started   */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +static void trace_note_tsk(struct task_struct *tsk)  { +	unsigned long flags; +	struct blk_trace *bt; +  	tsk->btrace_seq = blktrace_seq; -	trace_note(bt, tsk->pid, BLK_TN_PROCESS, 
tsk->comm, sizeof(tsk->comm)); +	spin_lock_irqsave(&running_trace_lock, flags); +	list_for_each_entry(bt, &running_trace_list, running_list) { +		trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, +			   sizeof(tsk->comm)); +	} +	spin_unlock_irqrestore(&running_trace_lock, flags);  }  static void trace_note_time(struct blk_trace *bt) @@ -138,8 +151,15 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)  		     !blk_tracer_enabled))  		return; +	/* +	 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note +	 * message to the trace. +	 */ +	if (!(bt->act_mask & BLK_TC_NOTIFY)) +		return; +  	local_irq_save(flags); -	buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); +	buf = this_cpu_ptr(bt->msg_data);  	va_start(args, fmt);  	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);  	va_end(args); @@ -199,6 +219,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  	what |= MASK_TC_BIT(rw, RAHEAD);  	what |= MASK_TC_BIT(rw, META);  	what |= MASK_TC_BIT(rw, DISCARD); +	what |= MASK_TC_BIT(rw, FLUSH); +	what |= MASK_TC_BIT(rw, FUA);  	pid = tsk->pid;  	if (act_log_check(bt, what, sector, pid)) @@ -208,7 +230,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  	if (blk_tracer) {  		tracing_record_cmdline(current); -		buffer = blk_tr->buffer; +		buffer = blk_tr->trace_buffer.buffer;  		pc = preempt_count();  		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,  						  sizeof(*t) + pdu_len, @@ -219,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  		goto record_it;  	} +	if (unlikely(tsk->btrace_seq != blktrace_seq)) +		trace_note_tsk(tsk); +  	/*  	 * A word about the locking here - we disable interrupts to reserve  	 * some space in the relay per-cpu buffer, to prevent an irq  	 * from coming in and stepping on our toes.  	 
*/  	local_irq_save(flags); - -	if (unlikely(tsk->btrace_seq != blktrace_seq)) -		trace_note_tsk(bt, tsk); -  	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);  	if (t) {  		sequence = per_cpu_ptr(bt->sequence, cpu); @@ -301,13 +322,6 @@ int blk_trace_remove(struct request_queue *q)  }  EXPORT_SYMBOL_GPL(blk_trace_remove); -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ -	filp->private_data = inode->i_private; - -	return 0; -} -  static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,  				size_t count, loff_t *ppos)  { @@ -321,18 +335,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,  static const struct file_operations blk_dropped_fops = {  	.owner =	THIS_MODULE, -	.open =		blk_dropped_open, +	.open =		simple_open,  	.read =		blk_dropped_read,  	.llseek =	default_llseek,  }; -static int blk_msg_open(struct inode *inode, struct file *filp) -{ -	filp->private_data = inode->i_private; - -	return 0; -} -  static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,  				size_t count, loff_t *ppos)  { @@ -361,7 +368,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,  static const struct file_operations blk_msg_fops = {  	.owner =	THIS_MODULE, -	.open =		blk_msg_open, +	.open =		simple_open,  	.write =	blk_msg_write,  	.llseek =	noop_llseek,  }; @@ -392,7 +399,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)  static struct dentry *blk_create_buf_file_callback(const char *filename,  						   struct dentry *parent, -						   int mode, +						   umode_t mode,  						   struct rchan_buf *buf,  						   int *is_global)  { @@ -481,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,  	bt->dir = dir;  	bt->dev = dev;  	atomic_set(&bt->dropped, 0); +	INIT_LIST_HEAD(&bt->running_list);  	ret = -EIO;  	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, @@ -571,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,  		.end_lba = cbuts.end_lba,  		.pid = cbuts.pid,  	}; -	memcpy(&buts.name, &cbuts.name, 32);  	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);  	if (ret)  		return ret; -	if (copy_to_user(arg, &buts.name, 32)) { +	if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {  		blk_trace_remove(q);  		return -EFAULT;  	} @@ -605,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start)  			blktrace_seq++;  			smp_mb();  			bt->trace_state = Blktrace_running; +			spin_lock_irq(&running_trace_lock); +			list_add(&bt->running_list, &running_trace_list); +			spin_unlock_irq(&running_trace_lock);  			trace_note_time(bt);  			ret = 0; @@ -612,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start)  	} else {  		if (bt->trace_state == Blktrace_running) {  			bt->trace_state = Blktrace_stopped; +			spin_lock_irq(&running_trace_lock); +			list_del_init(&bt->running_list); +			spin_unlock_irq(&running_trace_lock);  			relay_flush(bt->rchan);  			ret = 0;  		} @@ -689,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q)   * blk_add_trace_rq - Add a trace for a request oriented action   * @q:		queue the io is for   * @rq:		the source request + * @nr_bytes:	number of completed bytes   * @what:	the action   *   * Description: @@ -696,61 +710,55 @@ void blk_trace_shutdown(struct request_queue *q)   *   **/  static void blk_add_trace_rq(struct request_queue *q, struct request *rq, -				    u32 what) +			     unsigned int nr_bytes, u32 what)  {  	struct blk_trace 
*bt = q->blk_trace; -	int rw = rq->cmd_flags & 0x03;  	if (likely(!bt))  		return; -	if (rq->cmd_flags & REQ_DISCARD) -		rw |= REQ_DISCARD; - -	if (rq->cmd_flags & REQ_SECURE) -		rw |= REQ_SECURE; -  	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {  		what |= BLK_TC_ACT(BLK_TC_PC); -		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, +		__blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,  				what, rq->errors, rq->cmd_len, rq->cmd);  	} else  {  		what |= BLK_TC_ACT(BLK_TC_FS); -		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, -				what, rq->errors, 0, NULL); +		__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, +				rq->cmd_flags, what, rq->errors, 0, NULL);  	}  }  static void blk_add_trace_rq_abort(void *ignore,  				   struct request_queue *q, struct request *rq)  { -	blk_add_trace_rq(q, rq, BLK_TA_ABORT); +	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);  }  static void blk_add_trace_rq_insert(void *ignore,  				    struct request_queue *q, struct request *rq)  { -	blk_add_trace_rq(q, rq, BLK_TA_INSERT); +	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);  }  static void blk_add_trace_rq_issue(void *ignore,  				   struct request_queue *q, struct request *rq)  { -	blk_add_trace_rq(q, rq, BLK_TA_ISSUE); +	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);  }  static void blk_add_trace_rq_requeue(void *ignore,  				     struct request_queue *q,  				     struct request *rq)  { -	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); +	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);  }  static void blk_add_trace_rq_complete(void *ignore,  				      struct request_queue *q, -				      struct request *rq) +				      struct request *rq, +				      unsigned int nr_bytes)  { -	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); +	blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);  }  /** @@ -758,53 +766,60 @@ static void blk_add_trace_rq_complete(void *ignore,   * @q:		queue the io is for   * @bio:	the source bio   * @what:	the action + * @error:	error, if any   *   * Description:   *     Records an action against a bio. Will log the bio offset + size.   
*   **/  static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, -				     u32 what) +			      u32 what, int error)  {  	struct blk_trace *bt = q->blk_trace;  	if (likely(!bt))  		return; -	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, -			!bio_flagged(bio, BIO_UPTODATE), 0, NULL); +	if (!error && !bio_flagged(bio, BIO_UPTODATE)) +		error = EIO; + +	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, +			bio->bi_rw, what, error, 0, NULL);  }  static void blk_add_trace_bio_bounce(void *ignore,  				     struct request_queue *q, struct bio *bio)  { -	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); +	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);  }  static void blk_add_trace_bio_complete(void *ignore, -				       struct request_queue *q, struct bio *bio) +				       struct request_queue *q, struct bio *bio, +				       int error)  { -	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); +	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);  }  static void blk_add_trace_bio_backmerge(void *ignore,  					struct request_queue *q, +					struct request *rq,  					struct bio *bio)  { -	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); +	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);  }  static void blk_add_trace_bio_frontmerge(void *ignore,  					 struct request_queue *q, +					 struct request *rq,  					 struct bio *bio)  { -	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); +	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);  }  static void blk_add_trace_bio_queue(void *ignore,  				    struct request_queue *q, struct bio *bio)  { -	blk_add_trace_bio(q, bio, BLK_TA_QUEUE); +	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);  }  static void blk_add_trace_getrq(void *ignore, @@ -812,7 +827,7 @@ static void blk_add_trace_getrq(void *ignore,  				struct bio *bio, int rw)  {  	if (bio) -		blk_add_trace_bio(q, bio, BLK_TA_GETRQ); +		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);  	else {  		struct blk_trace *bt = q->blk_trace; @@ -827,7 +842,7 @@ static void blk_add_trace_sleeprq(void *ignore,  				  struct bio *bio, int rw)  {  	if (bio) -		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); +		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);  	else {  		struct blk_trace *bt = q->blk_trace; @@ -845,29 +860,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)  		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);  } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, +				    unsigned int depth, bool explicit)  {  	struct blk_trace *bt = q->blk_trace;  	if (bt) { -		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; -		__be64 rpdu = cpu_to_be64(pdu); +		__be64 rpdu = cpu_to_be64(depth); +		u32 what; -		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, -				sizeof(rpdu), &rpdu); -	} -} - -static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) -{ -	struct blk_trace *bt = q->blk_trace; - -	if (bt) { -		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; -		__be64 rpdu = cpu_to_be64(pdu); +		if (explicit) +			what = BLK_TA_UNPLUG_IO; +		else +			what = BLK_TA_UNPLUG_TIMER; -		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, -				sizeof(rpdu), &rpdu); +		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);  	}  } @@ -880,14 +887,15 @@ static void blk_add_trace_split(void *ignore,  	if (bt) {  		__be64 rpdu = cpu_to_be64(pdu); -		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, -				BLK_TA_SPLIT, !bio_flagged(bio, 
BIO_UPTODATE), +		__blk_add_trace(bt, bio->bi_iter.bi_sector, +				bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, +				!bio_flagged(bio, BIO_UPTODATE),  				sizeof(rpdu), &rpdu);  	}  }  /** - * blk_add_trace_remap - Add a trace for a remap operation + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation   * @ignore:	trace callback data parameter (not used)   * @q:		queue the io is for   * @bio:	the source bio @@ -899,9 +907,9 @@ static void blk_add_trace_split(void *ignore,   *     it spans a stripe (or similar). Add a trace for that action.   *   **/ -static void blk_add_trace_remap(void *ignore, -				struct request_queue *q, struct bio *bio, -				dev_t dev, sector_t from) +static void blk_add_trace_bio_remap(void *ignore, +				    struct request_queue *q, struct bio *bio, +				    dev_t dev, sector_t from)  {  	struct blk_trace *bt = q->blk_trace;  	struct blk_io_trace_remap r; @@ -913,9 +921,9 @@ static void blk_add_trace_remap(void *ignore,  	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);  	r.sector_from = cpu_to_be64(from); -	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, -			BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), -			sizeof(r), &r); +	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, +			bio->bi_rw, BLK_TA_REMAP, +			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);  }  /** @@ -1010,13 +1018,11 @@ static void blk_register_tracepoints(void)  	WARN_ON(ret);  	ret = register_trace_block_plug(blk_add_trace_plug, NULL);  	WARN_ON(ret); -	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); -	WARN_ON(ret); -	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); +	ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);  	WARN_ON(ret);  	ret = register_trace_block_split(blk_add_trace_split, NULL);  	WARN_ON(ret); -	ret = register_trace_block_remap(blk_add_trace_remap, NULL); +	ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);  	WARN_ON(ret);  	ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);  	WARN_ON(ret); @@ -1025,10 +1031,9 @@ static void blk_register_tracepoints(void)  static void blk_unregister_tracepoints(void)  {  	unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); -	unregister_trace_block_remap(blk_add_trace_remap, NULL); +	unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);  	unregister_trace_block_split(blk_add_trace_split, NULL); -	unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); -	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); +	unregister_trace_block_unplug(blk_add_trace_unplug, NULL);  	unregister_trace_block_plug(blk_add_trace_plug, NULL);  	unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);  	unregister_trace_block_getrq(blk_add_trace_getrq, NULL); @@ -1060,6 +1065,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)  		goto out;  	} +	if (tc & BLK_TC_FLUSH) +		rwbs[i++] = 'F'; +  	if (tc & BLK_TC_DISCARD)  		rwbs[i++] = 'D';  	else if (tc & BLK_TC_WRITE) @@ -1069,10 +1077,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)  	else  		rwbs[i++] = 'N'; +	if (tc & BLK_TC_FUA) +		rwbs[i++] = 'F';  	if (tc & BLK_TC_AHEAD)  		rwbs[i++] = 'A'; -	if (tc & BLK_TC_BARRIER) -		rwbs[i++] = 'B';  	if (tc & BLK_TC_SYNC)  		rwbs[i++] = 'S';  	if (tc & BLK_TC_META) @@ -1138,7 +1146,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);  static int blk_log_action_classic(struct trace_iterator *iter, const char *act)  { -	
char rwbs[6]; +	char rwbs[RWBS_LEN];  	unsigned long long ts  = iter->ts;  	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);  	unsigned secs	       = (unsigned long)ts; @@ -1154,7 +1162,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)  static int blk_log_action(struct trace_iterator *iter, const char *act)  { -	char rwbs[6]; +	char rwbs[RWBS_LEN];  	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);  	fill_rwbs(rwbs, t); @@ -1421,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)  	return print_one_line(iter, true);  } -static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +static int +blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  {  	/* don't output context-info for blk_classic output */  	if (bit == TRACE_BLK_OPT_CLASSIC) { @@ -1484,6 +1493,9 @@ static int blk_trace_remove_queue(struct request_queue *q)  	if (atomic_dec_and_test(&blk_probes_ref))  		blk_unregister_tracepoints(); +	spin_lock_irq(&running_trace_lock); +	list_del(&bt->running_list); +	spin_unlock_irq(&running_trace_lock);  	blk_trace_free(bt);  	return 0;  } @@ -1567,7 +1579,7 @@ static const struct {  } mask_maps[] = {  	{ BLK_TC_READ,		"read"		},  	{ BLK_TC_WRITE,		"write"		}, -	{ BLK_TC_BARRIER,	"barrier"	}, +	{ BLK_TC_FLUSH,		"flush"		},  	{ BLK_TC_SYNC,		"sync"		},  	{ BLK_TC_QUEUE,		"queue"		},  	{ BLK_TC_REQUEUE,	"requeue"	}, @@ -1579,6 +1591,7 @@ static const struct {  	{ BLK_TC_META,		"meta"		},  	{ BLK_TC_DISCARD,	"discard"	},  	{ BLK_TC_DRV_DATA,	"drv_data"	}, +	{ BLK_TC_FUA,		"fua"		},  };  static int blk_trace_str2mask(const char *str) @@ -1794,6 +1807,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)  {  	int i = 0; +	if (rw & REQ_FLUSH) +		rwbs[i++] = 'F'; +  	if (rw & WRITE)  		rwbs[i++] = 'W';  	else if (rw & REQ_DISCARD) @@ -1803,6 +1819,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)  	else  		rwbs[i++] = 'N'; +	if (rw & REQ_FUA) +		rwbs[i++] = 'F';  	if (rw & REQ_RAHEAD)  		rwbs[i++] = 'A';  	if (rw & REQ_SYNC) @@ -1814,22 +1832,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)  	rwbs[i] = '\0';  } - -void blk_fill_rwbs_rq(char *rwbs, struct request *rq) -{ -	int rw = rq->cmd_flags & 0x03; -	int bytes; - -	if (rq->cmd_flags & REQ_DISCARD) -		rw |= REQ_DISCARD; - -	if (rq->cmd_flags & REQ_SECURE) -		rw |= REQ_SECURE; - -	bytes = blk_rq_bytes(rq); - -	blk_fill_rwbs(rwbs, rw, bytes); -} +EXPORT_SYMBOL_GPL(blk_fill_rwbs);  #endif /* CONFIG_EVENT_TRACING */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae8388..ac9d1dad630 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -10,7 +10,7 @@   * Based on code in the latency_tracer, that is:   *   *  Copyright (C) 2004-2006 Ingo Molnar - *  Copyright (C) 2004 William Lee Irwin III + *  Copyright (C) 2004 Nadia Yvette Chambers   */  #include <linux/stop_machine.h> @@ -22,44 +22,71 @@  #include <linux/hardirq.h>  #include <linux/kthread.h>  #include <linux/uaccess.h> +#include <linux/bsearch.h> +#include <linux/module.h>  #include <linux/ftrace.h>  #include <linux/sysctl.h>  #include <linux/slab.h>  #include <linux/ctype.h> +#include <linux/sort.h>  #include <linux/list.h>  #include <linux/hash.h>  #include <linux/rcupdate.h>  #include <trace/events/sched.h> -#include <asm/ftrace.h>  #include <asm/setup.h>  #include "trace_output.h"  #include "trace_stat.h"  #define FTRACE_WARN_ON(cond)			\ -	do {					\ -		if (WARN_ON(cond))		\ +	({					\ +		int ___r = cond;		\ +		if (WARN_ON(___r))		\  		
	ftrace_kill();		\ -	} while (0) +		___r;				\ +	})  #define FTRACE_WARN_ON_ONCE(cond)		\ -	do {					\ -		if (WARN_ON_ONCE(cond))		\ +	({					\ +		int ___r = cond;		\ +		if (WARN_ON_ONCE(___r))		\  			ftrace_kill();		\ -	} while (0) +		___r;				\ +	})  /* hash bits for specific function selection */  #define FTRACE_HASH_BITS 7  #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +#define FTRACE_HASH_DEFAULT_BITS 10 +#define FTRACE_HASH_MAX_BITS 12 + +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) + +#ifdef CONFIG_DYNAMIC_FTRACE +#define INIT_REGEX_LOCK(opsname)	\ +	.regex_lock	= __MUTEX_INITIALIZER(opsname.regex_lock), +#else +#define INIT_REGEX_LOCK(opsname) +#endif + +static struct ftrace_ops ftrace_list_end __read_mostly = { +	.func		= ftrace_stub, +	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, +};  /* ftrace_enabled is a method to turn ftrace on or off */  int ftrace_enabled __read_mostly;  static int last_ftrace_enabled;  /* Quick disabling of function tracer. */ -int function_trace_stop; +int function_trace_stop __read_mostly; + +/* Current function tracing op */ +struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; +/* What to set function_trace_op to */ +static struct ftrace_ops *set_function_trace_op;  /* List for set_ftrace_pid's pids. */  LIST_HEAD(ftrace_pids); @@ -76,41 +103,80 @@ static int ftrace_disabled __read_mostly;  static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = -{ -	.func		= ftrace_stub, -}; - -static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;  ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;  ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops; + +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, +				 struct ftrace_ops *op, struct pt_regs *regs); +#else +/* See comment below, where ftrace_ops_list_func is defined */ +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); +#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +#endif  /* - * Traverse the ftrace_list, invoking all entries.  The reason that we - * can use rcu_dereference_raw() is that elements removed from this list + * Traverse the ftrace_global_list, invoking all entries.  The reason that we + * can use rcu_dereference_raw_notrace() is that elements removed from this list   * are simply leaked, so there is no need to interact with a grace-period - * mechanism.  The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_list. + * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle + * concurrent insertions into the ftrace_global_list.   *   * Silly Alpha and silly pointer-speculation compiler optimizations!   */ -static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +#define do_for_each_ftrace_op(op, list)			\ +	op = rcu_dereference_raw_notrace(list);			\ +	do + +/* + * Optimized for just a single item in the list (as that is the normal case). 
+ */ +#define while_for_each_ftrace_op(op)				\ +	while (likely(op = rcu_dereference_raw_notrace((op)->next)) &&	\ +	       unlikely((op) != &ftrace_list_end)) + +static inline void ftrace_ops_init(struct ftrace_ops *ops) +{ +#ifdef CONFIG_DYNAMIC_FTRACE +	if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { +		mutex_init(&ops->regex_lock); +		ops->flags |= FTRACE_OPS_FL_INITIALIZED; +	} +#endif +} + +/** + * ftrace_nr_registered_ops - return number of ops registered + * + * Returns the number of ftrace_ops registered and tracing functions + */ +int ftrace_nr_registered_ops(void)  { -	struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ +	struct ftrace_ops *ops; +	int cnt = 0; + +	mutex_lock(&ftrace_lock); + +	for (ops = ftrace_ops_list; +	     ops != &ftrace_list_end; ops = ops->next) +		cnt++; -	while (op != &ftrace_list_end) { -		op->func(ip, parent_ip); -		op = rcu_dereference_raw(op->next); /*see above*/ -	}; +	mutex_unlock(&ftrace_lock); + +	return cnt;  } -static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, +			    struct ftrace_ops *op, struct pt_regs *regs)  {  	if (!test_tsk_trace_trace(current))  		return; -	ftrace_pid_function(ip, parent_ip); +	ftrace_pid_function(ip, parent_ip, op, regs);  }  static void set_ftrace_pid_function(ftrace_func_t func) @@ -129,64 +195,142 @@ static void set_ftrace_pid_function(ftrace_func_t func)  void clear_ftrace_function(void)  {  	ftrace_trace_function = ftrace_stub; -	__ftrace_trace_function = ftrace_stub;  	ftrace_pid_function = ftrace_stub;  } -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -/* - * For those archs that do not test ftrace_trace_stop in their - * mcount call site, we need to do it from C. - */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +static void control_ops_disable_all(struct ftrace_ops *ops)  { -	if (function_trace_stop) -		return; +	int cpu; -	__ftrace_trace_function(ip, parent_ip); +	for_each_possible_cpu(cpu) +		*per_cpu_ptr(ops->disabled, cpu) = 1;  } -#endif -static int __register_ftrace_function(struct ftrace_ops *ops) +static int control_ops_alloc(struct ftrace_ops *ops) +{ +	int __percpu *disabled; + +	disabled = alloc_percpu(int); +	if (!disabled) +		return -ENOMEM; + +	ops->disabled = disabled; +	control_ops_disable_all(ops); +	return 0; +} + +static void ftrace_sync(struct work_struct *work)  { -	ops->next = ftrace_list;  	/* -	 * We are entering ops into the ftrace_list but another -	 * CPU might be walking that list. We need to make sure -	 * the ops->next pointer is valid before another CPU sees -	 * the ops pointer included into the ftrace_list. +	 * This function is just a stub to implement a hard force +	 * of synchronize_sched(). This requires synchronizing +	 * tasks even in userspace and idle. +	 * +	 * Yes, function tracing is rude.  	 
*/ -	rcu_assign_pointer(ftrace_list, ops); +} -	if (ftrace_enabled) { -		ftrace_func_t func; +static void ftrace_sync_ipi(void *data) +{ +	/* Probably not needed, but do it anyway */ +	smp_rmb(); +} -		if (ops->next == &ftrace_list_end) -			func = ops->func; -		else -			func = ftrace_list_func; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void update_function_graph_func(void); +#else +static inline void update_function_graph_func(void) { } +#endif -		if (!list_empty(&ftrace_pids)) { -			set_ftrace_pid_function(func); -			func = ftrace_pid_func; -		} +static void update_ftrace_function(void) +{ +	ftrace_func_t func; + +	/* +	 * If we are at the end of the list and this ops is +	 * recursion safe and not dynamic and the arch supports passing ops, +	 * then have the mcount trampoline call the function directly. +	 */ +	if (ftrace_ops_list == &ftrace_list_end || +	    (ftrace_ops_list->next == &ftrace_list_end && +	     !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && +	     (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && +	     !FTRACE_FORCE_LIST_FUNC)) { +		/* Set the ftrace_ops that the arch callback uses */ +		set_function_trace_op = ftrace_ops_list; +		func = ftrace_ops_list->func; +	} else { +		/* Just use the default ftrace_ops */ +		set_function_trace_op = &ftrace_list_end; +		func = ftrace_ops_list_func; +	} + +	update_function_graph_func(); +	/* If there's no change, then do nothing more here */ +	if (ftrace_trace_function == func) +		return; + +	/* +	 * If we are using the list function, it doesn't care +	 * about the function_trace_ops. +	 */ +	if (func == ftrace_ops_list_func) { +		ftrace_trace_function = func;  		/* -		 * For one func, simply call it directly. -		 * For more than one func, call the chain. +		 * Don't even bother setting function_trace_ops, +		 * it would be racy to do so anyway.  		 */ -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -		ftrace_trace_function = func; -#else -		__ftrace_trace_function = func; -		ftrace_trace_function = ftrace_test_stop_func; -#endif +		return;  	} -	return 0; +#ifndef CONFIG_DYNAMIC_FTRACE +	/* +	 * For static tracing, we need to be a bit more careful. +	 * The function change takes affect immediately. Thus, +	 * we need to coorditate the setting of the function_trace_ops +	 * with the setting of the ftrace_trace_function. +	 * +	 * Set the function to the list ops, which will call the +	 * function we want, albeit indirectly, but it handles the +	 * ftrace_ops and doesn't depend on function_trace_op. +	 */ +	ftrace_trace_function = ftrace_ops_list_func; +	/* +	 * Make sure all CPUs see this. Yes this is slow, but static +	 * tracing is slow and nasty to have enabled. +	 */ +	schedule_on_each_cpu(ftrace_sync); +	/* Now all cpus are using the list ops. */ +	function_trace_op = set_function_trace_op; +	/* Make sure the function_trace_op is visible on all CPUs */ +	smp_wmb(); +	/* Nasty way to force a rmb on all cpus */ +	smp_call_function(ftrace_sync_ipi, NULL, 1); +	/* OK, we are all set to update the ftrace_trace_function now! */ +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +	ftrace_trace_function = func;  } -static int __unregister_ftrace_function(struct ftrace_ops *ops) +int using_ftrace_ops_list_func(void) +{ +	return ftrace_trace_function == ftrace_ops_list_func; +} + +static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +{ +	ops->next = *list; +	/* +	 * We are entering ops into the list but another +	 * CPU might be walking that list. 
We need to make sure +	 * the ops->next pointer is valid before another CPU sees +	 * the ops pointer included into the list. +	 */ +	rcu_assign_pointer(*list, ops); +} + +static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)  {  	struct ftrace_ops **p; @@ -194,13 +338,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  	 * If we are removing the last function, then simply point  	 * to the ftrace_stub.  	 */ -	if (ftrace_list == ops && ops->next == &ftrace_list_end) { -		ftrace_trace_function = ftrace_stub; -		ftrace_list = &ftrace_list_end; +	if (*list == ops && ops->next == &ftrace_list_end) { +		*list = &ftrace_list_end;  		return 0;  	} -	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) +	for (p = list; *p != &ftrace_list_end; p = &(*p)->next)  		if (*p == ops)  			break; @@ -208,53 +351,96 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  		return -1;  	*p = (*p)->next; +	return 0; +} -	if (ftrace_enabled) { -		/* If we only have one func left, then call that directly */ -		if (ftrace_list->next == &ftrace_list_end) { -			ftrace_func_t func = ftrace_list->func; +static void add_ftrace_list_ops(struct ftrace_ops **list, +				struct ftrace_ops *main_ops, +				struct ftrace_ops *ops) +{ +	int first = *list == &ftrace_list_end; +	add_ftrace_ops(list, ops); +	if (first) +		add_ftrace_ops(&ftrace_ops_list, main_ops); +} -			if (!list_empty(&ftrace_pids)) { -				set_ftrace_pid_function(func); -				func = ftrace_pid_func; -			} -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -			ftrace_trace_function = func; -#else -			__ftrace_trace_function = func; +static int remove_ftrace_list_ops(struct ftrace_ops **list, +				  struct ftrace_ops *main_ops, +				  struct ftrace_ops *ops) +{ +	int ret = remove_ftrace_ops(list, ops); +	if (!ret && *list == &ftrace_list_end) +		ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); +	return ret; +} + +static int __register_ftrace_function(struct ftrace_ops *ops) +{ +	if (ops->flags & FTRACE_OPS_FL_DELETED) +		return -EINVAL; + +	if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) +		return -EBUSY; + +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS +	/* +	 * If the ftrace_ops specifies SAVE_REGS, then it only can be used +	 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. +	 * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. 
+	 */ +	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && +	    !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) +		return -EINVAL; + +	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) +		ops->flags |= FTRACE_OPS_FL_SAVE_REGS;  #endif -		} -	} + +	if (!core_kernel_data((unsigned long)ops)) +		ops->flags |= FTRACE_OPS_FL_DYNAMIC; + +	if (ops->flags & FTRACE_OPS_FL_CONTROL) { +		if (control_ops_alloc(ops)) +			return -ENOMEM; +		add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); +	} else +		add_ftrace_ops(&ftrace_ops_list, ops); + +	if (ftrace_enabled) +		update_ftrace_function();  	return 0;  } -static void ftrace_update_pid_func(void) +static int __unregister_ftrace_function(struct ftrace_ops *ops)  { -	ftrace_func_t func; +	int ret; -	if (ftrace_trace_function == ftrace_stub) -		return; +	if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) +		return -EBUSY; -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -	func = ftrace_trace_function; -#else -	func = __ftrace_trace_function; -#endif +	if (ops->flags & FTRACE_OPS_FL_CONTROL) { +		ret = remove_ftrace_list_ops(&ftrace_control_list, +					     &control_ops, ops); +	} else +		ret = remove_ftrace_ops(&ftrace_ops_list, ops); -	if (!list_empty(&ftrace_pids)) { -		set_ftrace_pid_function(func); -		func = ftrace_pid_func; -	} else { -		if (func == ftrace_pid_func) -			func = ftrace_pid_function; -	} +	if (ret < 0) +		return ret; -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -	ftrace_trace_function = func; -#else -	__ftrace_trace_function = func; -#endif +	if (ftrace_enabled) +		update_ftrace_function(); + +	return 0; +} + +static void ftrace_update_pid_func(void) +{ +	/* Only do something if we are tracing something */ +	if (ftrace_trace_function == ftrace_stub) +		return; + +	update_ftrace_function();  }  #ifdef CONFIG_FUNCTION_PROFILER @@ -288,7 +474,6 @@ struct ftrace_profile_stat {  #define PROFILES_PER_PAGE					\  	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits __read_mostly;  static int ftrace_profile_enabled __read_mostly;  /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ @@ -296,7 +481,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);  static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)  static void *  function_stat_next(void *v, int idx) @@ -407,12 +593,18 @@ static int function_stat_show(struct seq_file *m, void *v)  	if (rec->counter <= 1)  		stddev = 0;  	else { -		stddev = rec->time_squared - rec->counter * avg * avg; +		/* +		 * Apply Welford's method: +		 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) +		 */ +		stddev = rec->counter * rec->time_squared - +			 rec->time * rec->time; +  		/*  		 * Divide only 1000 for ns^2 -> us^2 conversion.  		 * trace_print_graph_duration will divide 1000 again.  		 
*/ -		do_div(stddev, (rec->counter - 1) * 1000); +		do_div(stddev, rec->counter * (rec->counter - 1) * 1000);  	}  	trace_seq_init(&s); @@ -478,7 +670,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)  	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); -	for (i = 0; i < pages; i++) { +	for (i = 1; i < pages; i++) {  		pg->next = (void *)get_zeroed_page(GFP_KERNEL);  		if (!pg->next)  			goto out_free; @@ -496,7 +688,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)  		free_page(tmp);  	} -	free_page((unsigned long)stat->pages);  	stat->pages = NULL;  	stat->start = NULL; @@ -527,13 +718,6 @@ static int ftrace_profile_init_cpu(int cpu)  	if (!stat->hash)  		return -ENOMEM; -	if (!ftrace_profile_bits) { -		size--; - -		for (; size; size >>= 1) -			ftrace_profile_bits++; -	} -  	/* Preallocate the function profiling pages */  	if (ftrace_profile_pages_init(stat) < 0) {  		kfree(stat->hash); @@ -549,7 +733,7 @@ static int ftrace_profile_init(void)  	int cpu;  	int ret = 0; -	for_each_online_cpu(cpu) { +	for_each_possible_cpu(cpu) {  		ret = ftrace_profile_init_cpu(cpu);  		if (ret)  			break; @@ -564,16 +748,15 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)  {  	struct ftrace_profile *rec;  	struct hlist_head *hhd; -	struct hlist_node *n;  	unsigned long key; -	key = hash_long(ip, ftrace_profile_bits); +	key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);  	hhd = &stat->hash[key];  	if (hlist_empty(hhd))  		return NULL; -	hlist_for_each_entry_rcu(rec, n, hhd, node) { +	hlist_for_each_entry_rcu_notrace(rec, hhd, node) {  		if (rec->ip == ip)  			return rec;  	} @@ -586,7 +769,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,  {  	unsigned long key; -	key = hash_long(rec->ip, ftrace_profile_bits); +	key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);  	hlist_add_head_rcu(&rec->node, &stat->hash[key]);  } @@ -627,7 +810,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)  }  static void -function_profile_call(unsigned long ip, unsigned long parent_ip) +function_profile_call(unsigned long ip, unsigned long parent_ip, +		      struct ftrace_ops *ops, struct pt_regs *regs)  {  	struct ftrace_profile_stat *stat;  	struct ftrace_profile *rec; @@ -638,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)  	local_irq_save(flags); -	stat = &__get_cpu_var(ftrace_profile_stats); +	stat = this_cpu_ptr(&ftrace_profile_stats);  	if (!stat->hash || !ftrace_profile_enabled)  		goto out; @@ -657,7 +841,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static int profile_graph_entry(struct ftrace_graph_ent *trace)  { -	function_profile_call(trace->func, 0); +	function_profile_call(trace->func, 0, NULL, NULL);  	return 1;  } @@ -669,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)  	unsigned long flags;  	local_irq_save(flags); -	stat = &__get_cpu_var(ftrace_profile_stats); +	stat = this_cpu_ptr(&ftrace_profile_stats);  	if (!stat->hash || !ftrace_profile_enabled)  		goto out; @@ -715,9 +899,10 @@ static void unregister_ftrace_profiler(void)  	unregister_ftrace_graph();  }  #else -static struct ftrace_ops ftrace_profile_ops __read_mostly = -{ +static struct ftrace_ops ftrace_profile_ops __read_mostly = {  	.func		= function_profile_call, +	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(ftrace_profile_ops)  };  static int register_ftrace_profiler(void) @@ 
-736,19 +921,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,  		     size_t cnt, loff_t *ppos)  {  	unsigned long val; -	char buf[64];		/* big enough to hold a number */  	int ret; -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; - -	ret = strict_strtoul(buf, 10, &val); -	if (ret < 0) +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret)  		return ret;  	val = !!val; @@ -878,31 +1054,50 @@ struct ftrace_func_probe {  	unsigned long		flags;  	unsigned long		ip;  	void			*data; -	struct rcu_head		rcu; +	struct list_head	free_list;  }; -enum { -	FTRACE_ENABLE_CALLS		= (1 << 0), -	FTRACE_DISABLE_CALLS		= (1 << 1), -	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2), -	FTRACE_START_FUNC_RET		= (1 << 3), -	FTRACE_STOP_FUNC_RET		= (1 << 4), +struct ftrace_func_entry { +	struct hlist_node hlist; +	unsigned long ip;  }; -static int ftrace_filtered; - -static struct dyn_ftrace *ftrace_new_addrs; +struct ftrace_hash { +	unsigned long		size_bits; +	struct hlist_head	*buckets; +	unsigned long		count; +	struct rcu_head		rcu; +}; -static DEFINE_MUTEX(ftrace_regex_lock); +/* + * We make these constant because no one should touch them, + * but they are used as the default "empty hash", to avoid allocating + * it all the time. These are in a read only section such that if + * anyone does try to modify it, it will cause an exception. + */ +static const struct hlist_head empty_buckets[1]; +static const struct ftrace_hash empty_hash = { +	.buckets = (struct hlist_head *)empty_buckets, +}; +#define EMPTY_HASH	((struct ftrace_hash *)&empty_hash) + +static struct ftrace_ops global_ops = { +	.func			= ftrace_stub, +	.notrace_hash		= EMPTY_HASH, +	.filter_hash		= EMPTY_HASH, +	.flags			= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(global_ops) +};  struct ftrace_page {  	struct ftrace_page	*next; +	struct dyn_ftrace	*records;  	int			index; -	struct dyn_ftrace	records[]; +	int			size;  }; -#define ENTRIES_PER_PAGE \ -  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)  /* estimate from running different kernels */  #define NR_TO_INIT		10000 @@ -910,7 +1105,308 @@ struct ftrace_page {  static struct ftrace_page	*ftrace_pages_start;  static struct ftrace_page	*ftrace_pages; -static struct dyn_ftrace *ftrace_free_records; +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) +{ +	return !hash || !hash->count; +} + +static struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ +	unsigned long key; +	struct ftrace_func_entry *entry; +	struct hlist_head *hhd; + +	if (ftrace_hash_empty(hash)) +		return NULL; + +	if (hash->size_bits > 0) +		key = hash_long(ip, hash->size_bits); +	else +		key = 0; + +	hhd = &hash->buckets[key]; + +	hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { +		if (entry->ip == ip) +			return entry; +	} +	return NULL; +} + +static void __add_hash_entry(struct ftrace_hash *hash, +			     struct ftrace_func_entry *entry) +{ +	struct hlist_head *hhd; +	unsigned long key; + +	if (hash->size_bits) +		key = hash_long(entry->ip, hash->size_bits); +	else +		key = 0; + +	hhd = &hash->buckets[key]; +	hlist_add_head(&entry->hlist, hhd); +	hash->count++; +} + +static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ +	struct ftrace_func_entry *entry; + +	entry = kmalloc(sizeof(*entry), GFP_KERNEL); +	if 
(!entry) +		return -ENOMEM; + +	entry->ip = ip; +	__add_hash_entry(hash, entry); + +	return 0; +} + +static void +free_hash_entry(struct ftrace_hash *hash, +		  struct ftrace_func_entry *entry) +{ +	hlist_del(&entry->hlist); +	kfree(entry); +	hash->count--; +} + +static void +remove_hash_entry(struct ftrace_hash *hash, +		  struct ftrace_func_entry *entry) +{ +	hlist_del(&entry->hlist); +	hash->count--; +} + +static void ftrace_hash_clear(struct ftrace_hash *hash) +{ +	struct hlist_head *hhd; +	struct hlist_node *tn; +	struct ftrace_func_entry *entry; +	int size = 1 << hash->size_bits; +	int i; + +	if (!hash->count) +		return; + +	for (i = 0; i < size; i++) { +		hhd = &hash->buckets[i]; +		hlist_for_each_entry_safe(entry, tn, hhd, hlist) +			free_hash_entry(hash, entry); +	} +	FTRACE_WARN_ON(hash->count); +} + +static void free_ftrace_hash(struct ftrace_hash *hash) +{ +	if (!hash || hash == EMPTY_HASH) +		return; +	ftrace_hash_clear(hash); +	kfree(hash->buckets); +	kfree(hash); +} + +static void __free_ftrace_hash_rcu(struct rcu_head *rcu) +{ +	struct ftrace_hash *hash; + +	hash = container_of(rcu, struct ftrace_hash, rcu); +	free_ftrace_hash(hash); +} + +static void free_ftrace_hash_rcu(struct ftrace_hash *hash) +{ +	if (!hash || hash == EMPTY_HASH) +		return; +	call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); +} + +void ftrace_free_filter(struct ftrace_ops *ops) +{ +	ftrace_ops_init(ops); +	free_ftrace_hash(ops->filter_hash); +	free_ftrace_hash(ops->notrace_hash); +} + +static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +{ +	struct ftrace_hash *hash; +	int size; + +	hash = kzalloc(sizeof(*hash), GFP_KERNEL); +	if (!hash) +		return NULL; + +	size = 1 << size_bits; +	hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); + +	if (!hash->buckets) { +		kfree(hash); +		return NULL; +	} + +	hash->size_bits = size_bits; + +	return hash; +} + +static struct ftrace_hash * +alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) +{ +	struct ftrace_func_entry *entry; +	struct ftrace_hash *new_hash; +	int size; +	int ret; +	int i; + +	new_hash = alloc_ftrace_hash(size_bits); +	if (!new_hash) +		return NULL; + +	/* Empty hash? */ +	if (ftrace_hash_empty(hash)) +		return new_hash; + +	size = 1 << hash->size_bits; +	for (i = 0; i < size; i++) { +		hlist_for_each_entry(entry, &hash->buckets[i], hlist) { +			ret = add_hash_entry(new_hash, entry->ip); +			if (ret < 0) +				goto free_hash; +		} +	} + +	FTRACE_WARN_ON(new_hash->count != hash->count); + +	return new_hash; + + free_hash: +	free_ftrace_hash(new_hash); +	return NULL; +} + +static void +ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, +		 struct ftrace_hash **dst, struct ftrace_hash *src) +{ +	struct ftrace_func_entry *entry; +	struct hlist_node *tn; +	struct hlist_head *hhd; +	struct ftrace_hash *old_hash; +	struct ftrace_hash *new_hash; +	int size = src->count; +	int bits = 0; +	int ret; +	int i; + +	/* +	 * Remove the current set, update the hash and add +	 * them back. +	 */ +	ftrace_hash_rec_disable(ops, enable); + +	/* +	 * If the new source is empty, just free dst and assign it +	 * the empty_hash. 
+	 */ +	if (!src->count) { +		free_ftrace_hash_rcu(*dst); +		rcu_assign_pointer(*dst, EMPTY_HASH); +		/* still need to update the function records */ +		ret = 0; +		goto out; +	} + +	/* +	 * Make the hash size about 1/2 the # found +	 */ +	for (size /= 2; size; size >>= 1) +		bits++; + +	/* Don't allocate too much */ +	if (bits > FTRACE_HASH_MAX_BITS) +		bits = FTRACE_HASH_MAX_BITS; + +	ret = -ENOMEM; +	new_hash = alloc_ftrace_hash(bits); +	if (!new_hash) +		goto out; + +	size = 1 << src->size_bits; +	for (i = 0; i < size; i++) { +		hhd = &src->buckets[i]; +		hlist_for_each_entry_safe(entry, tn, hhd, hlist) { +			remove_hash_entry(src, entry); +			__add_hash_entry(new_hash, entry); +		} +	} + +	old_hash = *dst; +	rcu_assign_pointer(*dst, new_hash); +	free_ftrace_hash_rcu(old_hash); + +	ret = 0; + out: +	/* +	 * Enable regardless of ret: +	 *  On success, we enable the new hash. +	 *  On failure, we re-enable the original hash. +	 */ +	ftrace_hash_rec_enable(ops, enable); + +	return ret; +} + +/* + * Test the hashes for this ops to see if we want to call + * the ops->func or not. + * + * It's a match if the ip is in the ops->filter_hash or + * the filter_hash does not exist or is empty, + *  AND + * the ip is not in the ops->notrace_hash. + * + * This needs to be called with preemption disabled as + * the hashes are freed with call_rcu_sched(). + */ +static int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) +{ +	struct ftrace_hash *filter_hash; +	struct ftrace_hash *notrace_hash; +	int ret; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +	/* +	 * There's a small race when adding ops that the ftrace handler +	 * that wants regs, may be called without them. We can not +	 * allow that handler to be called if regs is NULL. +	 */ +	if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) +		return 0; +#endif + +	filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); +	notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); + +	if ((ftrace_hash_empty(filter_hash) || +	     ftrace_lookup_ip(filter_hash, ip)) && +	    (ftrace_hash_empty(notrace_hash) || +	     !ftrace_lookup_ip(notrace_hash, ip))) +		ret = 1; +	else +		ret = 0; + +	return ret; +}  /*   * This is a double for. 
Do not use 'break' to break out of the loop, @@ -926,63 +1422,186 @@ static struct dyn_ftrace *ftrace_free_records;  		}				\  	} -static void ftrace_free_rec(struct dyn_ftrace *rec) + +static int ftrace_cmp_recs(const void *a, const void *b)  { -	rec->freelist = ftrace_free_records; -	ftrace_free_records = rec; -	rec->flags |= FTRACE_FL_FREE; +	const struct dyn_ftrace *key = a; +	const struct dyn_ftrace *rec = b; + +	if (key->flags < rec->ip) +		return -1; +	if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) +		return 1; +	return 0;  } -static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) +static unsigned long ftrace_location_range(unsigned long start, unsigned long end)  { +	struct ftrace_page *pg;  	struct dyn_ftrace *rec; +	struct dyn_ftrace key; -	/* First check for freed records */ -	if (ftrace_free_records) { -		rec = ftrace_free_records; - -		if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { -			FTRACE_WARN_ON_ONCE(1); -			ftrace_free_records = NULL; -			return NULL; -		} +	key.ip = start; +	key.flags = end;	/* overload flags, as it is unsigned long */ -		ftrace_free_records = rec->freelist; -		memset(rec, 0, sizeof(*rec)); -		return rec; +	for (pg = ftrace_pages_start; pg; pg = pg->next) { +		if (end < pg->records[0].ip || +		    start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) +			continue; +		rec = bsearch(&key, pg->records, pg->index, +			      sizeof(struct dyn_ftrace), +			      ftrace_cmp_recs); +		if (rec) +			return rec->ip;  	} -	if (ftrace_pages->index == ENTRIES_PER_PAGE) { -		if (!ftrace_pages->next) { -			/* allocate another page */ -			ftrace_pages->next = -				(void *)get_zeroed_page(GFP_KERNEL); -			if (!ftrace_pages->next) -				return NULL; -		} -		ftrace_pages = ftrace_pages->next; -	} +	return 0; +} -	return &ftrace_pages->records[ftrace_pages->index++]; +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns rec->ip if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +unsigned long ftrace_location(unsigned long ip) +{ +	return ftrace_location_range(ip, ip);  } -static struct dyn_ftrace * -ftrace_record_ip(unsigned long ip) +/** + * ftrace_text_reserved - return true if range contains an ftrace location + * @start: start of range to search + * @end: end of range to search (inclusive). @end points to the last byte to check. + * + * Returns 1 if @start and @end contains a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_text_reserved(const void *start, const void *end)  { +	unsigned long ret; + +	ret = ftrace_location_range((unsigned long)start, +				    (unsigned long)end); + +	return (int)!!ret; +} + +static void __ftrace_hash_rec_update(struct ftrace_ops *ops, +				     int filter_hash, +				     bool inc) +{ +	struct ftrace_hash *hash; +	struct ftrace_hash *other_hash; +	struct ftrace_page *pg;  	struct dyn_ftrace *rec; +	int count = 0; +	int all = 0; -	if (ftrace_disabled) -		return NULL; +	/* Only update if the ops has been registered */ +	if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) +		return; -	rec = ftrace_alloc_dyn_node(ip); -	if (!rec) -		return NULL; +	/* +	 * In the filter_hash case: +	 *   If the count is zero, we update all records. 
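ftrace_location() and ftrace_text_reserved() above are the queries other subsystems (kprobes, for example) use to find out whether an address or range overlaps a patchable call site. A minimal sketch of such a check, assuming built-in code; would_patch() is a made-up helper, and the two ftrace calls use the signatures documented above:

/*
 * Sketch only: refuse to patch kernel text that overlaps an ftrace
 * call site.  would_patch() is hypothetical; ftrace_location() and
 * ftrace_text_reserved() are the interfaces documented above.
 */
#include <linux/ftrace.h>
#include <linux/errno.h>
#include <linux/types.h>

static int would_patch(void *addr, size_t len)
{
	/* exact call-site check: non-zero means @addr is an ftrace site */
	if (ftrace_location((unsigned long)addr))
		return -EBUSY;

	/* range check: @end is inclusive, so pass the last byte */
	if (ftrace_text_reserved(addr, addr + len - 1))
		return -EBUSY;

	return 0;
}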
+	 *   Otherwise we just update the items in the hash. +	 * +	 * In the notrace_hash case: +	 *   We enable the update in the hash. +	 *   As disabling notrace means enabling the tracing, +	 *   and enabling notrace means disabling, the inc variable +	 *   gets inversed. +	 */ +	if (filter_hash) { +		hash = ops->filter_hash; +		other_hash = ops->notrace_hash; +		if (ftrace_hash_empty(hash)) +			all = 1; +	} else { +		inc = !inc; +		hash = ops->notrace_hash; +		other_hash = ops->filter_hash; +		/* +		 * If the notrace hash has no items, +		 * then there's nothing to do. +		 */ +		if (ftrace_hash_empty(hash)) +			return; +	} -	rec->ip = ip; -	rec->newlist = ftrace_new_addrs; -	ftrace_new_addrs = rec; +	do_for_each_ftrace_rec(pg, rec) { +		int in_other_hash = 0; +		int in_hash = 0; +		int match = 0; -	return rec; +		if (all) { +			/* +			 * Only the filter_hash affects all records. +			 * Update if the record is not in the notrace hash. +			 */ +			if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) +				match = 1; +		} else { +			in_hash = !!ftrace_lookup_ip(hash, rec->ip); +			in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); + +			/* +			 * If filter_hash is set, we want to match all functions +			 * that are in the hash but not in the other hash. +			 * +			 * If filter_hash is not set, then we are decrementing. +			 * That means we match anything that is in the hash +			 * and also in the other_hash. That is, we need to turn +			 * off functions in the other hash because they are disabled +			 * by this hash. +			 */ +			if (filter_hash && in_hash && !in_other_hash) +				match = 1; +			else if (!filter_hash && in_hash && +				 (in_other_hash || ftrace_hash_empty(other_hash))) +				match = 1; +		} +		if (!match) +			continue; + +		if (inc) { +			rec->flags++; +			if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) +				return; +			/* +			 * If any ops wants regs saved for this function +			 * then all ops will get saved regs. +			 */ +			if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) +				rec->flags |= FTRACE_FL_REGS; +		} else { +			if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) +				return; +			rec->flags--; +		} +		count++; +		/* Shortcut, if we handled all records, we are done. */ +		if (!all && count == hash->count) +			return; +	} while_for_each_ftrace_rec(); +} + +static void ftrace_hash_rec_disable(struct ftrace_ops *ops, +				    int filter_hash) +{ +	__ftrace_hash_rec_update(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable(struct ftrace_ops *ops, +				   int filter_hash) +{ +	__ftrace_hash_rec_update(ops, filter_hash, 1);  }  static void print_ip_ins(const char *fmt, unsigned char *p) @@ -995,7 +1614,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p)  		printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);  } -static void ftrace_bug(int failed, unsigned long ip) +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @ip: The address that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying the code. 
@failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writting to the @ip address + */ +void ftrace_bug(int failed, unsigned long ip)  {  	switch (failed) {  	case -EFAULT: @@ -1022,76 +1653,183 @@ static void ftrace_bug(int failed, unsigned long ip)  	}  } +static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) +{ +	unsigned long flag = 0UL; + +	/* +	 * If we are updating calls: +	 * +	 *   If the record has a ref count, then we need to enable it +	 *   because someone is using it. +	 * +	 *   Otherwise we make sure its disabled. +	 * +	 * If we are disabling calls, then disable all records that +	 * are enabled. +	 */ +	if (enable && (rec->flags & ~FTRACE_FL_MASK)) +		flag = FTRACE_FL_ENABLED; + +	/* +	 * If enabling and the REGS flag does not match the REGS_EN, then +	 * do not ignore this record. Set flags to fail the compare against +	 * ENABLED. +	 */ +	if (flag && +	    (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) +		flag |= FTRACE_FL_REGS; + +	/* If the state of this record hasn't changed, then do nothing */ +	if ((rec->flags & FTRACE_FL_ENABLED) == flag) +		return FTRACE_UPDATE_IGNORE; + +	if (flag) { +		/* Save off if rec is being enabled (for return value) */ +		flag ^= rec->flags & FTRACE_FL_ENABLED; + +		if (update) { +			rec->flags |= FTRACE_FL_ENABLED; +			if (flag & FTRACE_FL_REGS) { +				if (rec->flags & FTRACE_FL_REGS) +					rec->flags |= FTRACE_FL_REGS_EN; +				else +					rec->flags &= ~FTRACE_FL_REGS_EN; +			} +		} + +		/* +		 * If this record is being updated from a nop, then +		 *   return UPDATE_MAKE_CALL. +		 * Otherwise, +		 *   return UPDATE_MODIFY_CALL to tell the caller to convert +		 *   from the save regs, to a non-save regs function or +		 *   vice versa. +		 */ +		if (flag & FTRACE_FL_ENABLED) +			return FTRACE_UPDATE_MAKE_CALL; + +		return FTRACE_UPDATE_MODIFY_CALL; +	} + +	if (update) { +		/* If there's no more users, clear all flags */ +		if (!(rec->flags & ~FTRACE_FL_MASK)) +			rec->flags = 0; +		else +			/* Just disable the record (keep REGS state) */ +			rec->flags &= ~FTRACE_FL_ENABLED; +	} -/* Return 1 if the address range is reserved for ftrace */ -int ftrace_text_reserved(void *start, void *end) +	return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record, set a record that now is tracing or not + * @rec: the record to update + * @enable: set to 1 if the record is tracing, zero to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. + */ +int ftrace_update_record(struct dyn_ftrace *rec, int enable)  { -	struct dyn_ftrace *rec; -	struct ftrace_page *pg; +	return ftrace_check_record(rec, enable, 1); +} -	do_for_each_ftrace_rec(pg, rec) { -		if (rec->ip <= (unsigned long)end && -		    rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) -			return 1; -	} while_for_each_ftrace_rec(); -	return 0; +/** + * ftrace_test_record, check if the record has been enabled or not + * @rec: the record to test + * @enable: set to 1 to check if enabled, 0 if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. 
+ */ +int ftrace_test_record(struct dyn_ftrace *rec, int enable) +{ +	return ftrace_check_record(rec, enable, 0);  } +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec:  The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ +	if (rec->flags & FTRACE_FL_REGS) +		return (unsigned long)FTRACE_REGS_ADDR; +	else +		return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec:  The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ +	if (rec->flags & FTRACE_FL_REGS_EN) +		return (unsigned long)FTRACE_REGS_ADDR; +	else +		return (unsigned long)FTRACE_ADDR; +}  static int  __ftrace_replace_code(struct dyn_ftrace *rec, int enable)  { +	unsigned long ftrace_old_addr;  	unsigned long ftrace_addr; -	unsigned long flag = 0UL; +	int ret; -	ftrace_addr = (unsigned long)FTRACE_ADDR; +	ftrace_addr = ftrace_get_addr_new(rec); -	/* -	 * If this record is not to be traced or we want to disable it, -	 * then disable it. -	 * -	 * If we want to enable it and filtering is off, then enable it. -	 * -	 * If we want to enable it and filtering is on, enable it only if -	 * it's filtered -	 */ -	if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { -		if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) -			flag = FTRACE_FL_ENABLED; -	} +	/* This needs to be done before we call ftrace_update_record */ +	ftrace_old_addr = ftrace_get_addr_curr(rec); -	/* If the state of this record hasn't changed, then do nothing */ -	if ((rec->flags & FTRACE_FL_ENABLED) == flag) +	ret = ftrace_update_record(rec, enable); + +	switch (ret) { +	case FTRACE_UPDATE_IGNORE:  		return 0; -	if (flag) { -		rec->flags |= FTRACE_FL_ENABLED; +	case FTRACE_UPDATE_MAKE_CALL:  		return ftrace_make_call(rec, ftrace_addr); + +	case FTRACE_UPDATE_MAKE_NOP: +		return ftrace_make_nop(NULL, rec, ftrace_addr); + +	case FTRACE_UPDATE_MODIFY_CALL: +		return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);  	} -	rec->flags &= ~FTRACE_FL_ENABLED; -	return ftrace_make_nop(NULL, rec, ftrace_addr); +	return -1; /* unknow ftrace bug */  } -static void ftrace_replace_code(int enable) +void __weak ftrace_replace_code(int enable)  {  	struct dyn_ftrace *rec;  	struct ftrace_page *pg;  	int failed; -	do_for_each_ftrace_rec(pg, rec) { -		/* -		 * Skip over free records, records that have -		 * failed and not converted. 
-		 */ -		if (rec->flags & FTRACE_FL_FREE || -		    rec->flags & FTRACE_FL_FAILED || -		    !(rec->flags & FTRACE_FL_CONVERTED)) -			continue; +	if (unlikely(ftrace_disabled)) +		return; +	do_for_each_ftrace_rec(pg, rec) {  		failed = __ftrace_replace_code(rec, enable);  		if (failed) { -			rec->flags |= FTRACE_FL_FAILED;  			ftrace_bug(failed, rec->ip);  			/* Stop processing */  			return; @@ -1099,6 +1837,78 @@ static void ftrace_replace_code(int enable)  	} while_for_each_ftrace_rec();  } +struct ftrace_rec_iter { +	struct ftrace_page	*pg; +	int			index; +}; + +/** + * ftrace_rec_iter_start, start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. + * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ +	/* +	 * We only use a single iterator. +	 * Protected by the ftrace_lock mutex. +	 */ +	static struct ftrace_rec_iter ftrace_rec_iter; +	struct ftrace_rec_iter *iter = &ftrace_rec_iter; + +	iter->pg = ftrace_pages_start; +	iter->index = 0; + +	/* Could have empty pages */ +	while (iter->pg && !iter->pg->index) +		iter->pg = iter->pg->next; + +	if (!iter->pg) +		return NULL; + +	return iter; +} + +/** + * ftrace_rec_iter_next, get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. + */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ +	iter->index++; + +	if (iter->index >= iter->pg->index) { +		iter->pg = iter->pg->next; +		iter->index = 0; + +		/* Could have empty pages */ +		while (iter->pg && !iter->pg->index) +			iter->pg = iter->pg->next; +	} + +	if (!iter->pg) +		return NULL; + +	return iter; +} + +/** + * ftrace_rec_iter_record, get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ +	return &iter->pg->records[iter->index]; +} +  static int  ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)  { @@ -1107,10 +1917,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)  	ip = rec->ip; +	if (unlikely(ftrace_disabled)) +		return 0; +  	ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);  	if (ret) {  		ftrace_bug(ret, ip); -		rec->flags |= FTRACE_FL_FAILED;  		return 0;  	}  	return 1; @@ -1134,26 +1946,83 @@ int __weak ftrace_arch_code_modify_post_process(void)  	return 0;  } -static int __ftrace_modify_code(void *data) +void ftrace_modify_all_code(int command)  { -	int *command = data; +	int update = command & FTRACE_UPDATE_TRACE_FUNC; +	int err = 0; + +	/* +	 * If the ftrace_caller calls a ftrace_ops func directly, +	 * we need to make sure that it only traces functions it +	 * expects to trace. When doing the switch of functions, +	 * we need to update to the ftrace_ops_list_func first +	 * before the transition between old and new calls are set, +	 * as the ftrace_ops_list_func will check the ops hashes +	 * to make sure the ops are having the right functions +	 * traced. 
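The three ftrace_rec_iter helpers above exist so that arch code can walk every patchable call site without poking into ftrace_page internals. A sketch of the intended loop, assuming it runs from code that already holds ftrace_lock (the iterator is a single static instance, as the comment in ftrace_rec_iter_start() notes):

/*
 * Sketch only: visit every ftrace record via the iterator above.
 * Real arch code would patch the site instead of printing it.
 */
#include <linux/ftrace.h>
#include <linux/printk.h>

static void walk_all_records(void)
{
	struct ftrace_rec_iter *iter;
	struct dyn_ftrace *rec;

	for (iter = ftrace_rec_iter_start(); iter;
	     iter = ftrace_rec_iter_next(iter)) {
		rec = ftrace_rec_iter_record(iter);
		pr_debug("ftrace call site at %ps\n", (void *)rec->ip);
	}
}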
+	 */ +	if (update) { +		err = ftrace_update_ftrace_func(ftrace_ops_list_func); +		if (FTRACE_WARN_ON(err)) +			return; +	} -	if (*command & FTRACE_ENABLE_CALLS) +	if (command & FTRACE_UPDATE_CALLS)  		ftrace_replace_code(1); -	else if (*command & FTRACE_DISABLE_CALLS) +	else if (command & FTRACE_DISABLE_CALLS)  		ftrace_replace_code(0); -	if (*command & FTRACE_UPDATE_TRACE_FUNC) -		ftrace_update_ftrace_func(ftrace_trace_function); +	if (update && ftrace_trace_function != ftrace_ops_list_func) { +		function_trace_op = set_function_trace_op; +		smp_wmb(); +		/* If irqs are disabled, we are in stop machine */ +		if (!irqs_disabled()) +			smp_call_function(ftrace_sync_ipi, NULL, 1); +		err = ftrace_update_ftrace_func(ftrace_trace_function); +		if (FTRACE_WARN_ON(err)) +			return; +	} -	if (*command & FTRACE_START_FUNC_RET) -		ftrace_enable_ftrace_graph_caller(); -	else if (*command & FTRACE_STOP_FUNC_RET) -		ftrace_disable_ftrace_graph_caller(); +	if (command & FTRACE_START_FUNC_RET) +		err = ftrace_enable_ftrace_graph_caller(); +	else if (command & FTRACE_STOP_FUNC_RET) +		err = ftrace_disable_ftrace_graph_caller(); +	FTRACE_WARN_ON(err); +} + +static int __ftrace_modify_code(void *data) +{ +	int *command = data; + +	ftrace_modify_all_code(*command);  	return 0;  } +/** + * ftrace_run_stop_machine, go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. + */ +void ftrace_run_stop_machine(int command) +{ +	stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code, modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ +	ftrace_run_stop_machine(command); +} +  static void ftrace_run_update_code(int command)  {  	int ret; @@ -1162,8 +2031,21 @@ static void ftrace_run_update_code(int command)  	FTRACE_WARN_ON(ret);  	if (ret)  		return; +	/* +	 * Do not call function tracer while we update the code. +	 * We are in stop machine. +	 */ +	function_trace_stop++; -	stop_machine(__ftrace_modify_code, &command, NULL); +	/* +	 * By default we use stop_machine() to modify the code. +	 * But archs can do what ever they want as long as it +	 * is safe. The stop_machine() is the safest, but also +	 * produces the most overhead. 
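Because arch_ftrace_update_code() is weak, an architecture whose text-poking primitive is already safe against concurrent execution can skip stop_machine() entirely. A hypothetical override (not taken from any in-tree arch) reduces to a direct call into the generic helper:

/*
 * Hypothetical arch override: let the generic code drive the update
 * without stop_machine(), relying on the arch's own safe patching in
 * ftrace_make_call()/ftrace_make_nop()/ftrace_modify_call().
 */
#include <linux/ftrace.h>

void arch_ftrace_update_code(int command)
{
	ftrace_modify_all_code(command);
}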
+	 */ +	arch_ftrace_update_code(command); + +	function_trace_stop--;  	ret = ftrace_arch_code_modify_post_process();  	FTRACE_WARN_ON(ret); @@ -1171,6 +2053,12 @@ static void ftrace_run_update_code(int command)  static ftrace_func_t saved_ftrace_func;  static int ftrace_start_up; +static int global_start_up; + +static void control_ops_free(struct ftrace_ops *ops) +{ +	free_percpu(ops->disabled); +}  static void ftrace_startup_enable(int command)  { @@ -1185,21 +2073,39 @@ static void ftrace_startup_enable(int command)  	ftrace_run_update_code(command);  } -static void ftrace_startup(int command) +static int ftrace_startup(struct ftrace_ops *ops, int command)  { +	int ret; +  	if (unlikely(ftrace_disabled)) -		return; +		return -ENODEV; + +	ret = __register_ftrace_function(ops); +	if (ret) +		return ret;  	ftrace_start_up++; -	command |= FTRACE_ENABLE_CALLS; +	command |= FTRACE_UPDATE_CALLS; + +	ops->flags |= FTRACE_OPS_FL_ENABLED; + +	ftrace_hash_rec_enable(ops, 1);  	ftrace_startup_enable(command); + +	return 0;  } -static void ftrace_shutdown(int command) +static int ftrace_shutdown(struct ftrace_ops *ops, int command)  { +	int ret; +  	if (unlikely(ftrace_disabled)) -		return; +		return -ENODEV; + +	ret = __unregister_ftrace_function(ops); +	if (ret) +		return ret;  	ftrace_start_up--;  	/* @@ -1209,18 +2115,54 @@ static void ftrace_shutdown(int command)  	 */  	WARN_ON_ONCE(ftrace_start_up < 0); -	if (!ftrace_start_up) -		command |= FTRACE_DISABLE_CALLS; +	ftrace_hash_rec_disable(ops, 1); + +	if (!global_start_up) +		ops->flags &= ~FTRACE_OPS_FL_ENABLED; + +	command |= FTRACE_UPDATE_CALLS;  	if (saved_ftrace_func != ftrace_trace_function) {  		saved_ftrace_func = ftrace_trace_function;  		command |= FTRACE_UPDATE_TRACE_FUNC;  	} -	if (!command || !ftrace_enabled) -		return; +	if (!command || !ftrace_enabled) { +		/* +		 * If these are control ops, they still need their +		 * per_cpu field freed. Since, function tracing is +		 * not currently active, we can just free them +		 * without synchronizing all CPUs. +		 */ +		if (ops->flags & FTRACE_OPS_FL_CONTROL) +			control_ops_free(ops); +		return 0; +	}  	ftrace_run_update_code(command); + +	/* +	 * Dynamic ops may be freed, we must make sure that all +	 * callers are done before leaving this function. +	 * The same goes for freeing the per_cpu data of the control +	 * ops. +	 * +	 * Again, normal synchronize_sched() is not good enough. +	 * We need to do a hard force of sched synchronization. +	 * This is because we use preempt_disable() to do RCU, but +	 * the function tracers can be called where RCU is not watching +	 * (like before user_exit()). We can not rely on the RCU +	 * infrastructure to do the synchronization, thus we must do it +	 * ourselves. 
+	 */ +	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { +		schedule_on_each_cpu(ftrace_sync); + +		if (ops->flags & FTRACE_OPS_FL_CONTROL) +			control_ops_free(ops); +	} + +	return 0;  }  static void ftrace_startup_sysctl(void) @@ -1232,7 +2174,7 @@ static void ftrace_startup_sysctl(void)  	saved_ftrace_func = NULL;  	/* ftrace_start_up is true if we want ftrace running */  	if (ftrace_start_up) -		ftrace_run_update_code(FTRACE_ENABLE_CALLS); +		ftrace_run_update_code(FTRACE_UPDATE_CALLS);  }  static void ftrace_shutdown_sysctl(void) @@ -1246,115 +2188,226 @@ static void ftrace_shutdown_sysctl(void)  }  static cycle_t		ftrace_update_time; -static unsigned long	ftrace_update_cnt;  unsigned long		ftrace_update_tot_cnt; -static int ftrace_update_code(struct module *mod) +static inline int ops_traces_mod(struct ftrace_ops *ops) +{ +	/* +	 * Filter_hash being empty will default to trace module. +	 * But notrace hash requires a test of individual module functions. +	 */ +	return ftrace_hash_empty(ops->filter_hash) && +		ftrace_hash_empty(ops->notrace_hash); +} + +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static inline bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)  { +	/* If ops isn't enabled, ignore it */ +	if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) +		return 0; + +	/* If ops traces all mods, we already accounted for it */ +	if (ops_traces_mod(ops)) +		return 0; + +	/* The function must be in the filter */ +	if (!ftrace_hash_empty(ops->filter_hash) && +	    !ftrace_lookup_ip(ops->filter_hash, rec->ip)) +		return 0; + +	/* If in notrace hash, we ignore it too */ +	if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) +		return 0; + +	return 1; +} + +static int referenced_filters(struct dyn_ftrace *rec) +{ +	struct ftrace_ops *ops; +	int cnt = 0; + +	for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { +		if (ops_references_rec(ops, rec)) +		    cnt++; +	} + +	return cnt; +} + +static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) +{ +	struct ftrace_page *pg;  	struct dyn_ftrace *p;  	cycle_t start, stop; +	unsigned long update_cnt = 0; +	unsigned long ref = 0; +	bool test = false; +	int i; + +	/* +	 * When adding a module, we need to check if tracers are +	 * currently enabled and if they are set to trace all functions. +	 * If they are, we need to enable the module functions as well +	 * as update the reference counts for those function records. 
+	 */ +	if (mod) { +		struct ftrace_ops *ops; + +		for (ops = ftrace_ops_list; +		     ops != &ftrace_list_end; ops = ops->next) { +			if (ops->flags & FTRACE_OPS_FL_ENABLED) { +				if (ops_traces_mod(ops)) +					ref++; +				else +					test = true; +			} +		} +	}  	start = ftrace_now(raw_smp_processor_id()); -	ftrace_update_cnt = 0; -	while (ftrace_new_addrs) { +	for (pg = new_pgs; pg; pg = pg->next) { -		/* If something went wrong, bail without enabling anything */ -		if (unlikely(ftrace_disabled)) -			return -1; +		for (i = 0; i < pg->index; i++) { +			int cnt = ref; -		p = ftrace_new_addrs; -		ftrace_new_addrs = p->newlist; -		p->flags = 0L; +			/* If something went wrong, bail without enabling anything */ +			if (unlikely(ftrace_disabled)) +				return -1; -		/* -		 * Do the initial record convertion from mcount jump -		 * to the NOP instructions. -		 */ -		if (!ftrace_code_disable(mod, p)) { -			ftrace_free_rec(p); -			continue; -		} +			p = &pg->records[i]; +			if (test) +				cnt += referenced_filters(p); +			p->flags = cnt; -		p->flags |= FTRACE_FL_CONVERTED; -		ftrace_update_cnt++; +			/* +			 * Do the initial record conversion from mcount jump +			 * to the NOP instructions. +			 */ +			if (!ftrace_code_disable(mod, p)) +				break; -		/* -		 * If the tracing is enabled, go ahead and enable the record. -		 * -		 * The reason not to enable the record immediatelly is the -		 * inherent check of ftrace_make_nop/ftrace_make_call for -		 * correct previous instructions.  Making first the NOP -		 * conversion puts the module to the correct state, thus -		 * passing the ftrace_make_call check. -		 */ -		if (ftrace_start_up) { -			int failed = __ftrace_replace_code(p, 1); -			if (failed) { -				ftrace_bug(failed, p->ip); -				ftrace_free_rec(p); +			update_cnt++; + +			/* +			 * If the tracing is enabled, go ahead and enable the record. +			 * +			 * The reason not to enable the record immediatelly is the +			 * inherent check of ftrace_make_nop/ftrace_make_call for +			 * correct previous instructions.  Making first the NOP +			 * conversion puts the module to the correct state, thus +			 * passing the ftrace_make_call check. +			 */ +			if (ftrace_start_up && cnt) { +				int failed = __ftrace_replace_code(p, 1); +				if (failed) +					ftrace_bug(failed, p->ip);  			}  		}  	}  	stop = ftrace_now(raw_smp_processor_id());  	ftrace_update_time = stop - start; -	ftrace_update_tot_cnt += ftrace_update_cnt; +	ftrace_update_tot_cnt += update_cnt;  	return 0;  } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +static int ftrace_allocate_records(struct ftrace_page *pg, int count)  { -	struct ftrace_page *pg; +	int order;  	int cnt; -	int i; -	/* allocate a few pages */ -	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); -	if (!ftrace_pages_start) -		return -1; +	if (WARN_ON(!count)) +		return -EINVAL; + +	order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));  	/* -	 * Allocate a few more pages. -	 * -	 * TODO: have some parser search vmlinux before -	 *   final linking to find all calls to ftrace. -	 *   Then we can: -	 *    a) know how many pages to allocate. -	 *     and/or -	 *    b) set up the table then. -	 * -	 *  The dynamic code is still necessary for -	 *  modules. +	 * We want to fill as much as possible. No more than a page +	 * may be empty.  	 
*/ +	while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) +		order--; -	pg = ftrace_pages = ftrace_pages_start; + again: +	pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + +	if (!pg->records) { +		/* if we can't allocate this size, try something smaller */ +		if (!order) +			return -ENOMEM; +		order >>= 1; +		goto again; +	} -	cnt = num_to_init / ENTRIES_PER_PAGE; -	pr_info("ftrace: allocating %ld entries in %d pages\n", -		num_to_init, cnt + 1); +	cnt = (PAGE_SIZE << order) / ENTRY_SIZE; +	pg->size = cnt; -	for (i = 0; i < cnt; i++) { -		pg->next = (void *)get_zeroed_page(GFP_KERNEL); +	if (cnt > count) +		cnt = count; -		/* If we fail, we'll try later anyway */ -		if (!pg->next) +	return cnt; +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ +	struct ftrace_page *start_pg; +	struct ftrace_page *pg; +	int order; +	int cnt; + +	if (!num_to_init) +		return 0; + +	start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); +	if (!pg) +		return NULL; + +	/* +	 * Try to allocate as much as possible in one continues +	 * location that fills in all of the space. We want to +	 * waste as little space as possible. +	 */ +	for (;;) { +		cnt = ftrace_allocate_records(pg, num_to_init); +		if (cnt < 0) +			goto free_pages; + +		num_to_init -= cnt; +		if (!num_to_init)  			break; +		pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); +		if (!pg->next) +			goto free_pages; +  		pg = pg->next;  	} -	return 0; -} +	return start_pg; -enum { -	FTRACE_ITER_FILTER	= (1 << 0), -	FTRACE_ITER_NOTRACE	= (1 << 1), -	FTRACE_ITER_FAILURES	= (1 << 2), -	FTRACE_ITER_PRINTALL	= (1 << 3), -	FTRACE_ITER_HASH	= (1 << 4), -}; + free_pages: +	while (start_pg) { +		order = get_count_order(pg->size / ENTRIES_PER_PAGE); +		free_pages((unsigned long)pg->records, order); +		start_pg = pg->next; +		kfree(pg); +		pg = start_pg; +	} +	pr_info("ftrace: FAILED to allocate memory for functions\n"); +	return NULL; +}  #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1365,6 +2418,8 @@ struct ftrace_iterator {  	struct dyn_ftrace		*func;  	struct ftrace_func_probe	*probe;  	struct trace_parser		parser; +	struct ftrace_hash		*hash; +	struct ftrace_ops		*ops;  	int				hidx;  	int				idx;  	unsigned			flags; @@ -1418,6 +2473,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)  	void *p = NULL;  	loff_t l; +	if (!(iter->flags & FTRACE_ITER_DO_HASH)) +		return NULL; +  	if (iter->func_pos > *pos)  		return NULL; @@ -1461,13 +2519,17 @@ static void *  t_next(struct seq_file *m, void *v, loff_t *pos)  {  	struct ftrace_iterator *iter = m->private; +	struct ftrace_ops *ops = iter->ops;  	struct dyn_ftrace *rec = NULL; +	if (unlikely(ftrace_disabled)) +		return NULL; +  	if (iter->flags & FTRACE_ITER_HASH)  		return t_hash_next(m, pos);  	(*pos)++; -	iter->pos = *pos; +	iter->pos = iter->func_pos = *pos;  	if (iter->flags & FTRACE_ITER_PRINTALL)  		return t_hash_start(m, pos); @@ -1481,19 +2543,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  		}  	} else {  		rec = &iter->pg->records[iter->idx++]; -		if ((rec->flags & FTRACE_FL_FREE) || +		if (((iter->flags & FTRACE_ITER_FILTER) && +		     !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || -		    (!(iter->flags & FTRACE_ITER_FAILURES) && -		     (rec->flags & FTRACE_FL_FAILED)) || - -		    ((iter->flags & FTRACE_ITER_FAILURES) && -		     !(rec->flags & FTRACE_FL_FAILED)) || +		    ((iter->flags & FTRACE_ITER_NOTRACE) && +		     !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || -		    
((iter->flags & FTRACE_ITER_FILTER) && -		     !(rec->flags & FTRACE_FL_FILTER)) || +		    ((iter->flags & FTRACE_ITER_ENABLED) && +		     !(rec->flags & FTRACE_FL_ENABLED))) { -		    ((iter->flags & FTRACE_ITER_NOTRACE) && -		     !(rec->flags & FTRACE_FL_NOTRACE))) {  			rec = NULL;  			goto retry;  		} @@ -1502,7 +2560,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  	if (!rec)  		return t_hash_start(m, pos); -	iter->func_pos = *pos;  	iter->func = rec;  	return iter; @@ -1512,16 +2569,21 @@ static void reset_iter_read(struct ftrace_iterator *iter)  {  	iter->pos = 0;  	iter->func_pos = 0; -	iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); +	iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);  }  static void *t_start(struct seq_file *m, loff_t *pos)  {  	struct ftrace_iterator *iter = m->private; +	struct ftrace_ops *ops = iter->ops;  	void *p = NULL;  	loff_t l;  	mutex_lock(&ftrace_lock); + +	if (unlikely(ftrace_disabled)) +		return NULL; +  	/*  	 * If an lseek was done, then reset and start from beginning.  	 */ @@ -1533,7 +2595,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)  	 * off, we can short cut and just print out that all  	 * functions are enabled.  	 */ -	if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { +	if (iter->flags & FTRACE_ITER_FILTER && +	    ftrace_hash_empty(ops->filter_hash)) {  		if (*pos > 0)  			return t_hash_start(m, pos);  		iter->flags |= FTRACE_ITER_PRINTALL; @@ -1558,12 +2621,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)  			break;  	} -	if (!p) { -		if (iter->flags & FTRACE_ITER_FILTER) -			return t_hash_start(m, pos); - -		return NULL; -	} +	if (!p) +		return t_hash_start(m, pos);  	return iter;  } @@ -1591,7 +2650,12 @@ static int t_show(struct seq_file *m, void *v)  	if (!rec)  		return 0; -	seq_printf(m, "%ps\n", (void *)rec->ip); +	seq_printf(m, "%ps", (void *)rec->ip); +	if (iter->flags & FTRACE_ITER_ENABLED) +		seq_printf(m, " (%ld)%s", +			   rec->flags & ~FTRACE_FL_MASK, +			   rec->flags & FTRACE_FL_REGS ? " R" : ""); +	seq_printf(m, "\n");  	return 0;  } @@ -1607,70 +2671,70 @@ static int  ftrace_avail_open(struct inode *inode, struct file *file)  {  	struct ftrace_iterator *iter; -	int ret;  	if (unlikely(ftrace_disabled))  		return -ENODEV; -	iter = kzalloc(sizeof(*iter), GFP_KERNEL); -	if (!iter) -		return -ENOMEM; - -	iter->pg = ftrace_pages_start; - -	ret = seq_open(file, &show_ftrace_seq_ops); -	if (!ret) { -		struct seq_file *m = file->private_data; - -		m->private = iter; -	} else { -		kfree(iter); +	iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); +	if (iter) { +		iter->pg = ftrace_pages_start; +		iter->ops = &global_ops;  	} -	return ret; +	return iter ? 0 : -ENOMEM;  }  static int -ftrace_failures_open(struct inode *inode, struct file *file) +ftrace_enabled_open(struct inode *inode, struct file *file)  { -	int ret; -	struct seq_file *m;  	struct ftrace_iterator *iter; -	ret = ftrace_avail_open(inode, file); -	if (!ret) { -		m = file->private_data; -		iter = m->private; -		iter->flags = FTRACE_ITER_FAILURES; +	if (unlikely(ftrace_disabled)) +		return -ENODEV; + +	iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); +	if (iter) { +		iter->pg = ftrace_pages_start; +		iter->flags = FTRACE_ITER_ENABLED; +		iter->ops = &global_ops;  	} -	return ret; +	return iter ? 
0 : -ENOMEM;  } - -static void ftrace_filter_reset(int enable) +static void ftrace_filter_reset(struct ftrace_hash *hash)  { -	struct ftrace_page *pg; -	struct dyn_ftrace *rec; -	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; -  	mutex_lock(&ftrace_lock); -	if (enable) -		ftrace_filtered = 0; -	do_for_each_ftrace_rec(pg, rec) { -		if (rec->flags & FTRACE_FL_FAILED) -			continue; -		rec->flags &= ~type; -	} while_for_each_ftrace_rec(); +	ftrace_hash_clear(hash);  	mutex_unlock(&ftrace_lock);  } -static int -ftrace_regex_open(struct inode *inode, struct file *file, int enable) +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. Depending on @flag it may process the filter hash or + * the notrace hash of @ops. With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * tracing_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). + */ +int +ftrace_regex_open(struct ftrace_ops *ops, int flag, +		  struct inode *inode, struct file *file)  {  	struct ftrace_iterator *iter; +	struct ftrace_hash *hash;  	int ret = 0; +	ftrace_ops_init(ops); +  	if (unlikely(ftrace_disabled))  		return -ENODEV; @@ -1683,27 +2747,48 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)  		return -ENOMEM;  	} -	mutex_lock(&ftrace_regex_lock); +	iter->ops = ops; +	iter->flags = flag; + +	mutex_lock(&ops->regex_lock); + +	if (flag & FTRACE_ITER_NOTRACE) +		hash = ops->notrace_hash; +	else +		hash = ops->filter_hash; + +	if (file->f_mode & FMODE_WRITE) { +		iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); +		if (!iter->hash) { +			trace_parser_put(&iter->parser); +			kfree(iter); +			ret = -ENOMEM; +			goto out_unlock; +		} +	} +  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) -		ftrace_filter_reset(enable); +		ftrace_filter_reset(iter->hash);  	if (file->f_mode & FMODE_READ) {  		iter->pg = ftrace_pages_start; -		iter->flags = enable ? 
FTRACE_ITER_FILTER : -			FTRACE_ITER_NOTRACE;  		ret = seq_open(file, &show_ftrace_seq_ops);  		if (!ret) {  			struct seq_file *m = file->private_data;  			m->private = iter;  		} else { +			/* Failed */ +			free_ftrace_hash(iter->hash);  			trace_parser_put(&iter->parser);  			kfree(iter);  		}  	} else  		file->private_data = iter; -	mutex_unlock(&ftrace_regex_lock); + + out_unlock: +	mutex_unlock(&ops->regex_lock);  	return ret;  } @@ -1711,26 +2796,20 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)  static int  ftrace_filter_open(struct inode *inode, struct file *file)  { -	return ftrace_regex_open(inode, file, 1); +	struct ftrace_ops *ops = inode->i_private; + +	return ftrace_regex_open(ops, +			FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, +			inode, file);  }  static int  ftrace_notrace_open(struct inode *inode, struct file *file)  { -	return ftrace_regex_open(inode, file, 0); -} - -static loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) -{ -	loff_t ret; - -	if (file->f_mode & FMODE_READ) -		ret = seq_lseek(file, offset, origin); -	else -		file->f_pos = ret = 1; +	struct ftrace_ops *ops = inode->i_private; -	return ret; +	return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, +				 inode, file);  }  static int ftrace_match(char *str, char *regex, int len, int type) @@ -1762,86 +2841,98 @@ static int ftrace_match(char *str, char *regex, int len, int type)  }  static int -ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) +{ +	struct ftrace_func_entry *entry; +	int ret = 0; + +	entry = ftrace_lookup_ip(hash, rec->ip); +	if (not) { +		/* Do nothing if it doesn't exist */ +		if (!entry) +			return 0; + +		free_hash_entry(hash, entry); +	} else { +		/* Do nothing if it exists */ +		if (entry) +			return 0; + +		ret = add_hash_entry(hash, rec->ip); +	} +	return ret; +} + +static int +ftrace_match_record(struct dyn_ftrace *rec, char *mod, +		    char *regex, int len, int type)  {  	char str[KSYM_SYMBOL_LEN]; +	char *modname; + +	kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + +	if (mod) { +		/* module lookup requires matching the module */ +		if (!modname || strcmp(modname, mod)) +			return 0; + +		/* blank search means to match all funcs in the mod */ +		if (!len) +			return 1; +	} -	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);  	return ftrace_match(str, regex, len, type);  } -static int ftrace_match_records(char *buff, int len, int enable) +static int +match_records(struct ftrace_hash *hash, char *buff, +	      int len, char *mod, int not)  { -	unsigned int search_len; +	unsigned search_len = 0;  	struct ftrace_page *pg;  	struct dyn_ftrace *rec; -	unsigned long flag; -	char *search; -	int type; -	int not; +	int type = MATCH_FULL; +	char *search = buff;  	int found = 0; +	int ret; -	flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; -	type = filter_parse_regex(buff, len, &search, ¬); - -	search_len = strlen(search); +	if (len) { +		type = filter_parse_regex(buff, len, &search, ¬); +		search_len = strlen(search); +	}  	mutex_lock(&ftrace_lock); -	do_for_each_ftrace_rec(pg, rec) { -		if (rec->flags & FTRACE_FL_FAILED) -			continue; +	if (unlikely(ftrace_disabled)) +		goto out_unlock; -		if (ftrace_match_record(rec, search, search_len, type)) { -			if (not) -				rec->flags &= ~flag; -			else -				rec->flags |= flag; +	do_for_each_ftrace_rec(pg, rec) { +		if (ftrace_match_record(rec, mod, search, search_len, type)) { +			ret = enter_record(hash, rec, not); +			if (ret < 0) { +				found = ret; +				goto out_unlock; +			}  			found = 1;  		} -		/* -		 * Only enable filtering if we have a function that -		 * is filtered on. -		 */ -		if (enable && (rec->flags & FTRACE_FL_FILTER)) -			ftrace_filtered = 1;  	} while_for_each_ftrace_rec(); + out_unlock:  	mutex_unlock(&ftrace_lock);  	return found;  }  static int -ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, -			   char *regex, int len, int type) +ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)  { -	char str[KSYM_SYMBOL_LEN]; -	char *modname; - -	kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); - -	if (!modname || strcmp(modname, mod)) -		return 0; - -	/* blank search means to match all funcs in the mod */ -	if (len) -		return ftrace_match(str, regex, len, type); -	else -		return 1; +	return match_records(hash, buff, len, NULL, 0);  } -static int ftrace_match_module_records(char *buff, char *mod, int enable) +static int +ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)  { -	unsigned search_len = 0; -	struct ftrace_page *pg; -	struct dyn_ftrace *rec; -	int type = MATCH_FULL; -	char *search = buff; -	unsigned long flag;  	int not = 0; -	int found = 0; - -	flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;  	/* blank or '*' mean the same */  	if (strcmp(buff, "*") == 0) @@ -1853,32 +2944,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)  		not = 1;  	} -	if (strlen(buff)) { -		type = filter_parse_regex(buff, strlen(buff), &search, ¬); -		search_len = strlen(search); -	} - -	mutex_lock(&ftrace_lock); -	do_for_each_ftrace_rec(pg, rec) { - -		if (rec->flags & FTRACE_FL_FAILED) -			continue; - -		if (ftrace_match_module_record(rec, mod, -					       search, search_len, type)) { -			if (not) -				rec->flags &= ~flag; -			else -				rec->flags |= flag; -			found = 1; -		} -		if (enable && (rec->flags & FTRACE_FL_FILTER)) -			ftrace_filtered = 1; - -	} while_for_each_ftrace_rec(); -	mutex_unlock(&ftrace_lock); - -	return found; +	return match_records(hash, buff, strlen(buff), mod, not);  }  /* @@ -1887,9 +2953,11 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)   */  static int -ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +ftrace_mod_callback(struct ftrace_hash *hash, +		    char *func, char *cmd, char *param, int enable)  {  	char *mod; +	int ret = -EINVAL;  	/*  	 * cmd == 'mod' because we only registered this func @@ -1901,15 +2969,19 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)  	/* we must have a module name */  	if (!param) -		return -EINVAL; +		return ret;  	mod = strsep(¶m, ":");  	if (!strlen(mod)) -		return -EINVAL; +		return ret; -	if (ftrace_match_module_records(func, mod, enable)) -		return 0; -	return -EINVAL; +	ret = ftrace_match_module_records(hash, func, mod); +	if (!ret) +		ret = -EINVAL; +	if (ret < 0) +		return ret; + +	return 0;  }  static struct ftrace_func_command ftrace_mod_cmd = { @@ -1921,14 +2993,13 @@ static int __init ftrace_mod_cmd_init(void)  {  	return register_ftrace_command(&ftrace_mod_cmd);  } -device_initcall(ftrace_mod_cmd_init); +core_initcall(ftrace_mod_cmd_init); -static void -function_trace_probe_call(unsigned long ip, unsigned long parent_ip) +static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, +				      struct ftrace_ops *op, struct pt_regs *pt_regs)  {  	struct ftrace_func_probe *entry;  	struct hlist_head *hhd; -	struct hlist_node *n;  	unsigned long key;  	key = hash_long(ip, FTRACE_HASH_BITS); @@ -1944,7 +3015,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)  	 * on the hash. rcu_read_lock is too dangerous here.  	 
*/  	preempt_disable_notrace(); -	hlist_for_each_entry_rcu(entry, n, hhd, node) { +	hlist_for_each_entry_rcu_notrace(entry, hhd, node) {  		if (entry->ip == ip)  			entry->ops->func(ip, parent_ip, &entry->data);  	} @@ -1954,16 +3025,23 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)  static struct ftrace_ops trace_probe_ops __read_mostly =  {  	.func		= function_trace_probe_call, +	.flags		= FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(trace_probe_ops)  };  static int ftrace_probe_registered;  static void __enable_ftrace_function_probe(void)  { +	int ret;  	int i; -	if (ftrace_probe_registered) +	if (ftrace_probe_registered) { +		/* still need to update the function call sites */ +		if (ftrace_enabled) +			ftrace_run_update_code(FTRACE_UPDATE_CALLS);  		return; +	}  	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {  		struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -1974,8 +3052,8 @@ static void __enable_ftrace_function_probe(void)  	if (i == FTRACE_FUNC_HASHSIZE)  		return; -	__register_ftrace_function(&trace_probe_ops); -	ftrace_startup(0); +	ret = ftrace_startup(&trace_probe_ops, 0); +  	ftrace_probe_registered = 1;  } @@ -1993,34 +3071,33 @@ static void __disable_ftrace_function_probe(void)  	}  	/* no more funcs left */ -	__unregister_ftrace_function(&trace_probe_ops); -	ftrace_shutdown(0); +	ftrace_shutdown(&trace_probe_ops, 0); +  	ftrace_probe_registered = 0;  } -static void ftrace_free_entry_rcu(struct rcu_head *rhp) +static void ftrace_free_entry(struct ftrace_func_probe *entry)  { -	struct ftrace_func_probe *entry = -		container_of(rhp, struct ftrace_func_probe, rcu); -  	if (entry->ops->free) -		entry->ops->free(&entry->data); +		entry->ops->free(entry->ops, entry->ip, &entry->data);  	kfree(entry);  } -  int  register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  			      void *data)  {  	struct ftrace_func_probe *entry; +	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; +	struct ftrace_hash *hash;  	struct ftrace_page *pg;  	struct dyn_ftrace *rec;  	int type, len, not;  	unsigned long key;  	int count = 0;  	char *search; +	int ret;  	type = filter_parse_regex(glob, strlen(glob), &search, ¬);  	len = strlen(search); @@ -2029,13 +3106,24 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	if (WARN_ON(not))  		return -EINVAL; +	mutex_lock(&trace_probe_ops.regex_lock); + +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (!hash) { +		count = -ENOMEM; +		goto out; +	} + +	if (unlikely(ftrace_disabled)) { +		count = -ENODEV; +		goto out; +	} +  	mutex_lock(&ftrace_lock); -	do_for_each_ftrace_rec(pg, rec) { -		if (rec->flags & FTRACE_FL_FAILED) -			continue; +	do_for_each_ftrace_rec(pg, rec) { -		if (!ftrace_match_record(rec, search, len, type)) +		if (!ftrace_match_record(rec, NULL, search, len, type))  			continue;  		entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -2055,14 +3143,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		 * for each function we find. We call the callback  		 * to give the caller an opportunity to do so.  		 
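The probe interface registered through trace_probe_ops is internal to kernel/trace/ (the traceon/traceoff triggers in trace_functions.c are the in-tree users), so a caller would sit next to it. A sketch with made-up my_* names; the callback shapes mirror how ops->func(), ops->init() and ops->free() are invoked in this hunk:

/*
 * Illustration only: count hits on every function matching a glob.
 * The my_* names are hypothetical; struct ftrace_probe_ops comes from
 * kernel/trace/trace.h, so this has to live inside kernel/trace/.
 */
#include "trace.h"

static void my_probe_func(unsigned long ip, unsigned long parent_ip,
			  void **data)
{
	unsigned long *count = (unsigned long *)data;

	(*count)++;		/* per-ip state stored in entry->data */
}

static int my_probe_init(struct ftrace_probe_ops *ops,
			 unsigned long ip, void **data)
{
	*data = (void *)0;	/* every matched ip starts at zero */
	return 0;
}

static struct ftrace_probe_ops my_probe_ops = {
	.func	= my_probe_func,
	.init	= my_probe_init,
};

static int my_attach(void)
{
	/* returns the number of functions hooked, or a negative error */
	return register_ftrace_function_probe("vfs_*", &my_probe_ops, NULL);
}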
*/ -		if (ops->callback) { -			if (ops->callback(rec->ip, &entry->data) < 0) { +		if (ops->init) { +			if (ops->init(ops, rec->ip, &entry->data) < 0) {  				/* caller does not like this func */  				kfree(entry);  				continue;  			}  		} +		ret = enter_record(hash, rec, 0); +		if (ret < 0) { +			kfree(entry); +			count = ret; +			goto out_unlock; +		} +  		entry->ops = ops;  		entry->ip = rec->ip; @@ -2070,10 +3165,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);  	} while_for_each_ftrace_rec(); + +	ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); +	if (ret < 0) +		count = ret; +  	__enable_ftrace_function_probe();   out_unlock:  	mutex_unlock(&ftrace_lock); + out: +	mutex_unlock(&trace_probe_ops.regex_lock); +	free_ftrace_hash(hash);  	return count;  } @@ -2087,8 +3190,13 @@ static void  __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  				  void *data, int flags)  { +	struct ftrace_func_entry *rec_entry;  	struct ftrace_func_probe *entry; -	struct hlist_node *n, *tmp; +	struct ftrace_func_probe *p; +	struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; +	struct list_head free_list; +	struct ftrace_hash *hash; +	struct hlist_node *tmp;  	char str[KSYM_SYMBOL_LEN];  	int type = MATCH_FULL;  	int i, len = 0; @@ -2107,11 +3215,19 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  			return;  	} -	mutex_lock(&ftrace_lock); +	mutex_lock(&trace_probe_ops.regex_lock); + +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (!hash) +		/* Hmm, should report this somehow */ +		goto out_unlock; + +	INIT_LIST_HEAD(&free_list); +  	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {  		struct hlist_head *hhd = &ftrace_func_hash[i]; -		hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { +		hlist_for_each_entry_safe(entry, tmp, hhd, node) {  			/* break up if statements for readability */  			if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) @@ -2128,12 +3244,32 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  					continue;  			} -			hlist_del(&entry->node); -			call_rcu(&entry->rcu, ftrace_free_entry_rcu); +			rec_entry = ftrace_lookup_ip(hash, entry->ip); +			/* It is possible more than one entry had this ip */ +			if (rec_entry) +				free_hash_entry(hash, rec_entry); + +			hlist_del_rcu(&entry->node); +			list_add(&entry->free_list, &free_list);  		}  	} +	mutex_lock(&ftrace_lock);  	__disable_ftrace_function_probe(); +	/* +	 * Remove after the disable is called. Otherwise, if the last +	 * probe is removed, a null hash means *all enabled*. +	 */ +	ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); +	synchronize_sched(); +	list_for_each_entry_safe(entry, p, &free_list, free_list) { +		list_del(&entry->free_list); +		ftrace_free_entry(entry); +	}  	mutex_unlock(&ftrace_lock); +		 + out_unlock: +	mutex_unlock(&trace_probe_ops.regex_lock); +	free_ftrace_hash(hash);  }  void @@ -2158,7 +3294,11 @@ void unregister_ftrace_function_probe_all(char *glob)  static LIST_HEAD(ftrace_commands);  static DEFINE_MUTEX(ftrace_cmd_mutex); -int register_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only register ftrace commands from __init, so mark this + * __init too. 
+ */ +__init int register_ftrace_command(struct ftrace_func_command *cmd)  {  	struct ftrace_func_command *p;  	int ret = 0; @@ -2177,7 +3317,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd)  	return ret;  } -int unregister_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only unregister ftrace commands from __init, so mark + * this __init too. + */ +__init int unregister_ftrace_command(struct ftrace_func_command *cmd)  {  	struct ftrace_func_command *p, *n;  	int ret = -ENODEV; @@ -2196,7 +3340,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)  	return ret;  } -static int ftrace_process_regex(char *buff, int len, int enable) +static int ftrace_process_regex(struct ftrace_hash *hash, +				char *buff, int len, int enable)  {  	char *func, *command, *next = buff;  	struct ftrace_func_command *p; @@ -2205,9 +3350,12 @@ static int ftrace_process_regex(char *buff, int len, int enable)  	func = strsep(&next, ":");  	if (!next) { -		if (ftrace_match_records(func, len, enable)) -			return 0; -		return ret; +		ret = ftrace_match_records(hash, func, len); +		if (!ret) +			ret = -EINVAL; +		if (ret < 0) +			return ret; +		return 0;  	}  	/* command found */ @@ -2217,7 +3365,7 @@ static int ftrace_process_regex(char *buff, int len, int enable)  	mutex_lock(&ftrace_cmd_mutex);  	list_for_each_entry(p, &ftrace_commands, list) {  		if (strcmp(p->name, command) == 0) { -			ret = p->func(func, command, next, enable); +			ret = p->func(hash, func, command, next, enable);  			goto out_unlock;  		}  	} @@ -2238,63 +3386,158 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,  	if (!cnt)  		return 0; -	mutex_lock(&ftrace_regex_lock); -  	if (file->f_mode & FMODE_READ) {  		struct seq_file *m = file->private_data;  		iter = m->private;  	} else  		iter = file->private_data; +	if (unlikely(ftrace_disabled)) +		return -ENODEV; + +	/* iter->hash is a local copy, so we don't need regex_lock */ +  	parser = &iter->parser;  	read = trace_get_user(parser, ubuf, cnt, ppos);  	if (read >= 0 && trace_parser_loaded(parser) &&  	    !trace_parser_cont(parser)) { -		ret = ftrace_process_regex(parser->buffer, +		ret = ftrace_process_regex(iter->hash, parser->buffer,  					   parser->idx, enable);  		trace_parser_clear(parser); -		if (ret) -			goto out_unlock; +		if (ret < 0) +			goto out;  	}  	ret = read; -out_unlock: -	mutex_unlock(&ftrace_regex_lock); - + out:  	return ret;  } -static ssize_t +ssize_t  ftrace_filter_write(struct file *file, const char __user *ubuf,  		    size_t cnt, loff_t *ppos)  {  	return ftrace_regex_write(file, ubuf, cnt, ppos, 1);  } -static ssize_t +ssize_t  ftrace_notrace_write(struct file *file, const char __user *ubuf,  		     size_t cnt, loff_t *ppos)  {  	return ftrace_regex_write(file, ubuf, cnt, ppos, 0);  } -static void -ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +static int +ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +{ +	struct ftrace_func_entry *entry; + +	if (!ftrace_location(ip)) +		return -EINVAL; + +	if (remove) { +		entry = ftrace_lookup_ip(hash, ip); +		if (!entry) +			return -ENOENT; +		free_hash_entry(hash, entry); +		return 0; +	} + +	return add_hash_entry(hash, ip); +} + +static void ftrace_ops_update_code(struct ftrace_ops *ops)  { +	if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) +		ftrace_run_update_code(FTRACE_UPDATE_CALLS); +} + +static int +ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, +		unsigned long ip, int 
remove, int reset, int enable) +{ +	struct ftrace_hash **orig_hash; +	struct ftrace_hash *hash; +	int ret; +  	if (unlikely(ftrace_disabled)) -		return; +		return -ENODEV; + +	mutex_lock(&ops->regex_lock); + +	if (enable) +		orig_hash = &ops->filter_hash; +	else +		orig_hash = &ops->notrace_hash; + +	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (!hash) { +		ret = -ENOMEM; +		goto out_regex_unlock; +	} -	mutex_lock(&ftrace_regex_lock);  	if (reset) -		ftrace_filter_reset(enable); -	if (buf) -		ftrace_match_records(buf, len, enable); -	mutex_unlock(&ftrace_regex_lock); +		ftrace_filter_reset(hash); +	if (buf && !ftrace_match_records(hash, buf, len)) { +		ret = -EINVAL; +		goto out_regex_unlock; +	} +	if (ip) { +		ret = ftrace_match_addr(hash, ip, remove); +		if (ret < 0) +			goto out_regex_unlock; +	} + +	mutex_lock(&ftrace_lock); +	ret = ftrace_hash_move(ops, enable, orig_hash, hash); +	if (!ret) +		ftrace_ops_update_code(ops); + +	mutex_unlock(&ftrace_lock); + + out_regex_unlock: +	mutex_unlock(&ops->regex_lock); + +	free_ftrace_hash(hash); +	return ret; +} + +static int +ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, +		int reset, int enable) +{ +	return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); +} + +/** + * ftrace_set_filter_ip - set a function to filter on in ftrace by address + * @ops - the ops to set the filter with + * @ip - the address to add to or remove from the filter. + * @remove - non zero to remove the ip from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ip is NULL, it failes to update filter. + */ +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, +			 int remove, int reset) +{ +	ftrace_ops_init(ops); +	return ftrace_set_addr(ops, ip, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, +		 int reset, int enable) +{ +	return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);  }  /**   * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with   * @buf - the string that holds the function filter text.   * @len - the length of the string.   * @reset - non zero to reset all filters before applying this filter. @@ -2302,13 +3545,17 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)   * Filters denote which functions should be enabled when tracing is enabled.   * If @buf is NULL and reset is set, all functions will be enabled for tracing.   */ -void ftrace_set_filter(unsigned char *buf, int len, int reset) +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +		       int len, int reset)  { -	ftrace_set_regex(buf, len, reset, 1); +	ftrace_ops_init(ops); +	return ftrace_set_regex(ops, buf, len, reset, 1);  } +EXPORT_SYMBOL_GPL(ftrace_set_filter);  /**   * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with   * @buf - the string that holds the function notrace text.   * @len - the length of the string.   * @reset - non zero to reset all filters before applying this filter. @@ -2317,10 +3564,43 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)   * is enabled. If @buf is NULL and reset is set, all functions will be enabled   * for tracing.   
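With each ftrace_ops now carrying its own filter and notrace hashes, a caller configures the hashes on its ops before registering it. A minimal sketch of a module using the exported helpers above; everything prefixed my_ is a placeholder, and the four-argument callback signature matches the one used elsewhere in this patch:

/*
 * Minimal sketch of a per-ops filter user.  Everything prefixed my_
 * is made up; ftrace_set_filter(), ftrace_set_filter_ip() and
 * register_ftrace_function() are the real exported interfaces.
 */
#include <linux/ftrace.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/string.h>

static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct pt_regs *regs)
{
	/* called for every traced function that passes my_ops' hashes */
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static int __init my_init(void)
{
	unsigned long ip;
	int ret;

	/* trace only functions matching this glob (reset any old filter) */
	ret = ftrace_set_filter(&my_ops, (unsigned char *)"kmalloc*",
				strlen("kmalloc*"), 1);
	if (ret)
		return ret;

	/* or pin the filter to one resolved address (remove=0, reset=0) */
	ip = kallsyms_lookup_name("schedule");
	if (ip)
		ftrace_set_filter_ip(&my_ops, ip, 0, 0);

	return register_ftrace_function(&my_ops);
}

static void __exit my_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");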
*/ -void ftrace_set_notrace(unsigned char *buf, int len, int reset) +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +			int len, int reset) +{ +	ftrace_ops_init(ops); +	return ftrace_set_regex(ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_notrace); +/** + * ftrace_set_global_filter - set a function to filter on with global tracers + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_global_filter(unsigned char *buf, int len, int reset)  { -	ftrace_set_regex(buf, len, reset, 0); +	ftrace_set_regex(&global_ops, buf, len, reset, 1);  } +EXPORT_SYMBOL_GPL(ftrace_set_global_filter); + +/** + * ftrace_set_global_notrace - set a function to not trace with global tracers + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) +{ +	ftrace_set_regex(&global_ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);  /*   * command line interface to allow users to set filters on boot up. @@ -2329,23 +3609,28 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)  static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;  static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; +  static int __init set_ftrace_notrace(char *str)  { -	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); +	ftrace_filter_param = true; +	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);  	return 1;  }  __setup("ftrace_notrace=", set_ftrace_notrace);  static int __init set_ftrace_filter(char *str)  { -	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); +	ftrace_filter_param = true; +	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);  	return 1;  }  __setup("ftrace_filter=", set_ftrace_filter);  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);  static int __init set_graph_function(char *str)  { @@ -2363,7 +3648,7 @@ static void __init set_ftrace_early_graph(char *buf)  		func = strsep(&buf, ",");  		/* we allow only one expression at a time */  		ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, -				      func); +				      FTRACE_GRAPH_MAX_FUNCS, func);  		if (ret)  			printk(KERN_DEBUG "ftrace: function %s not "  					  "traceable\n", func); @@ -2371,39 +3656,42 @@ static void __init set_ftrace_early_graph(char *buf)  }  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init set_ftrace_early_filter(char *buf, int enable) +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)  {  	char *func; +	ftrace_ops_init(ops); +  	while (buf) {  		func = strsep(&buf, ","); -		ftrace_set_regex(func, strlen(func), 0, enable); +		
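For readers following the API change above: ftrace_set_filter(), ftrace_set_notrace() and ftrace_set_filter_ip() now take the ftrace_ops whose hashes are being edited and report errors. The following is a minimal sketch of a caller in module context; the callback body, the ops flags used and the "vfs_*" pattern are only illustrative, not part of this patch:

#include <linux/ftrace.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>

/* Called for every function that matched the filter below. */
static void my_callback(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
{
        trace_printk("hit %pS called from %pS\n",
                     (void *)ip, (void *)parent_ip);
}

static struct ftrace_ops my_ops = {
        .func   = my_callback,
        .flags  = FTRACE_OPS_FL_RECURSION_SAFE,
};

static char my_pattern[] = "vfs_*";

static int __init my_tracer_init(void)
{
        int ret;

        /* Bind the filter to my_ops only; last argument resets old filters. */
        ret = ftrace_set_filter(&my_ops, my_pattern, strlen(my_pattern), 1);
        if (ret)
                return ret;

        return register_ftrace_function(&my_ops);
}

static void __exit my_tracer_exit(void)
{
        unregister_ftrace_function(&my_ops);
}

module_init(my_tracer_init);
module_exit(my_tracer_exit);
MODULE_LICENSE("GPL");

Only the filtered functions end up calling my_ops; everything else keeps its nop, which is the point of binding the filter to a specific ops instead of a global setting.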
ftrace_set_regex(ops, func, strlen(func), 0, enable);  	}  }  static void __init set_ftrace_early_filters(void)  {  	if (ftrace_filter_buf[0]) -		set_ftrace_early_filter(ftrace_filter_buf, 1); +		ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);  	if (ftrace_notrace_buf[0]) -		set_ftrace_early_filter(ftrace_notrace_buf, 0); +		ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  	if (ftrace_graph_buf[0])  		set_ftrace_early_graph(ftrace_graph_buf);  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */  } -static int -ftrace_regex_release(struct inode *inode, struct file *file, int enable) +int ftrace_regex_release(struct inode *inode, struct file *file)  {  	struct seq_file *m = (struct seq_file *)file->private_data;  	struct ftrace_iterator *iter; +	struct ftrace_hash **orig_hash;  	struct trace_parser *parser; +	int filter_hash; +	int ret; -	mutex_lock(&ftrace_regex_lock);  	if (file->f_mode & FMODE_READ) {  		iter = m->private; -  		seq_release(inode, file);  	} else  		iter = file->private_data; @@ -2411,31 +3699,35 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)  	parser = &iter->parser;  	if (trace_parser_loaded(parser)) {  		parser->buffer[parser->idx] = 0; -		ftrace_match_records(parser->buffer, parser->idx, enable); +		ftrace_match_records(iter->hash, parser->buffer, parser->idx);  	} -	mutex_lock(&ftrace_lock); -	if (ftrace_start_up && ftrace_enabled) -		ftrace_run_update_code(FTRACE_ENABLE_CALLS); -	mutex_unlock(&ftrace_lock); -  	trace_parser_put(parser); -	kfree(iter); -	mutex_unlock(&ftrace_regex_lock); -	return 0; -} +	mutex_lock(&iter->ops->regex_lock); -static int -ftrace_filter_release(struct inode *inode, struct file *file) -{ -	return ftrace_regex_release(inode, file, 1); -} +	if (file->f_mode & FMODE_WRITE) { +		filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); -static int -ftrace_notrace_release(struct inode *inode, struct file *file) -{ -	return ftrace_regex_release(inode, file, 0); +		if (filter_hash) +			orig_hash = &iter->ops->filter_hash; +		else +			orig_hash = &iter->ops->notrace_hash; + +		mutex_lock(&ftrace_lock); +		ret = ftrace_hash_move(iter->ops, filter_hash, +				       orig_hash, iter->hash); +		if (!ret) +			ftrace_ops_update_code(iter->ops); + +		mutex_unlock(&ftrace_lock); +	} + +	mutex_unlock(&iter->ops->regex_lock); +	free_ftrace_hash(iter->hash); +	kfree(iter); + +	return 0;  }  static const struct file_operations ftrace_avail_fops = { @@ -2445,8 +3737,8 @@ static const struct file_operations ftrace_avail_fops = {  	.release = seq_release_private,  }; -static const struct file_operations ftrace_failures_fops = { -	.open = ftrace_failures_open, +static const struct file_operations ftrace_enabled_fops = { +	.open = ftrace_enabled_open,  	.read = seq_read,  	.llseek = seq_lseek,  	.release = seq_release_private, @@ -2456,16 +3748,16 @@ static const struct file_operations ftrace_filter_fops = {  	.open = ftrace_filter_open,  	.read = seq_read,  	.write = ftrace_filter_write, -	.llseek = ftrace_regex_lseek, -	.release = ftrace_filter_release, +	.llseek = tracing_lseek, +	.release = ftrace_regex_release,  };  static const struct file_operations ftrace_notrace_fops = {  	.open = ftrace_notrace_open,  	.read = seq_read,  	.write = ftrace_notrace_write, -	.llseek = ftrace_regex_lseek, -	.release = ftrace_notrace_release, +	.llseek = tracing_lseek, +	.release = ftrace_regex_release,  };  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -2473,15 +3765,25 @@ static const struct file_operations 
ftrace_notrace_fops = {  static DEFINE_MUTEX(graph_lock);  int ftrace_graph_count; -int ftrace_graph_filter_enabled; +int ftrace_graph_notrace_count;  unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +struct ftrace_graph_data { +	unsigned long *table; +	size_t size; +	int *count; +	const struct seq_operations *seq_ops; +};  static void *  __g_next(struct seq_file *m, loff_t *pos)  { -	if (*pos >= ftrace_graph_count) +	struct ftrace_graph_data *fgd = m->private; + +	if (*pos >= *fgd->count)  		return NULL; -	return &ftrace_graph_funcs[*pos]; +	return &fgd->table[*pos];  }  static void * @@ -2493,10 +3795,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos)  static void *g_start(struct seq_file *m, loff_t *pos)  { +	struct ftrace_graph_data *fgd = m->private; +  	mutex_lock(&graph_lock);  	/* Nothing, tell g_show to print all functions are enabled */ -	if (!ftrace_graph_filter_enabled && !*pos) +	if (!*fgd->count && !*pos)  		return (void *)1;  	return __g_next(m, pos); @@ -2532,38 +3836,88 @@ static const struct seq_operations ftrace_graph_seq_ops = {  };  static int -ftrace_graph_open(struct inode *inode, struct file *file) +__ftrace_graph_open(struct inode *inode, struct file *file, +		    struct ftrace_graph_data *fgd)  {  	int ret = 0; -	if (unlikely(ftrace_disabled)) -		return -ENODEV; -  	mutex_lock(&graph_lock);  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) { -		ftrace_graph_filter_enabled = 0; -		ftrace_graph_count = 0; -		memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); +		*fgd->count = 0; +		memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));  	}  	mutex_unlock(&graph_lock); -	if (file->f_mode & FMODE_READ) -		ret = seq_open(file, &ftrace_graph_seq_ops); +	if (file->f_mode & FMODE_READ) { +		ret = seq_open(file, fgd->seq_ops); +		if (!ret) { +			struct seq_file *m = file->private_data; +			m->private = fgd; +		} +	} else +		file->private_data = fgd;  	return ret;  }  static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ +	struct ftrace_graph_data *fgd; + +	if (unlikely(ftrace_disabled)) +		return -ENODEV; + +	fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); +	if (fgd == NULL) +		return -ENOMEM; + +	fgd->table = ftrace_graph_funcs; +	fgd->size = FTRACE_GRAPH_MAX_FUNCS; +	fgd->count = &ftrace_graph_count; +	fgd->seq_ops = &ftrace_graph_seq_ops; + +	return __ftrace_graph_open(inode, file, fgd); +} + +static int +ftrace_graph_notrace_open(struct inode *inode, struct file *file) +{ +	struct ftrace_graph_data *fgd; + +	if (unlikely(ftrace_disabled)) +		return -ENODEV; + +	fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); +	if (fgd == NULL) +		return -ENOMEM; + +	fgd->table = ftrace_graph_notrace_funcs; +	fgd->size = FTRACE_GRAPH_MAX_FUNCS; +	fgd->count = &ftrace_graph_notrace_count; +	fgd->seq_ops = &ftrace_graph_seq_ops; + +	return __ftrace_graph_open(inode, file, fgd); +} + +static int  ftrace_graph_release(struct inode *inode, struct file *file)  { -	if (file->f_mode & FMODE_READ) +	if (file->f_mode & FMODE_READ) { +		struct seq_file *m = file->private_data; + +		kfree(m->private);  		seq_release(inode, file); +	} else { +		kfree(file->private_data); +	} +  	return 0;  }  static int -ftrace_set_func(unsigned long *array, int *idx, char *buffer) +ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)  {  	struct dyn_ftrace *rec;  	struct ftrace_page *pg; @@ -2574,23 +3928,23 @@ ftrace_set_func(unsigned long *array, int 
*idx, char *buffer)  	bool exists;  	int i; -	if (ftrace_disabled) -		return -ENODEV; -  	/* decode regex */  	type = filter_parse_regex(buffer, strlen(buffer), &search, &not); -	if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) +	if (!not && *idx >= size)  		return -EBUSY;  	search_len = strlen(search);  	mutex_lock(&ftrace_lock); -	do_for_each_ftrace_rec(pg, rec) { -		if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) -			continue; +	if (unlikely(ftrace_disabled)) { +		mutex_unlock(&ftrace_lock); +		return -ENODEV; +	} + +	do_for_each_ftrace_rec(pg, rec) { -		if (ftrace_match_record(rec, search, search_len, type)) { +		if (ftrace_match_record(rec, NULL, search, search_len, type)) {  			/* if it is in the array */  			exists = false;  			for (i = 0; i < *idx; i++) { @@ -2604,7 +3958,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)  				fail = 0;  				if (!exists) {  					array[(*idx)++] = rec->ip; -					if (*idx >= FTRACE_GRAPH_MAX_FUNCS) +					if (*idx >= size)  						goto out;  				}  			} else { @@ -2622,7 +3976,6 @@ out:  	if (fail)  		return -EINVAL; -	ftrace_graph_filter_enabled = 1;  	return 0;  } @@ -2631,36 +3984,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  {  	struct trace_parser parser; -	ssize_t read, ret; +	ssize_t read, ret = 0; +	struct ftrace_graph_data *fgd = file->private_data;  	if (!cnt)  		return 0; -	mutex_lock(&graph_lock); - -	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { -		ret = -ENOMEM; -		goto out_unlock; -	} +	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) +		return -ENOMEM;  	read = trace_get_user(&parser, ubuf, cnt, ppos);  	if (read >= 0 && trace_parser_loaded((&parser))) {  		parser.buffer[parser.idx] = 0; +		mutex_lock(&graph_lock); +  		/* we allow only one expression at a time */ -		ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, -					parser.buffer); -		if (ret) -			goto out_free; +		ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, +				      parser.buffer); + +		mutex_unlock(&graph_lock);  	} -	ret = read; +	if (!ret) +		ret = read; -out_free:  	trace_parser_put(&parser); -out_unlock: -	mutex_unlock(&graph_lock);  	return ret;  } @@ -2669,45 +4019,146 @@ static const struct file_operations ftrace_graph_fops = {  	.open		= ftrace_graph_open,  	.read		= seq_read,  	.write		= ftrace_graph_write, +	.llseek		= tracing_lseek, +	.release	= ftrace_graph_release, +}; + +static const struct file_operations ftrace_graph_notrace_fops = { +	.open		= ftrace_graph_notrace_open, +	.read		= seq_read, +	.write		= ftrace_graph_write, +	.llseek		= tracing_lseek,  	.release	= ftrace_graph_release, -	.llseek		= seq_lseek,  };  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +void ftrace_create_filter_files(struct ftrace_ops *ops, +				struct dentry *parent) +{ + +	trace_create_file("set_ftrace_filter", 0644, parent, +			  ops, &ftrace_filter_fops); + +	trace_create_file("set_ftrace_notrace", 0644, parent, +			  ops, &ftrace_notrace_fops); +} + +/* + * The name "destroy_filter_files" is really a misnomer. Although + * in the future it may actually delete the files, this is + * really intended to make sure the ops passed in are disabled + * and that when this function returns, the caller is free to + * free the ops. + * + * The "destroy" name is only to match the "create" name that this + * should be paired with.
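Since ftrace_create_filter_files()/ftrace_destroy_filter_files() are new here, a rough sketch of the intended pairing may help. It assumes code living beside the tracers in kernel/trace (these helpers are not exported), and the ops, callback and dentry names below are placeholders:

/* Sketch only: relies on the local kernel/trace declarations. */
static void instance_callback(unsigned long ip, unsigned long parent_ip,
                              struct ftrace_ops *op, struct pt_regs *regs)
{
        /* a real tracer instance would record something here */
}

static struct ftrace_ops instance_ops = {
        .func   = instance_callback,
        .flags  = FTRACE_OPS_FL_RECURSION_SAFE,
};

/* Expose set_ftrace_filter/set_ftrace_notrace bound to instance_ops. */
static void instance_add_filter_files(struct dentry *parent)
{
        ftrace_create_filter_files(&instance_ops, parent);
}

/*
 * Note the asymmetry the comment above describes: this does not delete
 * the files, it shuts instance_ops down and marks it DELETED so the
 * caller may free it; the files go away with the parent directory.
 */
static void instance_remove(void)
{
        ftrace_destroy_filter_files(&instance_ops);
}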
+ */ +void ftrace_destroy_filter_files(struct ftrace_ops *ops) +{ +	mutex_lock(&ftrace_lock); +	if (ops->flags & FTRACE_OPS_FL_ENABLED) +		ftrace_shutdown(ops, 0); +	ops->flags |= FTRACE_OPS_FL_DELETED; +	mutex_unlock(&ftrace_lock); +} +  static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)  {  	trace_create_file("available_filter_functions", 0444,  			d_tracer, NULL, &ftrace_avail_fops); -	trace_create_file("failures", 0444, -			d_tracer, NULL, &ftrace_failures_fops); - -	trace_create_file("set_ftrace_filter", 0644, d_tracer, -			NULL, &ftrace_filter_fops); +	trace_create_file("enabled_functions", 0444, +			d_tracer, NULL, &ftrace_enabled_fops); -	trace_create_file("set_ftrace_notrace", 0644, d_tracer, -				    NULL, &ftrace_notrace_fops); +	ftrace_create_filter_files(&global_ops, d_tracer);  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  	trace_create_file("set_graph_function", 0444, d_tracer,  				    NULL,  				    &ftrace_graph_fops); +	trace_create_file("set_graph_notrace", 0444, d_tracer, +				    NULL, +				    &ftrace_graph_notrace_fops);  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */  	return 0;  } +static int ftrace_cmp_ips(const void *a, const void *b) +{ +	const unsigned long *ipa = a; +	const unsigned long *ipb = b; + +	if (*ipa > *ipb) +		return 1; +	if (*ipa < *ipb) +		return -1; +	return 0; +} + +static void ftrace_swap_ips(void *a, void *b, int size) +{ +	unsigned long *ipa = a; +	unsigned long *ipb = b; +	unsigned long t; + +	t = *ipa; +	*ipa = *ipb; +	*ipb = t; +} +  static int ftrace_process_locs(struct module *mod,  			       unsigned long *start,  			       unsigned long *end)  { +	struct ftrace_page *start_pg; +	struct ftrace_page *pg; +	struct dyn_ftrace *rec; +	unsigned long count;  	unsigned long *p;  	unsigned long addr; -	unsigned long flags; +	unsigned long flags = 0; /* Shut up gcc */ +	int ret = -ENOMEM; + +	count = end - start; + +	if (!count) +		return 0; + +	sort(start, count, sizeof(*start), +	     ftrace_cmp_ips, ftrace_swap_ips); + +	start_pg = ftrace_allocate_pages(count); +	if (!start_pg) +		return -ENOMEM;  	mutex_lock(&ftrace_lock); + +	/* +	 * Core and each module needs their own pages, as +	 * modules will free them when they are removed. +	 * Force a new page to be allocated for modules. +	 */ +	if (!mod) { +		WARN_ON(ftrace_pages || ftrace_pages_start); +		/* First initialization */ +		ftrace_pages = ftrace_pages_start = start_pg; +	} else { +		if (!ftrace_pages) +			goto out; + +		if (WARN_ON(ftrace_pages->next)) { +			/* Hmm, we have free pages? */ +			while (ftrace_pages->next) +				ftrace_pages = ftrace_pages->next; +		} + +		ftrace_pages->next = start_pg; +	} +  	p = start; +	pg = start_pg;  	while (p < end) {  		addr = ftrace_call_adjust(*p++);  		/* @@ -2718,38 +4169,87 @@ static int ftrace_process_locs(struct module *mod,  		 */  		if (!addr)  			continue; -		ftrace_record_ip(addr); + +		if (pg->index == pg->size) { +			/* We should have allocated enough */ +			if (WARN_ON(!pg->next)) +				break; +			pg = pg->next; +		} + +		rec = &pg->records[pg->index++]; +		rec->ip = addr;  	} -	/* disable interrupts to prevent kstop machine */ -	local_irq_save(flags); -	ftrace_update_code(mod); -	local_irq_restore(flags); +	/* We should have used all pages */ +	WARN_ON(pg->next); + +	/* Assign the last page to ftrace_pages */ +	ftrace_pages = pg; + +	/* +	 * We only need to disable interrupts on start up +	 * because we are modifying code that an interrupt +	 * may execute, and the modification is not atomic. 
+	 * But for modules, nothing runs the code we modify +	 * until we are finished with it, and there's no +	 * reason to cause large interrupt latencies while we do it. +	 */ +	if (!mod) +		local_irq_save(flags); +	ftrace_update_code(mod, start_pg); +	if (!mod) +		local_irq_restore(flags); +	ret = 0; + out:  	mutex_unlock(&ftrace_lock); -	return 0; +	return ret;  }  #ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) +  void ftrace_release_mod(struct module *mod)  {  	struct dyn_ftrace *rec; +	struct ftrace_page **last_pg;  	struct ftrace_page *pg; +	int order; + +	mutex_lock(&ftrace_lock);  	if (ftrace_disabled) -		return; +		goto out_unlock; -	mutex_lock(&ftrace_lock); -	do_for_each_ftrace_rec(pg, rec) { +	/* +	 * Each module has its own ftrace_pages, remove +	 * them from the list. +	 */ +	last_pg = &ftrace_pages_start; +	for (pg = ftrace_pages_start; pg; pg = *last_pg) { +		rec = &pg->records[0];  		if (within_module_core(rec->ip, mod)) {  			/* -			 * rec->ip is changed in ftrace_free_rec() -			 * It should not between s and e if record was freed. +			 * As core pages are first, the first +			 * page should never be a module page.  			 */ -			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); -			ftrace_free_rec(rec); -		} -	} while_for_each_ftrace_rec(); +			if (WARN_ON(pg == ftrace_pages_start)) +				goto out_unlock; + +			/* Check if we are deleting the last page */ +			if (pg == ftrace_pages) +				ftrace_pages = next_to_ftrace_page(last_pg); + +			*last_pg = pg->next; +			order = get_count_order(pg->size / ENTRIES_PER_PAGE); +			free_pages((unsigned long)pg->records, order); +			kfree(pg); +		} else +			last_pg = &pg->next; +	} + out_unlock:  	mutex_unlock(&ftrace_lock);  } @@ -2761,61 +4261,57 @@ static void ftrace_init_module(struct module *mod,  	ftrace_process_locs(mod, start, end);  } -static int ftrace_module_notify(struct notifier_block *self, -				unsigned long val, void *data) +void ftrace_module_init(struct module *mod) +{ +	ftrace_init_module(mod, mod->ftrace_callsites, +			   mod->ftrace_callsites + +			   mod->num_ftrace_callsites); +} + +static int ftrace_module_notify_exit(struct notifier_block *self, +				     unsigned long val, void *data)  {  	struct module *mod = data; -	switch (val) { -	case MODULE_STATE_COMING: -		ftrace_init_module(mod, mod->ftrace_callsites, -				   mod->ftrace_callsites + -				   mod->num_ftrace_callsites); -		break; -	case MODULE_STATE_GOING: +	if (val == MODULE_STATE_GOING)  		ftrace_release_mod(mod); -		break; -	}  	return 0;  }  #else -static int ftrace_module_notify(struct notifier_block *self, -				unsigned long val, void *data) +static int ftrace_module_notify_exit(struct notifier_block *self, +				     unsigned long val, void *data)  {  	return 0;  }  #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { -	.notifier_call = ftrace_module_notify, -	.priority = 0, +struct notifier_block ftrace_module_exit_nb = { +	.notifier_call = ftrace_module_notify_exit, +	.priority = INT_MIN,	/* Run after anything that can remove kprobes */  }; -extern unsigned long __start_mcount_loc[]; -extern unsigned long __stop_mcount_loc[]; -  void __init ftrace_init(void)  { -	unsigned long count, addr, flags; +	extern unsigned long __start_mcount_loc[]; +	extern unsigned long __stop_mcount_loc[]; +	unsigned long count, flags;  	int ret; -	/* Keep the ftrace pointer to the stub */ -	addr = (unsigned long)ftrace_stub; -  	local_irq_save(flags); -	ftrace_dyn_arch_init(&addr); +	ret = 
ftrace_dyn_arch_init();  	local_irq_restore(flags); - -	/* ftrace_dyn_arch_init places the return code in addr */ -	if (addr) +	if (ret)  		goto failed;  	count = __stop_mcount_loc - __start_mcount_loc; - -	ret = ftrace_dyn_table_alloc(count); -	if (ret) +	if (!count) { +		pr_info("ftrace: No functions to be traced?\n");  		goto failed; +	} + +	pr_info("ftrace: allocating %ld entries in %ld pages\n", +		count, count / ENTRIES_PER_PAGE + 1);  	last_ftrace_enabled = ftrace_enabled = 1; @@ -2823,9 +4319,9 @@ void __init ftrace_init(void)  				  __start_mcount_loc,  				  __stop_mcount_loc); -	ret = register_module_notifier(&ftrace_module_nb); +	ret = register_module_notifier(&ftrace_module_exit_nb);  	if (ret) -		pr_warning("Failed to register trace ftrace module notifier\n"); +		pr_warning("Failed to register trace ftrace module exit notifier\n");  	set_ftrace_early_filters(); @@ -2836,22 +4332,174 @@ void __init ftrace_init(void)  #else +static struct ftrace_ops global_ops = { +	.func			= ftrace_stub, +	.flags			= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(global_ops) +}; +  static int __init ftrace_nodyn_init(void)  {  	ftrace_enabled = 1;  	return 0;  } -device_initcall(ftrace_nodyn_init); +core_initcall(ftrace_nodyn_init);  static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }  static inline void ftrace_startup_enable(int command) { }  /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(command)	do { } while (0) -# define ftrace_shutdown(command)	do { } while (0) +# define ftrace_startup(ops, command)					\ +	({								\ +		int ___ret = __register_ftrace_function(ops);		\ +		if (!___ret)						\ +			(ops)->flags |= FTRACE_OPS_FL_ENABLED;		\ +		___ret;							\ +	}) +# define ftrace_shutdown(ops, command)					\ +	({								\ +		int ___ret = __unregister_ftrace_function(ops);		\ +		if (!___ret)						\ +			(ops)->flags &= ~FTRACE_OPS_FL_ENABLED;		\ +		___ret;							\ +	}) +  # define ftrace_startup_sysctl()	do { } while (0)  # define ftrace_shutdown_sysctl()	do { } while (0) + +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) +{ +	return 1; +} +  #endif /* CONFIG_DYNAMIC_FTRACE */ +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ +	tr->ops = &global_ops; +	tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ +	/* If we filter on pids, update to use the pid function */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { +		if (WARN_ON(tr->ops->func != ftrace_stub)) +			printk("ftrace ops had %pS for function\n", +			       tr->ops->func); +		/* Only the top level instance does pid tracing */ +		if (!list_empty(&ftrace_pids)) { +			set_ftrace_pid_function(func); +			func = ftrace_pid_func; +		} +	} +	tr->ops->func = func; +	tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ +	tr->ops->func = ftrace_stub; +} + +static void +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, +			struct ftrace_ops *op, struct pt_regs *regs) +{ +	if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) +		return; + +	/* +	 * Some of the ops may be dynamically allocated, +	 * they must be freed after a synchronize_sched(). +	 */ +	preempt_disable_notrace(); +	trace_recursion_set(TRACE_CONTROL_BIT); + +	/* +	 * Control funcs (perf) uses RCU. Only trace if +	 * RCU is currently active. 
+	 */ +	if (!rcu_is_watching()) +		goto out; + +	do_for_each_ftrace_op(op, ftrace_control_list) { +		if (!(op->flags & FTRACE_OPS_FL_STUB) && +		    !ftrace_function_local_disabled(op) && +		    ftrace_ops_test(op, ip, regs)) +			op->func(ip, parent_ip, op, regs); +	} while_for_each_ftrace_op(op); + out: +	trace_recursion_clear(TRACE_CONTROL_BIT); +	preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { +	.func	= ftrace_ops_control_func, +	.flags	= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, +	INIT_REGEX_LOCK(control_ops) +}; + +static inline void +__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, +		       struct ftrace_ops *ignored, struct pt_regs *regs) +{ +	struct ftrace_ops *op; +	int bit; + +	if (function_trace_stop) +		return; + +	bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); +	if (bit < 0) +		return; + +	/* +	 * Some of the ops may be dynamically allocated, +	 * they must be freed after a synchronize_sched(). +	 */ +	preempt_disable_notrace(); +	do_for_each_ftrace_op(op, ftrace_ops_list) { +		if (ftrace_ops_test(op, ip, regs)) { +			if (WARN_ON(!op->func)) { +				function_trace_stop = 1; +				printk("op=%p %pS\n", op, op); +				goto out; +			} +			op->func(ip, parent_ip, op, regs); +		} +	} while_for_each_ftrace_op(op); +out: +	preempt_enable_notrace(); +	trace_clear_recursion(bit); +} + +/* + * Some archs only support passing ip and parent_ip. Even though + * the list function ignores the op parameter, we do not want any + * C side effects, where a function is called without the caller + * sending a third parameter. + * Archs are to support both the regs and ftrace_ops at the same time. + * If they support ftrace_ops, it is assumed they support regs. + * If callbacks want to use regs, they must either check for regs + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. + * An architecture can pass partial regs with ftrace_ops and still + * set ARCH_SUPPORTS_FTRACE_OPS. + */ +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, +				 struct ftrace_ops *op, struct pt_regs *regs) +{ +	__ftrace_ops_list_func(ip, parent_ip, NULL, regs); +} +#else +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +{ +	__ftrace_ops_list_func(ip, parent_ip, NULL, NULL); +} +#endif +  static void clear_ftrace_swapper(void)  {  	struct task_struct *p; @@ -3072,7 +4720,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,  	if (strlen(tmp) == 0)  		return 1; -	ret = strict_strtol(tmp, 10, &val); +	ret = kstrtol(tmp, 10, &val);  	if (ret < 0)  		return ret; @@ -3094,7 +4742,7 @@ static const struct file_operations ftrace_pid_fops = {  	.open		= ftrace_pid_open,  	.write		= ftrace_pid_write,  	.read		= seq_read, -	.llseek		= seq_lseek, +	.llseek		= tracing_lseek,  	.release	= ftrace_pid_release,  }; @@ -3132,6 +4780,14 @@ void ftrace_kill(void)  }  /** + * Test if ftrace is dead or not. + */ +int ftrace_is_dead(void) +{ +	return ftrace_disabled; +} + +/**   * register_ftrace_function - register a function for profiling   * @ops - ops structure that holds the function for profiling.
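A note on the regs rules spelled out above, with a small sketch: a callback that wants pt_regs should request full register saving (the FTRACE_OPS_FL_SAVE_REGS flag that accompanies DYNAMIC_FTRACE_WITH_REGS) and still treat a NULL regs as possible when the architecture cannot supply them. The names below are illustrative:

#include <linux/ftrace.h>
#include <linux/ptrace.h>

/*
 * Illustrative only: regs may be NULL unless the arch provides
 * CONFIG_DYNAMIC_FTRACE_WITH_REGS and the ops asked for full regs.
 */
static void regs_aware_callback(unsigned long ip, unsigned long parent_ip,
                                struct ftrace_ops *op, struct pt_regs *regs)
{
        if (!regs)
                return;

        trace_printk("traced %pS, pc=%lx\n",
                     (void *)ip, instruction_pointer(regs));
}

static struct ftrace_ops regs_ops = {
        .func   = regs_aware_callback,
        .flags  = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_RECURSION_SAFE,
};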
* @@ -3144,19 +4800,19 @@ void ftrace_kill(void)   */  int register_ftrace_function(struct ftrace_ops *ops)  { -	int ret; +	int ret = -1; -	if (unlikely(ftrace_disabled)) -		return -1; +	ftrace_ops_init(ops);  	mutex_lock(&ftrace_lock); -	ret = __register_ftrace_function(ops); -	ftrace_startup(0); +	ret = ftrace_startup(ops, 0);  	mutex_unlock(&ftrace_lock); +  	return ret;  } +EXPORT_SYMBOL_GPL(register_ftrace_function);  /**   * unregister_ftrace_function - unregister a function for profiling. @@ -3169,26 +4825,26 @@ int unregister_ftrace_function(struct ftrace_ops *ops)  	int ret;  	mutex_lock(&ftrace_lock); -	ret = __unregister_ftrace_function(ops); -	ftrace_shutdown(0); +	ret = ftrace_shutdown(ops, 0);  	mutex_unlock(&ftrace_lock);  	return ret;  } +EXPORT_SYMBOL_GPL(unregister_ftrace_function);  int  ftrace_enable_sysctl(struct ctl_table *table, int write,  		     void __user *buffer, size_t *lenp,  		     loff_t *ppos)  { -	int ret; - -	if (unlikely(ftrace_disabled)) -		return -ENODEV; +	int ret = -ENODEV;  	mutex_lock(&ftrace_lock); -	ret  = proc_dointvec(table, write, buffer, lenp, ppos); +	if (unlikely(ftrace_disabled)) +		goto out; + +	ret = proc_dointvec(table, write, buffer, lenp, ppos);  	if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))  		goto out; @@ -3200,12 +4856,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  		ftrace_startup_sysctl();  		/* we are starting ftrace again */ -		if (ftrace_list != &ftrace_list_end) { -			if (ftrace_list->next == &ftrace_list_end) -				ftrace_trace_function = ftrace_list->func; -			else -				ftrace_trace_function = ftrace_list_func; -		} +		if (ftrace_ops_list != &ftrace_list_end) +			update_ftrace_function();  	} else {  		/* stopping ftrace calls (just send to ftrace_stub) */ @@ -3222,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static int ftrace_graph_active; -static struct notifier_block ftrace_suspend_notifier;  int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)  { @@ -3233,6 +4884,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)  trace_func_graph_ret_t ftrace_graph_return =  			(trace_func_graph_ret_t)ftrace_stub;  trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;  /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */  static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -3328,7 +4980,7 @@ static int start_graph_tracing(void)  	/* The cpu_boot init_task->ret_stack will never be freed */  	for_each_online_cpu(cpu) {  		if (!idle_task(cpu)->ret_stack) -			ftrace_graph_init_task(idle_task(cpu)); +			ftrace_graph_init_idle_task(idle_task(cpu), cpu);  	}  	do { @@ -3367,6 +5019,34 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,  	return NOTIFY_DONE;  } +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ +	if (!ftrace_ops_test(&global_ops, trace->func, NULL)) +		return 0; +	return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. 
+ */ +static void update_function_graph_func(void) +{ +	if (ftrace_ops_list == &ftrace_list_end || +	    (ftrace_ops_list == &global_ops && +	     global_ops.next == &ftrace_list_end)) +		ftrace_graph_entry = __ftrace_graph_entry; +	else +		ftrace_graph_entry = ftrace_graph_entry_test; +} + +static struct notifier_block ftrace_suspend_notifier = { +	.notifier_call = ftrace_suspend_notifier_call, +}; +  int register_ftrace_graph(trace_func_graph_ret_t retfunc,  			trace_func_graph_ent_t entryfunc)  { @@ -3380,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,  		goto out;  	} -	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;  	register_pm_notifier(&ftrace_suspend_notifier);  	ftrace_graph_active++; @@ -3391,9 +5070,21 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,  	}  	ftrace_graph_return = retfunc; -	ftrace_graph_entry = entryfunc; -	ftrace_startup(FTRACE_START_FUNC_RET); +	/* +	 * Update the indirect function to the entryfunc, and the +	 * function that gets called to the entry_test first. Then +	 * call the update fgraph entry function to determine if +	 * the entryfunc should be called directly or not. +	 */ +	__ftrace_graph_entry = entryfunc; +	ftrace_graph_entry = ftrace_graph_entry_test; +	update_function_graph_func(); + +	/* Function graph doesn't use the .func field of global_ops */ +	global_ops.flags |= FTRACE_OPS_FL_STUB; + +	ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);  out:  	mutex_unlock(&ftrace_lock); @@ -3410,7 +5101,9 @@ void unregister_ftrace_graph(void)  	ftrace_graph_active--;  	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;  	ftrace_graph_entry = ftrace_graph_entry_stub; -	ftrace_shutdown(FTRACE_STOP_FUNC_RET); +	__ftrace_graph_entry = ftrace_graph_entry_stub; +	ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); +	global_ops.flags &= ~FTRACE_OPS_FL_STUB;  	unregister_pm_notifier(&ftrace_suspend_notifier);  	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); @@ -3418,6 +5111,49 @@ void unregister_ftrace_graph(void)  	mutex_unlock(&ftrace_lock);  } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ +	atomic_set(&t->tracing_graph_pause, 0); +	atomic_set(&t->trace_overrun, 0); +	t->ftrace_timestamp = 0; +	/* make curr_ret_stack visible before we add the ret_stack */ +	smp_wmb(); +	t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ +	t->curr_ret_stack = -1; +	/* +	 * The idle task has no parent, it either has its own +	 * stack or no stack at all. 
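The entry/return pair that register_ftrace_graph() wires up above looks like the following in a user. This is a hedged sketch for in-tree code (the graph registration interface is not exported to modules), and the handler bodies are illustrative:

#include <linux/ftrace.h>
#include <linux/kernel.h>

/* Entry handler: return 0 to skip tracing this call and its children. */
static int example_graph_entry(struct ftrace_graph_ent *trace)
{
        return 1;
}

/* Return handler: runs when the traced function returns. */
static void example_graph_return(struct ftrace_graph_ret *trace)
{
        trace_printk("depth %d: %pS took %llu ns\n", trace->depth,
                     (void *)trace->func,
                     (unsigned long long)(trace->rettime - trace->calltime));
}

static int example_graph_start(void)
{
        /* Note the argument order: return handler first, then entry. */
        return register_ftrace_graph(example_graph_return,
                                     example_graph_entry);
}

static void example_graph_stop(void)
{
        unregister_ftrace_graph();
}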
+	 */ +	if (t->ret_stack) +		WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + +	if (ftrace_graph_active) { +		struct ftrace_ret_stack *ret_stack; + +		ret_stack = per_cpu(idle_ret_stack, cpu); +		if (!ret_stack) { +			ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH +					    * sizeof(struct ftrace_ret_stack), +					    GFP_KERNEL); +			if (!ret_stack) +				return; +			per_cpu(idle_ret_stack, cpu) = ret_stack; +		} +		graph_init_task(t, ret_stack); +	} +} +  /* Allocate a return stack for newly created task */  void ftrace_graph_init_task(struct task_struct *t)  { @@ -3433,12 +5169,7 @@ void ftrace_graph_init_task(struct task_struct *t)  				GFP_KERNEL);  		if (!ret_stack)  			return; -		atomic_set(&t->tracing_graph_pause, 0); -		atomic_set(&t->trace_overrun, 0); -		t->ftrace_timestamp = 0; -		/* make curr_ret_stack visable before we add the ret_stack */ -		smp_wmb(); -		t->ret_stack = ret_stack; +		graph_init_task(t, ret_stack);  	}  } diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a0616..1c71382b283 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,5 +13,5 @@  #define CREATE_TRACE_POINTS  #include <trace/events/power.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ed509a015d..ff7027199a9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,17 +3,21 @@   *   * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>   */ +#include <linux/ftrace_event.h>  #include <linux/ring_buffer.h>  #include <linux/trace_clock.h> -#include <linux/ftrace_irq.h> +#include <linux/trace_seq.h>  #include <linux/spinlock.h> +#include <linux/irq_work.h>  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/hardirq.h> +#include <linux/kthread.h>	/* for self test */  #include <linux/kmemcheck.h>  #include <linux/module.h>  #include <linux/percpu.h>  #include <linux/mutex.h> +#include <linux/delay.h>  #include <linux/slab.h>  #include <linux/init.h>  #include <linux/hash.h> @@ -22,7 +26,8 @@  #include <linux/fs.h>  #include <asm/local.h> -#include "trace.h" + +static void update_pages_handler(struct work_struct *work);  /*   * The ring buffer header is special. We must manually up keep it. @@ -31,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)  {  	int ret; -	ret = trace_seq_printf(s, "# compressed entry header\n"); -	ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n"); -	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n"); -	ret = trace_seq_printf(s, "\tarray       :   32 bits\n"); -	ret = trace_seq_printf(s, "\n"); +	ret = trace_seq_puts(s, "# compressed entry header\n"); +	ret = trace_seq_puts(s, "\ttype_len    :    5 bits\n"); +	ret = trace_seq_puts(s, "\ttime_delta  :   27 bits\n"); +	ret = trace_seq_puts(s, "\tarray       :   32 bits\n"); +	ret = trace_seq_putc(s, '\n');  	ret = trace_seq_printf(s, "\tpadding     : type == %d\n",  			       RINGBUF_TYPE_PADDING);  	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", @@ -155,33 +160,10 @@ enum {  static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) +/* Used for individual buffers (after the counter) */ +#define RB_BUFFER_OFF		(1 << 20) -/** - * tracing_on - enable all tracing buffers - * - * This function enables all tracing buffers that may have been - * disabled with tracing_off. 
- */ -void tracing_on(void) -{ -	set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_on); - -/** - * tracing_off - turn off all tracing buffers - * - * This function stops all tracing buffers from recording data. - * It does not disable any overhead the tracers themselves may - * be causing. This function simply causes all recording to - * the ring buffers to fail. - */ -void tracing_off(void) -{ -	clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_off); +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)  /**   * tracing_off_permanent - permanently disable ring buffers @@ -194,21 +176,12 @@ void tracing_off_permanent(void)  	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);  } -/** - * tracing_is_on - show state of ring buffers enabled - */ -int tracing_is_on(void) -{ -	return ring_buffer_flags == RB_BUFFERS_ON; -} -EXPORT_SYMBOL_GPL(tracing_is_on); -  #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))  #define RB_ALIGNMENT		4U  #define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)  #define RB_EVNT_MIN_SIZE	8U	/* two 32bit words */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS  # define RB_FORCE_8BYTE_ALIGNMENT	0  # define RB_ARCH_ALIGNMENT		RB_ALIGNMENT  #else @@ -216,6 +189,8 @@ EXPORT_SYMBOL_GPL(tracing_is_on);  # define RB_ARCH_ALIGNMENT		8U  #endif +#define RB_ALIGN_DATA		__aligned(RB_ARCH_ALIGNMENT) +  /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */  #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -364,7 +339,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);  struct buffer_data_page {  	u64		 time_stamp;	/* page time stamp */  	local_t		 commit;	/* write committed index */ -	unsigned char	 data[];	/* data of buffer page */ +	unsigned char	 data[] RB_ALIGN_DATA;	/* data of buffer page */  };  /* @@ -472,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)  	return ret;  } +struct rb_irq_work { +	struct irq_work			work; +	wait_queue_head_t		waiters; +	bool				waiters_pending; +}; +  /*   * head_page == tail_page && head == tail then buffer is empty.   
*/ @@ -479,9 +460,10 @@ struct ring_buffer_per_cpu {  	int				cpu;  	atomic_t			record_disabled;  	struct ring_buffer		*buffer; -	spinlock_t			reader_lock;	/* serialize readers */ +	raw_spinlock_t			reader_lock;	/* serialize readers */  	arch_spinlock_t			lock;  	struct lock_class_key		lock_key; +	unsigned int			nr_pages;  	struct list_head		*pages;  	struct buffer_page		*head_page;	/* read from head */  	struct buffer_page		*tail_page;	/* write to tail */ @@ -489,21 +471,31 @@ struct ring_buffer_per_cpu {  	struct buffer_page		*reader_page;  	unsigned long			lost_events;  	unsigned long			last_overrun; -	local_t				commit_overrun; -	local_t				overrun; +	local_t				entries_bytes;  	local_t				entries; +	local_t				overrun; +	local_t				commit_overrun; +	local_t				dropped_events;  	local_t				committing;  	local_t				commits;  	unsigned long			read; +	unsigned long			read_bytes;  	u64				write_stamp;  	u64				read_stamp; +	/* ring buffer pages to update, > 0 to add, < 0 to remove */ +	int				nr_pages_to_update; +	struct list_head		new_pages; /* new pages to add */ +	struct work_struct		update_pages_work; +	struct completion		update_done; + +	struct rb_irq_work		irq_work;  };  struct ring_buffer { -	unsigned			pages;  	unsigned			flags;  	int				cpus;  	atomic_t			record_disabled; +	atomic_t			resize_disabled;  	cpumask_var_t			cpumask;  	struct lock_class_key		*reader_lock_key; @@ -516,6 +508,8 @@ struct ring_buffer {  	struct notifier_block		cpu_notify;  #endif  	u64				(*clock)(void); + +	struct rb_irq_work		irq_work;  };  struct ring_buffer_iter { @@ -527,6 +521,120 @@ struct ring_buffer_iter {  	u64				read_stamp;  }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ +	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + +	wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +int ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	DEFINE_WAIT(wait); +	struct rb_irq_work *work; + +	/* +	 * Depending on what the caller is waiting for, either any +	 * data in any cpu buffer, or a specific buffer, put the +	 * caller on the appropriate wait queue. +	 */ +	if (cpu == RING_BUFFER_ALL_CPUS) +		work = &buffer->irq_work; +	else { +		if (!cpumask_test_cpu(cpu, buffer->cpumask)) +			return -ENODEV; +		cpu_buffer = buffer->buffers[cpu]; +		work = &cpu_buffer->irq_work; +	} + + +	prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + +	/* +	 * The events can happen in critical sections where +	 * checking a work queue can cause deadlocks. +	 * After adding a task to the queue, this flag is set +	 * only to notify events to try to wake up the queue +	 * using irq_work. +	 * +	 * We don't clear it even if the buffer is no longer +	 * empty. The flag only causes the next event to run +	 * irq_work to do the work queue wake up. The worse +	 * that can happen if we race with !trace_empty() is that +	 * an event will cause an irq_work to try to wake up +	 * an empty queue. 
+	 * +	 * There's no reason to protect this flag either, as +	 * the work queue and irq_work logic will do the necessary +	 * synchronization for the wake ups. The only thing +	 * that is necessary is that the wake up happens after +	 * a task has been queued. It's OK for spurious wake ups. +	 */ +	work->waiters_pending = true; + +	if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) +		schedule(); + +	finish_wait(&work->waiters, &wait); +	return 0; +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +			  struct file *filp, poll_table *poll_table) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	struct rb_irq_work *work; + +	if (cpu == RING_BUFFER_ALL_CPUS) +		work = &buffer->irq_work; +	else { +		if (!cpumask_test_cpu(cpu, buffer->cpumask)) +			return -EINVAL; + +		cpu_buffer = buffer->buffers[cpu]; +		work = &cpu_buffer->irq_work; +	} + +	work->waiters_pending = true; +	poll_wait(filp, &work->waiters, poll_table); + +	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || +	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) +		return POLLIN | POLLRDNORM; +	return 0; +} +  /* buffer may be either ring_buffer or ring_buffer_per_cpu */  #define RB_WARN_ON(b, cond)						\  	({								\ @@ -669,7 +777,7 @@ static struct list_head *rb_list_head(struct list_head *list)   * the reader page). But if the next page is a header page,   * its flags will be non zero.   */ -static int inline +static inline int  rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,  		struct buffer_page *page, struct list_head *list)  { @@ -957,7 +1065,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,  }  /** - * check_pages - integrity check of buffer pages + * rb_check_pages - integrity check of buffer pages   * @cpu_buffer: CPU buffer with pages to test   *   * As a safety measure we check to make sure the data pages have not @@ -968,6 +1076,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)  	struct list_head *head = cpu_buffer->pages;  	struct buffer_page *bpage, *tmp; +	/* Reset the head page if it exists */ +	if (cpu_buffer->head_page) +		rb_set_head_page(cpu_buffer); +  	rb_head_page_deactivate(cpu_buffer);  	if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -994,33 +1106,55 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)  	return 0;  } -static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, -			     unsigned nr_pages) +static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)  { +	int i;  	struct buffer_page *bpage, *tmp; -	unsigned long addr; -	LIST_HEAD(pages); -	unsigned i; - -	WARN_ON(!nr_pages);  	for (i = 0; i < nr_pages; i++) { +		struct page *page; +		/* +		 * __GFP_NORETRY flag makes sure that the allocation fails +		 * gracefully without invoking oom-killer and the system is +		 * not destabilized. 
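ring_buffer_wait() above gives consumers a way to block instead of spinning on an empty buffer. A sketch of a consuming reader loop follows; ring_buffer_consume() and ring_buffer_event_data() are existing APIs, while process_event() is a made-up hook for the example:

#include <linux/ring_buffer.h>

/* Made-up consumer hook, only for the sake of the example. */
static void process_event(void *data, u64 ts, unsigned long lost)
{
}

static void consume_cpu(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_event *event;
        unsigned long lost;
        u64 ts;

        for (;;) {
                /* Sleeps until a writer kicks the irq_work wakeup. */
                if (ring_buffer_wait(buffer, cpu))
                        break;

                while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost)))
                        process_event(ring_buffer_event_data(event), ts, lost);
        }
}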
+		 */  		bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), -				    GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); +				    GFP_KERNEL | __GFP_NORETRY, +				    cpu_to_node(cpu));  		if (!bpage)  			goto free_pages; -		rb_check_bpage(cpu_buffer, bpage); - -		list_add(&bpage->list, &pages); +		list_add(&bpage->list, pages); -		addr = __get_free_page(GFP_KERNEL); -		if (!addr) +		page = alloc_pages_node(cpu_to_node(cpu), +					GFP_KERNEL | __GFP_NORETRY, 0); +		if (!page)  			goto free_pages; -		bpage->page = (void *)addr; +		bpage->page = page_address(page);  		rb_init_page(bpage->page);  	} +	return 0; + +free_pages: +	list_for_each_entry_safe(bpage, tmp, pages, list) { +		list_del_init(&bpage->list); +		free_buffer_page(bpage); +	} + +	return -ENOMEM; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, +			     unsigned nr_pages) +{ +	LIST_HEAD(pages); + +	WARN_ON(!nr_pages); + +	if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) +		return -ENOMEM; +  	/*  	 * The ring buffer page list is a circular list that does not  	 * start and end with a list head. All page list items point to @@ -1029,24 +1163,19 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,  	cpu_buffer->pages = pages.next;  	list_del(&pages); +	cpu_buffer->nr_pages = nr_pages; +  	rb_check_pages(cpu_buffer);  	return 0; - - free_pages: -	list_for_each_entry_safe(bpage, tmp, &pages, list) { -		list_del_init(&bpage->list); -		free_buffer_page(bpage); -	} -	return -ENOMEM;  }  static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)  {  	struct ring_buffer_per_cpu *cpu_buffer;  	struct buffer_page *bpage; -	unsigned long addr; +	struct page *page;  	int ret;  	cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), @@ -1056,9 +1185,13 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)  	cpu_buffer->cpu = cpu;  	cpu_buffer->buffer = buffer; -	spin_lock_init(&cpu_buffer->reader_lock); +	raw_spin_lock_init(&cpu_buffer->reader_lock);  	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);  	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +	INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); +	init_completion(&cpu_buffer->update_done); +	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); +	init_waitqueue_head(&cpu_buffer->irq_work.waiters);  	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),  			    GFP_KERNEL, cpu_to_node(cpu)); @@ -1068,15 +1201,16 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)  	rb_check_bpage(cpu_buffer, bpage);  	cpu_buffer->reader_page = bpage; -	addr = __get_free_page(GFP_KERNEL); -	if (!addr) +	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); +	if (!page)  		goto fail_free_reader; -	bpage->page = (void *)addr; +	bpage->page = page_address(page);  	rb_init_page(bpage->page);  	INIT_LIST_HEAD(&cpu_buffer->reader_page->list); +	INIT_LIST_HEAD(&cpu_buffer->new_pages); -	ret = rb_allocate_pages(cpu_buffer, buffer->pages); +	ret = rb_allocate_pages(cpu_buffer, nr_pages);  	if (ret < 0)  		goto fail_free_reader; @@ -1123,7 +1257,7 @@ static int rb_cpu_notify(struct notifier_block *self,  #endif  /** - * ring_buffer_alloc - allocate a new ring_buffer + * __ring_buffer_alloc - allocate a new ring_buffer   * @size: the size in bytes per cpu that is needed.   * @flags: attributes to set for the ring buffer.   
* @@ -1137,7 +1271,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  {  	struct ring_buffer *buffer;  	int bsize; -	int cpu; +	int cpu, nr_pages;  	/* keep it in its own cache line */  	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1148,14 +1282,17 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  	if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))  		goto fail_free_buffer; -	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); +	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);  	buffer->flags = flags;  	buffer->clock = trace_clock_local;  	buffer->reader_lock_key = key; +	init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); +	init_waitqueue_head(&buffer->irq_work.waiters); +  	/* need at least two pages */ -	if (buffer->pages < 2) -		buffer->pages = 2; +	if (nr_pages < 2) +		nr_pages = 2;  	/*  	 * In case of non-hotplug cpu, if the ring-buffer is allocated @@ -1163,7 +1300,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  	 * In that off case, we need to allocate for all possible cpus.  	 */  #ifdef CONFIG_HOTPLUG_CPU -	get_online_cpus(); +	cpu_notifier_register_begin();  	cpumask_copy(buffer->cpumask, cpu_online_mask);  #else  	cpumask_copy(buffer->cpumask, cpu_possible_mask); @@ -1178,7 +1315,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  	for_each_buffer_cpu(buffer, cpu) {  		buffer->buffers[cpu] = -			rb_allocate_cpu_buffer(buffer, cpu); +			rb_allocate_cpu_buffer(buffer, nr_pages, cpu);  		if (!buffer->buffers[cpu])  			goto fail_free_buffers;  	} @@ -1186,10 +1323,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,  #ifdef CONFIG_HOTPLUG_CPU  	buffer->cpu_notify.notifier_call = rb_cpu_notify;  	buffer->cpu_notify.priority = 0; -	register_cpu_notifier(&buffer->cpu_notify); +	__register_cpu_notifier(&buffer->cpu_notify); +	cpu_notifier_register_done();  #endif -	put_online_cpus();  	mutex_init(&buffer->mutex);  	return buffer; @@ -1203,7 +1340,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,   fail_free_cpumask:  	free_cpumask_var(buffer->cpumask); -	put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU +	cpu_notifier_register_done(); +#endif   fail_free_buffer:  	kfree(buffer); @@ -1220,16 +1359,17 @@ ring_buffer_free(struct ring_buffer *buffer)  {  	int cpu; -	get_online_cpus(); -  #ifdef CONFIG_HOTPLUG_CPU -	unregister_cpu_notifier(&buffer->cpu_notify); +	cpu_notifier_register_begin(); +	__unregister_cpu_notifier(&buffer->cpu_notify);  #endif  	for_each_buffer_cpu(buffer, cpu)  		rb_free_cpu_buffer(buffer->buffers[cpu]); -	put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU +	cpu_notifier_register_done(); +#endif  	kfree(buffer->buffers);  	free_cpumask_var(buffer->cpumask); @@ -1246,78 +1386,241 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,  static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); -static void -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +static inline unsigned long rb_page_entries(struct buffer_page *bpage)  { -	struct buffer_page *bpage; -	struct list_head *p; -	unsigned i; +	return local_read(&bpage->entries) & RB_WRITE_MASK; +} -	spin_lock_irq(&cpu_buffer->reader_lock); -	rb_head_page_deactivate(cpu_buffer); +static inline unsigned long rb_page_write(struct buffer_page *bpage) +{ +	return local_read(&bpage->write) & RB_WRITE_MASK; +} -	for (i = 0; i < nr_pages; i++) { -		if (RB_WARN_ON(cpu_buffer, 
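For reference, the write side that these per-cpu structures serve is unchanged by the resize rework. A minimal sketch of allocating a buffer and committing one event, where the size, flags and payload are arbitrary:

#include <linux/ring_buffer.h>
#include <linux/string.h>

static struct ring_buffer *example_buffer_setup(void)
{
        struct ring_buffer *buffer;
        struct ring_buffer_event *event;
        void *body;

        /* Roughly 1 MB per cpu, overwriting the oldest data when full. */
        buffer = ring_buffer_alloc(1 << 20, RB_FL_OVERWRITE);
        if (!buffer)
                return NULL;

        event = ring_buffer_lock_reserve(buffer, 16);
        if (event) {
                body = ring_buffer_event_data(event);
                memcpy(body, "hello ring", 11);
                ring_buffer_unlock_commit(buffer, event);
        }

        return buffer;
}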
list_empty(cpu_buffer->pages))) -			goto out; -		p = cpu_buffer->pages->next; -		bpage = list_entry(p, struct buffer_page, list); -		list_del_init(&bpage->list); -		free_buffer_page(bpage); +static int +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +{ +	struct list_head *tail_page, *to_remove, *next_page; +	struct buffer_page *to_remove_page, *tmp_iter_page; +	struct buffer_page *last_page, *first_page; +	unsigned int nr_removed; +	unsigned long head_bit; +	int page_entries; + +	head_bit = 0; + +	raw_spin_lock_irq(&cpu_buffer->reader_lock); +	atomic_inc(&cpu_buffer->record_disabled); +	/* +	 * We don't race with the readers since we have acquired the reader +	 * lock. We also don't race with writers after disabling recording. +	 * This makes it easy to figure out the first and the last page to be +	 * removed from the list. We unlink all the pages in between including +	 * the first and last pages. This is done in a busy loop so that we +	 * lose the least number of traces. +	 * The pages are freed after we restart recording and unlock readers. +	 */ +	tail_page = &cpu_buffer->tail_page->list; + +	/* +	 * tail page might be on reader page, we remove the next page +	 * from the ring buffer +	 */ +	if (cpu_buffer->tail_page == cpu_buffer->reader_page) +		tail_page = rb_list_head(tail_page->next); +	to_remove = tail_page; + +	/* start of pages to remove */ +	first_page = list_entry(rb_list_head(to_remove->next), +				struct buffer_page, list); + +	for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { +		to_remove = rb_list_head(to_remove)->next; +		head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;  	} -	if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) -		goto out; -	rb_reset_cpu(cpu_buffer); -	rb_check_pages(cpu_buffer); +	next_page = rb_list_head(to_remove)->next; -out: -	spin_unlock_irq(&cpu_buffer->reader_lock); +	/* +	 * Now we remove all pages between tail_page and next_page. +	 * Make sure that we have head_bit value preserved for the +	 * next page +	 */ +	tail_page->next = (struct list_head *)((unsigned long)next_page | +						head_bit); +	next_page = rb_list_head(next_page); +	next_page->prev = tail_page; + +	/* make sure pages points to a valid page in the ring buffer */ +	cpu_buffer->pages = next_page; + +	/* update head page */ +	if (head_bit) +		cpu_buffer->head_page = list_entry(next_page, +						struct buffer_page, list); + +	/* +	 * change read pointer to make sure any read iterators reset +	 * themselves +	 */ +	cpu_buffer->read = 0; + +	/* pages are removed, resume tracing and then free the pages */ +	atomic_dec(&cpu_buffer->record_disabled); +	raw_spin_unlock_irq(&cpu_buffer->reader_lock); + +	RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); + +	/* last buffer page to remove */ +	last_page = list_entry(rb_list_head(to_remove), struct buffer_page, +				list); +	tmp_iter_page = first_page; + +	do { +		to_remove_page = tmp_iter_page; +		rb_inc_page(cpu_buffer, &tmp_iter_page); + +		/* update the counters */ +		page_entries = rb_page_entries(to_remove_page); +		if (page_entries) { +			/* +			 * If something was added to this page, it was full +			 * since it is not the tail page. So we deduct the +			 * bytes consumed in ring buffer from here. +			 * Increment overrun to account for the lost events. 
+			 */ +			local_add(page_entries, &cpu_buffer->overrun); +			local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); +		} + +		/* +		 * We have already removed references to this list item, just +		 * free up the buffer_page and its page +		 */ +		free_buffer_page(to_remove_page); +		nr_removed--; + +	} while (to_remove_page != last_page); + +	RB_WARN_ON(cpu_buffer, nr_removed); + +	return nr_removed == 0;  } -static void -rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, -		struct list_head *pages, unsigned nr_pages) +static int +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)  { -	struct buffer_page *bpage; -	struct list_head *p; -	unsigned i; +	struct list_head *pages = &cpu_buffer->new_pages; +	int retries, success; -	spin_lock_irq(&cpu_buffer->reader_lock); -	rb_head_page_deactivate(cpu_buffer); +	raw_spin_lock_irq(&cpu_buffer->reader_lock); +	/* +	 * We are holding the reader lock, so the reader page won't be swapped +	 * in the ring buffer. Now we are racing with the writer trying to +	 * move head page and the tail page. +	 * We are going to adapt the reader page update process where: +	 * 1. We first splice the start and end of list of new pages between +	 *    the head page and its previous page. +	 * 2. We cmpxchg the prev_page->next to point from head page to the +	 *    start of new pages list. +	 * 3. Finally, we update the head->prev to the end of new list. +	 * +	 * We will try this process 10 times, to make sure that we don't keep +	 * spinning. +	 */ +	retries = 10; +	success = 0; +	while (retries--) { +		struct list_head *head_page, *prev_page, *r; +		struct list_head *last_page, *first_page; +		struct list_head *head_page_with_bit; -	for (i = 0; i < nr_pages; i++) { -		if (RB_WARN_ON(cpu_buffer, list_empty(pages))) -			goto out; -		p = pages->next; -		bpage = list_entry(p, struct buffer_page, list); -		list_del_init(&bpage->list); -		list_add_tail(&bpage->list, cpu_buffer->pages); +		head_page = &rb_set_head_page(cpu_buffer)->list; +		if (!head_page) +			break; +		prev_page = head_page->prev; + +		first_page = pages->next; +		last_page  = pages->prev; + +		head_page_with_bit = (struct list_head *) +				     ((unsigned long)head_page | RB_PAGE_HEAD); + +		last_page->next = head_page_with_bit; +		first_page->prev = prev_page; + +		r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); + +		if (r == head_page_with_bit) { +			/* +			 * yay, we replaced the page pointer to our new list, +			 * now, we just have to update to head page's prev +			 * pointer to point to end of list +			 */ +			head_page->prev = last_page; +			success = 1; +			break; +		}  	} -	rb_reset_cpu(cpu_buffer); -	rb_check_pages(cpu_buffer); -out: -	spin_unlock_irq(&cpu_buffer->reader_lock); +	if (success) +		INIT_LIST_HEAD(pages); +	/* +	 * If we weren't successful in adding in new pages, warn and stop +	 * tracing +	 */ +	RB_WARN_ON(cpu_buffer, !success); +	raw_spin_unlock_irq(&cpu_buffer->reader_lock); + +	/* free pages if they weren't inserted */ +	if (!success) { +		struct buffer_page *bpage, *tmp; +		list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, +					 list) { +			list_del_init(&bpage->list); +			free_buffer_page(bpage); +		} +	} +	return success; +} + +static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ +	int success; + +	if (cpu_buffer->nr_pages_to_update > 0) +		success = rb_insert_pages(cpu_buffer); +	else +		success = rb_remove_pages(cpu_buffer, +					-cpu_buffer->nr_pages_to_update); + +	if (success) +		cpu_buffer->nr_pages += 
cpu_buffer->nr_pages_to_update; +} + +static void update_pages_handler(struct work_struct *work) +{ +	struct ring_buffer_per_cpu *cpu_buffer = container_of(work, +			struct ring_buffer_per_cpu, update_pages_work); +	rb_update_pages(cpu_buffer); +	complete(&cpu_buffer->update_done);  }  /**   * ring_buffer_resize - resize the ring buffer   * @buffer: the buffer to resize.   * @size: the new size. + * @cpu_id: the cpu buffer to resize   *   * Minimum size is 2 * BUF_PAGE_SIZE.   * - * Returns -1 on failure. + * Returns 0 on success and < 0 on failure.   */ -int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, +			int cpu_id)  {  	struct ring_buffer_per_cpu *cpu_buffer; -	unsigned nr_pages, rm_pages, new_pages; -	struct buffer_page *bpage, *tmp; -	unsigned long buffer_size; -	unsigned long addr; -	LIST_HEAD(pages); -	int i, cpu; +	unsigned nr_pages; +	int cpu, err = 0;  	/*  	 * Always succeed at resizing a non-existent buffer: @@ -1325,109 +1628,199 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)  	if (!buffer)  		return size; +	/* Make sure the requested buffer exists */ +	if (cpu_id != RING_BUFFER_ALL_CPUS && +	    !cpumask_test_cpu(cpu_id, buffer->cpumask)) +		return size; +  	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);  	size *= BUF_PAGE_SIZE; -	buffer_size = buffer->pages * BUF_PAGE_SIZE;  	/* we need a minimum of two pages */  	if (size < BUF_PAGE_SIZE * 2)  		size = BUF_PAGE_SIZE * 2; -	if (size == buffer_size) -		return size; - -	atomic_inc(&buffer->record_disabled); +	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); -	/* Make sure all writers are done with this buffer. */ -	synchronize_sched(); +	/* +	 * Don't succeed if resizing is disabled, as a reader might be +	 * manipulating the ring buffer and is expecting a sane state while +	 * this is true. +	 */ +	if (atomic_read(&buffer->resize_disabled)) +		return -EBUSY; +	/* prevent another thread from changing buffer sizes */  	mutex_lock(&buffer->mutex); -	get_online_cpus(); - -	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); -	if (size < buffer_size) { +	if (cpu_id == RING_BUFFER_ALL_CPUS) { +		/* calculate the pages to update */ +		for_each_buffer_cpu(buffer, cpu) { +			cpu_buffer = buffer->buffers[cpu]; -		/* easy case, just free pages */ -		if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) -			goto out_fail; +			cpu_buffer->nr_pages_to_update = nr_pages - +							cpu_buffer->nr_pages; +			/* +			 * nothing more to do for removing pages or no update +			 */ +			if (cpu_buffer->nr_pages_to_update <= 0) +				continue; +			/* +			 * to add pages, make sure all new pages can be +			 * allocated without receiving ENOMEM +			 */ +			INIT_LIST_HEAD(&cpu_buffer->new_pages); +			if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, +						&cpu_buffer->new_pages, cpu)) { +				/* not enough memory for new pages */ +				err = -ENOMEM; +				goto out_err; +			} +		} -		rm_pages = buffer->pages - nr_pages; +		get_online_cpus(); +		/* +		 * Fire off all the required work handlers +		 * We can't schedule on offline CPUs, but it's not necessary +		 * since we can change their buffer sizes without any race. +		 */ +		for_each_buffer_cpu(buffer, cpu) { +			cpu_buffer = buffer->buffers[cpu]; +			if (!cpu_buffer->nr_pages_to_update) +				continue; + +			/* The update must run on the CPU that is being updated. 
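
rb_insert_pages() above splices the pre-built chain of new pages in front of the head page with a cmpxchg() on prev_page->next and retries up to ten times before giving up and freeing the pages. A rough user-space analogue of that bounded compare-and-swap retry, with C11 atomics standing in for cmpxchg() (the helper and names below are illustrative):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct node { struct node *next; };

	static struct node *_Atomic prev_next;		/* stand-in for prev_page->next */

	/*
	 * Splice new_first..new_last in front of whatever *linkp points at,
	 * retrying a bounded number of times in case a concurrent writer
	 * moved the link under us.
	 */
	static bool splice_with_retries(struct node *_Atomic *linkp,
					struct node *new_first,
					struct node *new_last)
	{
		int retries = 10;

		while (retries--) {
			struct node *old = atomic_load(linkp);

			new_last->next = old;	/* new tail points at the old successor */
			/* Publish only if the link is still the value we sampled. */
			if (atomic_compare_exchange_strong(linkp, &old, new_first))
				return true;
		}
		return false;	/* caller frees the never-inserted pages, as the patch does */
	}

	int main(void)
	{
		struct node old_head = { .next = NULL };
		struct node n1, n2;

		atomic_store(&prev_next, &old_head);
		n1.next = &n2;				/* pre-built chain of "new pages" */
		if (splice_with_retries(&prev_next, &n1, &n2))
			printf("new pages spliced in front of the old head\n");
		return 0;
	}
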
*/ +			preempt_disable(); +			if (cpu == smp_processor_id() || !cpu_online(cpu)) { +				rb_update_pages(cpu_buffer); +				cpu_buffer->nr_pages_to_update = 0; +			} else { +				/* +				 * Can not disable preemption for schedule_work_on() +				 * on PREEMPT_RT. +				 */ +				preempt_enable(); +				schedule_work_on(cpu, +						&cpu_buffer->update_pages_work); +				preempt_disable(); +			} +			preempt_enable(); +		} +		/* wait for all the updates to complete */  		for_each_buffer_cpu(buffer, cpu) {  			cpu_buffer = buffer->buffers[cpu]; -			rb_remove_pages(cpu_buffer, rm_pages); +			if (!cpu_buffer->nr_pages_to_update) +				continue; + +			if (cpu_online(cpu)) +				wait_for_completion(&cpu_buffer->update_done); +			cpu_buffer->nr_pages_to_update = 0;  		} -		goto out; -	} -	/* -	 * This is a bit more difficult. We only want to add pages -	 * when we can allocate enough for all CPUs. We do this -	 * by allocating all the pages and storing them on a local -	 * link list. If we succeed in our allocation, then we -	 * add these pages to the cpu_buffers. Otherwise we just free -	 * them all and return -ENOMEM; -	 */ -	if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) -		goto out_fail; +		put_online_cpus(); +	} else { +		/* Make sure this CPU has been intitialized */ +		if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) +			goto out; -	new_pages = nr_pages - buffer->pages; +		cpu_buffer = buffer->buffers[cpu_id]; -	for_each_buffer_cpu(buffer, cpu) { -		for (i = 0; i < new_pages; i++) { -			bpage = kzalloc_node(ALIGN(sizeof(*bpage), -						  cache_line_size()), -					    GFP_KERNEL, cpu_to_node(cpu)); -			if (!bpage) -				goto free_pages; -			list_add(&bpage->list, &pages); -			addr = __get_free_page(GFP_KERNEL); -			if (!addr) -				goto free_pages; -			bpage->page = (void *)addr; -			rb_init_page(bpage->page); +		if (nr_pages == cpu_buffer->nr_pages) +			goto out; + +		cpu_buffer->nr_pages_to_update = nr_pages - +						cpu_buffer->nr_pages; + +		INIT_LIST_HEAD(&cpu_buffer->new_pages); +		if (cpu_buffer->nr_pages_to_update > 0 && +			__rb_allocate_pages(cpu_buffer->nr_pages_to_update, +					    &cpu_buffer->new_pages, cpu_id)) { +			err = -ENOMEM; +			goto out_err;  		} -	} -	for_each_buffer_cpu(buffer, cpu) { -		cpu_buffer = buffer->buffers[cpu]; -		rb_insert_pages(cpu_buffer, &pages, new_pages); -	} +		get_online_cpus(); -	if (RB_WARN_ON(buffer, !list_empty(&pages))) -		goto out_fail; +		preempt_disable(); +		/* The update must run on the CPU that is being updated. */ +		if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) +			rb_update_pages(cpu_buffer); +		else { +			/* +			 * Can not disable preemption for schedule_work_on() +			 * on PREEMPT_RT. +			 */ +			preempt_enable(); +			schedule_work_on(cpu_id, +					 &cpu_buffer->update_pages_work); +			wait_for_completion(&cpu_buffer->update_done); +			preempt_disable(); +		} +		preempt_enable(); + +		cpu_buffer->nr_pages_to_update = 0; +		put_online_cpus(); +	}   out: -	buffer->pages = nr_pages; -	put_online_cpus(); +	/* +	 * The ring buffer resize can happen with the ring buffer +	 * enabled, so that the update disturbs the tracing as little +	 * as possible. But if the buffer is disabled, we do not need +	 * to worry about that, and we can take the time to verify +	 * that the buffer is not corrupt. +	 */ +	if (atomic_read(&buffer->record_disabled)) { +		atomic_inc(&buffer->record_disabled); +		/* +		 * Even though the buffer was disabled, we must make sure +		 * that it is truly disabled before calling rb_check_pages. 
+		 * There could have been a race between checking +		 * record_disable and incrementing it. +		 */ +		synchronize_sched(); +		for_each_buffer_cpu(buffer, cpu) { +			cpu_buffer = buffer->buffers[cpu]; +			rb_check_pages(cpu_buffer); +		} +		atomic_dec(&buffer->record_disabled); +	} +  	mutex_unlock(&buffer->mutex); +	return size; -	atomic_dec(&buffer->record_disabled); + out_err: +	for_each_buffer_cpu(buffer, cpu) { +		struct buffer_page *bpage, *tmp; -	return size; +		cpu_buffer = buffer->buffers[cpu]; +		cpu_buffer->nr_pages_to_update = 0; - free_pages: -	list_for_each_entry_safe(bpage, tmp, &pages, list) { -		list_del_init(&bpage->list); -		free_buffer_page(bpage); +		if (list_empty(&cpu_buffer->new_pages)) +			continue; + +		list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, +					list) { +			list_del_init(&bpage->list); +			free_buffer_page(bpage); +		}  	} -	put_online_cpus();  	mutex_unlock(&buffer->mutex); -	atomic_dec(&buffer->record_disabled); -	return -ENOMEM; +	return err; +} +EXPORT_SYMBOL_GPL(ring_buffer_resize); -	/* -	 * Something went totally wrong, and we are too paranoid -	 * to even clean up the mess. -	 */ - out_fail: -	put_online_cpus(); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ +	mutex_lock(&buffer->mutex); +	if (val) +		buffer->flags |= RB_FL_OVERWRITE; +	else +		buffer->flags &= ~RB_FL_OVERWRITE;  	mutex_unlock(&buffer->mutex); -	atomic_dec(&buffer->record_disabled); -	return -1;  } -EXPORT_SYMBOL_GPL(ring_buffer_resize); +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);  static inline void *  __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) @@ -1453,22 +1846,12 @@ rb_iter_head_event(struct ring_buffer_iter *iter)  	return __rb_page_index(iter->head_page, iter->head);  } -static inline unsigned long rb_page_write(struct buffer_page *bpage) -{ -	return local_read(&bpage->write) & RB_WRITE_MASK; -} -  static inline unsigned rb_page_commit(struct buffer_page *bpage)  {  	return local_read(&bpage->page->commit);  } -static inline unsigned long rb_page_entries(struct buffer_page *bpage) -{ -	return local_read(&bpage->entries) & RB_WRITE_MASK; -} - -/* Size is determined by what has been commited */ +/* Size is determined by what has been committed */  static inline unsigned rb_page_size(struct buffer_page *bpage)  {  	return rb_page_commit(bpage); @@ -1516,7 +1899,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)  	 * assign the commit to the tail.  	 */   again: -	max_count = cpu_buffer->buffer->pages * 100; +	max_count = cpu_buffer->nr_pages * 100;  	while (cpu_buffer->commit_page != cpu_buffer->tail_page) {  		if (RB_WARN_ON(cpu_buffer, !(--max_count))) @@ -1600,7 +1983,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)  }  /** - * ring_buffer_update_event - update event type and data + * rb_update_event - update event type and data   * @event: the even to update   * @type: the type of event   * @length: the size of the event field in the ring buffer @@ -1684,6 +2067,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,  		 * the counters.  		 
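
ring_buffer_resize() above allocates every page it will need, for every CPU being resized, before it touches any live buffer, and on -ENOMEM frees the partial allocation so the old sizes stay in place. A condensed user-space sketch of that allocate-everything-or-roll-back pattern (NR_CPUS and the malloc()-backed pages are illustrative):

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define NR_CPUS 4

	/* Grow all per-cpu buffers, or change nothing at all. */
	static int resize_all_or_nothing(void *pages[NR_CPUS], size_t new_size)
	{
		int cpu;

		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			pages[cpu] = malloc(new_size);
			if (!pages[cpu])
				goto rollback;
		}
		return 0;		/* commit: caller may now swap the new pages in */

	rollback:
		while (--cpu >= 0) {	/* undo the partial allocation */
			free(pages[cpu]);
			pages[cpu] = NULL;
		}
		return -ENOMEM;
	}

	int main(void)
	{
		void *pages[NR_CPUS] = { NULL };
		int cpu;

		if (resize_all_or_nothing(pages, 4096) == 0) {
			printf("every per-cpu buffer grew; none was left half-sized\n");
			for (cpu = 0; cpu < NR_CPUS; cpu++)
				free(pages[cpu]);
		}
		return 0;
	}
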
*/  		local_add(entries, &cpu_buffer->overrun); +		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);  		/*  		 * The entries will be zeroed out when we move the @@ -1839,6 +2223,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,  	event = __rb_page_index(tail_page, tail);  	kmemcheck_annotate_bitfield(event, bitfield); +	/* account for padding bytes */ +	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); +  	/*  	 * Save the original length to the meta data.  	 * This will be used by the reader to add lost event @@ -1931,8 +2318,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,  			 * If we are not in overwrite mode,  			 * this is easy, just stop here.  			 */ -			if (!(buffer->flags & RB_FL_OVERWRITE)) +			if (!(buffer->flags & RB_FL_OVERWRITE)) { +				local_inc(&cpu_buffer->dropped_events);  				goto out_reset; +			}  			ret = rb_handle_head_page(cpu_buffer,  						  tail_page, @@ -2010,6 +2399,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  	write &= RB_WRITE_MASK;  	tail = write - length; +	/* +	 * If this is the first commit on the page, then it has the same +	 * timestamp as the page itself. +	 */ +	if (!tail) +		delta = 0; +  	/* See if we shot pass the end of this buffer page */  	if (unlikely(write > BUF_PAGE_SIZE))  		return rb_move_tail(cpu_buffer, length, tail, @@ -2030,6 +2426,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  	if (!tail)  		tail_page->page->time_stamp = ts; +	/* account for these added bytes */ +	local_add(length, &cpu_buffer->entries_bytes); +  	return event;  } @@ -2052,6 +2451,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,  	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {  		unsigned long write_mask =  			local_read(&bpage->write) & ~RB_WRITE_MASK; +		unsigned long event_length = rb_event_length(event);  		/*  		 * This is on the tail page. It is possible that  		 * a write could come in and move the tail page @@ -2061,8 +2461,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,  		old_index += write_mask;  		new_index += write_mask;  		index = local_cmpxchg(&bpage->write, old_index, new_index); -		if (index == old_index) +		if (index == old_index) { +			/* update counters */ +			local_sub(event_length, &cpu_buffer->entries_bytes);  			return 1; +		}  	}  	/* could not discard */ @@ -2162,11 +2565,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,  	if (likely(ts >= cpu_buffer->write_stamp)) {  		delta = diff;  		if (unlikely(test_time_stamp(delta))) { +			int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +			local_clock_stable = sched_clock_stable(); +#endif  			WARN_ONCE(delta > (1ULL << 59), -				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", +				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",  				  (unsigned long long)delta,  				  (unsigned long long)ts, -				  (unsigned long long)cpu_buffer->write_stamp); +				  (unsigned long long)cpu_buffer->write_stamp, +				  local_clock_stable ? "" : +				  "If you just came from a suspend/resume,\n" +				  "please switch to the trace global clock:\n" +				  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");  			add_timestamp = 1;  		}  	} @@ -2188,41 +2599,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,  #ifdef CONFIG_TRACING -#define TRACE_RECURSIVE_DEPTH 16 +/* + * The lock and unlock are done within a preempt disable section. 
+ * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + *  bit 0 =  NMI context + *  bit 1 =  IRQ context + *  bit 2 =  SoftIRQ context + *  bit 3 =  normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + *  101 - 1 = 100 + *  101 & 100 = 100 (clearing bit zero) + * + *  1010 - 1 = 1001 + *  1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ +static DEFINE_PER_CPU(unsigned int, current_context); -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) +static __always_inline int trace_recursive_lock(void)  { -	/* Disable all tracing before we do anything else */ -	tracing_off_permanent(); +	unsigned int val = this_cpu_read(current_context); +	int bit; -	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" -		    "HC[%lu]:SC[%lu]:NMI[%lu]\n", -		    current->trace_recursion, -		    hardirq_count() >> HARDIRQ_SHIFT, -		    softirq_count() >> SOFTIRQ_SHIFT, -		    in_nmi()); - -	WARN_ON_ONCE(1); -} - -static inline int trace_recursive_lock(void) -{ -	current->trace_recursion++; +	if (in_interrupt()) { +		if (in_nmi()) +			bit = 0; +		else if (in_irq()) +			bit = 1; +		else +			bit = 2; +	} else +		bit = 3; -	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) -		return 0; +	if (unlikely(val & (1 << bit))) +		return 1; -	trace_recursive_fail(); +	val |= (1 << bit); +	this_cpu_write(current_context, val); -	return -1; +	return 0;  } -static inline void trace_recursive_unlock(void) +static __always_inline void trace_recursive_unlock(void)  { -	WARN_ON_ONCE(!current->trace_recursion); +	unsigned int val = this_cpu_read(current_context); -	current->trace_recursion--; +	val--; +	val &= this_cpu_read(current_context); +	this_cpu_write(current_context, val);  }  #else @@ -2330,6 +2776,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,  	rb_end_commit(cpu_buffer);  } +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ +	if (buffer->irq_work.waiters_pending) { +		buffer->irq_work.waiters_pending = false; +		/* irq_work_queue() supplies it's own memory barriers */ +		irq_work_queue(&buffer->irq_work.work); +	} + +	if (cpu_buffer->irq_work.waiters_pending) { +		cpu_buffer->irq_work.waiters_pending = false; +		/* irq_work_queue() supplies it's own memory barriers */ +		irq_work_queue(&cpu_buffer->irq_work.work); +	} +} +  /**   * ring_buffer_unlock_commit - commit a reserved   * @buffer: The buffer to commit to @@ -2349,6 +2811,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,  	rb_commit(cpu_buffer, event); +	rb_wakeups(buffer, cpu_buffer); +  	trace_recursive_unlock();  	
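
The comment block above describes the per-context recursion guard used by trace_recursive_lock()/trace_recursive_unlock(). A small stand-alone sketch of the same bit trick, with the in_nmi()/in_irq()/in_softirq() tests replaced by an explicit context argument (the enum and function names are illustrative):

	#include <assert.h>
	#include <stdio.h>

	enum ctx { CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };	/* bits 0..3 */

	static unsigned int current_context;	/* per-CPU in the kernel */

	/* Returns nonzero if this context is already inside the tracer. */
	static int recursive_lock(enum ctx bit)
	{
		if (current_context & (1u << bit))
			return 1;			/* recursion within one context */
		current_context |= 1u << bit;
		return 0;
	}

	/*
	 * Clear the most recently set bit.  Because only higher-priority
	 * contexts (lower bit numbers) can nest, that is always the lowest
	 * bit set, and (val - 1) & val clears exactly that bit.
	 */
	static void recursive_unlock(void)
	{
		current_context &= current_context - 1;
	}

	int main(void)
	{
		assert(recursive_lock(CTX_NORMAL) == 0);	/* first entry is fine */
		assert(recursive_lock(CTX_IRQ) == 0);		/* an interrupt may nest... */
		assert(recursive_lock(CTX_IRQ) == 1);		/* ...but not recurse in itself */
		recursive_unlock();				/* drops only the IRQ bit */
		assert(current_context == (1u << CTX_NORMAL));
		recursive_unlock();
		assert(current_context == 0);
		printf("per-context recursion guard behaves as described\n");
		return 0;
	}
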
preempt_enable_notrace(); @@ -2481,8 +2945,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);   * and not the length of the event which would hold the header.   */  int ring_buffer_write(struct ring_buffer *buffer, -			unsigned long length, -			void *data) +		      unsigned long length, +		      void *data)  {  	struct ring_buffer_per_cpu *cpu_buffer;  	struct ring_buffer_event *event; @@ -2521,6 +2985,8 @@ int ring_buffer_write(struct ring_buffer *buffer,  	rb_commit(cpu_buffer, event); +	rb_wakeups(buffer, cpu_buffer); +  	ret = 0;   out:  	preempt_enable_notrace(); @@ -2574,6 +3040,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)  EXPORT_SYMBOL_GPL(ring_buffer_record_enable);  /** + * ring_buffer_record_off - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * This is different than ring_buffer_record_disable() as + * it works like an on/off switch, where as the disable() version + * must be paired with a enable(). + */ +void ring_buffer_record_off(struct ring_buffer *buffer) +{ +	unsigned int rd; +	unsigned int new_rd; + +	do { +		rd = atomic_read(&buffer->record_disabled); +		new_rd = rd | RB_BUFFER_OFF; +	} while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_off); + +/** + * ring_buffer_record_on - restart writes into the buffer + * @buffer: The ring buffer to start writes to. + * + * This enables all writes to the buffer that was disabled by + * ring_buffer_record_off(). + * + * This is different than ring_buffer_record_enable() as + * it works like an on/off switch, where as the enable() version + * must be paired with a disable(). + */ +void ring_buffer_record_on(struct ring_buffer *buffer) +{ +	unsigned int rd; +	unsigned int new_rd; + +	do { +		rd = atomic_read(&buffer->record_disabled); +		new_rd = rd & ~RB_BUFFER_OFF; +	} while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_on); + +/** + * ring_buffer_record_is_on - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * + * Returns true if the ring buffer is in a state that it accepts writes. + */ +int ring_buffer_record_is_on(struct ring_buffer *buffer) +{ +	return !atomic_read(&buffer->record_disabled); +} + +/**   * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer   * @buffer: The ring buffer to stop writes to.   * @cpu: The CPU buffer to stop @@ -2629,6 +3152,59 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)  }  /** + * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. 
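
ring_buffer_record_off()/ring_buffer_record_on() above set and clear the RB_BUFFER_OFF bit with an atomic_cmpxchg() loop, so they compose with the counting record_disable()/record_enable() calls that share the same word. A user-space sketch with C11 atomics; RECORD_OFF_BIT below stands in for RB_BUFFER_OFF:

	#include <stdatomic.h>
	#include <stdio.h>

	#define RECORD_OFF_BIT	(1u << 20)	/* stand-in for RB_BUFFER_OFF */

	static atomic_uint record_disabled;	/* also holds the nested disable count */

	/* Flip the on/off bit without touching the nesting count. */
	static void record_off(void)
	{
		unsigned int rd, new_rd;

		do {
			rd = atomic_load(&record_disabled);
			new_rd = rd | RECORD_OFF_BIT;
		} while (!atomic_compare_exchange_weak(&record_disabled, &rd, new_rd));
	}

	static void record_on(void)
	{
		unsigned int rd, new_rd;

		do {
			rd = atomic_load(&record_disabled);
			new_rd = rd & ~RECORD_OFF_BIT;
		} while (!atomic_compare_exchange_weak(&record_disabled, &rd, new_rd));
	}

	int main(void)
	{
		atomic_fetch_add(&record_disabled, 1);	/* a nested disable is in flight */
		record_off();
		record_on();				/* the count of 1 survives both */
		printf("record_disabled = %u\n", atomic_load(&record_disabled));
		return 0;
	}
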
+ */ +u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +{ +	unsigned long flags; +	struct ring_buffer_per_cpu *cpu_buffer; +	struct buffer_page *bpage; +	u64 ret = 0; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +	/* +	 * if the tail is on reader_page, oldest time stamp is on the reader +	 * page +	 */ +	if (cpu_buffer->tail_page == cpu_buffer->reader_page) +		bpage = cpu_buffer->reader_page; +	else +		bpage = rb_set_head_page(cpu_buffer); +	if (bpage) +		ret = bpage->page->time_stamp; +	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + +	return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); + +/** + * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned long ret; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; + +	return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); + +/**   * ring_buffer_entries_cpu - get the number of entries in a cpu buffer   * @buffer: The ring buffer   * @cpu: The per CPU buffer to get the entries from. @@ -2647,7 +3223,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)  EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);  /** - * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * ring_buffer_overrun_cpu - get the number of overruns caused by the ring + * buffer wrapping around (only if RB_FL_OVERWRITE is on).   * @buffer: The ring buffer   * @cpu: The per CPU buffer to get the number of overruns from   */ @@ -2667,7 +3244,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)  EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);  /** - * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by + * commits failing due to the buffer wrapping around while there are uncommitted + * events, such as during an interrupt storm.   * @buffer: The ring buffer   * @cpu: The per CPU buffer to get the number of overruns from   */ @@ -2688,6 +3267,46 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)  EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);  /** + * ring_buffer_dropped_events_cpu - get the number of dropped events caused by + * the ring buffer filling up (only if RB_FL_OVERWRITE is off). 
+ * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; +	unsigned long ret; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	ret = local_read(&cpu_buffer->dropped_events); + +	return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); + +/** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +{ +	struct ring_buffer_per_cpu *cpu_buffer; + +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	cpu_buffer = buffer->buffers[cpu]; +	return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + +/**   * ring_buffer_entries - get the number of entries in a buffer   * @buffer: The ring buffer   * @@ -2772,9 +3391,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)  	cpu_buffer = iter->cpu_buffer; -	spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);  	rb_iter_reset(iter); -	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);  }  EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -2895,6 +3514,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)  	if (cpu_buffer->commit_page == cpu_buffer->reader_page)  		goto out; +	/* Don't bother swapping if the ring buffer is empty */ +	if (rb_num_of_entries(cpu_buffer) == 0) +		goto out; +  	/*  	 * Reset the reader page to size zero.  	 */ @@ -2908,13 +3531,15 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)  	 * Splice the empty reader page into the list around the head.  	 */  	reader = rb_set_head_page(cpu_buffer); +	if (!reader) +		goto out;  	cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);  	cpu_buffer->reader_page->list.prev = reader->list.prev;  	/*  	 * cpu_buffer->pages just needs to point to the buffer, it  	 *  has no specific buffer page to point to. Lets move it out -	 *  of our way so we don't accidently swap it. +	 *  of our way so we don't accidentally swap it.  	 
*/  	cpu_buffer->pages = reader->list.prev; @@ -3040,7 +3665,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)  	/* check for end of page padding */  	if ((iter->head >= rb_page_size(iter->head_page)) &&  	    (iter->head_page != cpu_buffer->commit_page)) -		rb_advance_iter(iter); +		rb_inc_iter(iter);  }  static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) @@ -3233,12 +3858,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,   again:  	local_irq_save(flags);  	if (dolock) -		spin_lock(&cpu_buffer->reader_lock); +		raw_spin_lock(&cpu_buffer->reader_lock);  	event = rb_buffer_peek(cpu_buffer, ts, lost_events);  	if (event && event->type_len == RINGBUF_TYPE_PADDING)  		rb_advance_reader(cpu_buffer);  	if (dolock) -		spin_unlock(&cpu_buffer->reader_lock); +		raw_spin_unlock(&cpu_buffer->reader_lock);  	local_irq_restore(flags);  	if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ -3263,9 +3888,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)  	unsigned long flags;   again: -	spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);  	event = rb_iter_peek(iter, ts); -	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);  	if (event && event->type_len == RINGBUF_TYPE_PADDING)  		goto again; @@ -3305,7 +3930,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,  	cpu_buffer = buffer->buffers[cpu];  	local_irq_save(flags);  	if (dolock) -		spin_lock(&cpu_buffer->reader_lock); +		raw_spin_lock(&cpu_buffer->reader_lock);  	event = rb_buffer_peek(cpu_buffer, ts, lost_events);  	if (event) { @@ -3314,7 +3939,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,  	}  	if (dolock) -		spin_unlock(&cpu_buffer->reader_lock); +		raw_spin_unlock(&cpu_buffer->reader_lock);  	local_irq_restore(flags);   out: @@ -3341,11 +3966,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);   * expected.   *   * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_prepare_sync. + * expected to make at least one call to ring_buffer_read_prepare_sync.   * Afterwards, ring_buffer_read_start is invoked to get things going   * for real.   * - * This overall must be paired with ring_buffer_finish. + * This overall must be paired with ring_buffer_read_finish.   */  struct ring_buffer_iter *  ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) @@ -3364,6 +3989,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)  	iter->cpu_buffer = cpu_buffer; +	atomic_inc(&buffer->resize_disabled);  	atomic_inc(&cpu_buffer->record_disabled);  	return iter; @@ -3393,7 +4019,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);   * an intervening ring_buffer_read_prepare_sync must have been   * performed.   * - * Must be paired with ring_buffer_finish. + * Must be paired with ring_buffer_read_finish.   
*/  void  ring_buffer_read_start(struct ring_buffer_iter *iter) @@ -3406,16 +4032,16 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)  	cpu_buffer = iter->cpu_buffer; -	spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);  	arch_spin_lock(&cpu_buffer->lock);  	rb_iter_reset(iter);  	arch_spin_unlock(&cpu_buffer->lock); -	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);  }  EXPORT_SYMBOL_GPL(ring_buffer_read_start);  /** - * ring_buffer_finish - finish reading the iterator of the buffer + * ring_buffer_read_finish - finish reading the iterator of the buffer   * @iter: The iterator retrieved by ring_buffer_start   *   * This re-enables the recording to the buffer, and frees the @@ -3425,8 +4051,20 @@ void  ring_buffer_read_finish(struct ring_buffer_iter *iter)  {  	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; +	unsigned long flags; + +	/* +	 * Ring buffer is disabled from recording, here's a good place +	 * to check the integrity of the ring buffer. +	 * Must prevent readers from trying to read, as the check +	 * clears the HEAD page and readers require it. +	 */ +	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +	rb_check_pages(cpu_buffer); +	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);  	atomic_dec(&cpu_buffer->record_disabled); +	atomic_dec(&cpu_buffer->buffer->resize_disabled);  	kfree(iter);  }  EXPORT_SYMBOL_GPL(ring_buffer_read_finish); @@ -3445,7 +4083,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)  	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;  	unsigned long flags; -	spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);   again:  	event = rb_iter_peek(iter, ts);  	if (!event) @@ -3456,7 +4094,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)  	rb_advance_iter(iter);   out: -	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);  	return event;  } @@ -3466,9 +4104,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);   * ring_buffer_size - return the size of the ring buffer (in bytes)   * @buffer: The ring buffer.   */ -unsigned long ring_buffer_size(struct ring_buffer *buffer) +unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)  { -	return BUF_PAGE_SIZE * buffer->pages; +	/* +	 * Earlier, this method returned +	 *	BUF_PAGE_SIZE * buffer->nr_pages +	 * Since the nr_pages field is now removed, we have converted this to +	 * return the per cpu buffer value. 
+	 */ +	if (!cpumask_test_cpu(cpu, buffer->cpumask)) +		return 0; + +	return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;  }  EXPORT_SYMBOL_GPL(ring_buffer_size); @@ -3489,17 +4136,21 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)  	cpu_buffer->commit_page = cpu_buffer->head_page;  	INIT_LIST_HEAD(&cpu_buffer->reader_page->list); +	INIT_LIST_HEAD(&cpu_buffer->new_pages);  	local_set(&cpu_buffer->reader_page->write, 0);  	local_set(&cpu_buffer->reader_page->entries, 0);  	local_set(&cpu_buffer->reader_page->page->commit, 0);  	cpu_buffer->reader_page->read = 0; -	local_set(&cpu_buffer->commit_overrun, 0); +	local_set(&cpu_buffer->entries_bytes, 0);  	local_set(&cpu_buffer->overrun, 0); +	local_set(&cpu_buffer->commit_overrun, 0); +	local_set(&cpu_buffer->dropped_events, 0);  	local_set(&cpu_buffer->entries, 0);  	local_set(&cpu_buffer->committing, 0);  	local_set(&cpu_buffer->commits, 0);  	cpu_buffer->read = 0; +	cpu_buffer->read_bytes = 0;  	cpu_buffer->write_stamp = 0;  	cpu_buffer->read_stamp = 0; @@ -3523,9 +4174,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)  	if (!cpumask_test_cpu(cpu, buffer->cpumask))  		return; +	atomic_inc(&buffer->resize_disabled);  	atomic_inc(&cpu_buffer->record_disabled); -	spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +	/* Make sure all commits have finished */ +	synchronize_sched(); + +	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);  	if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))  		goto out; @@ -3537,9 +4192,10 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)  	arch_spin_unlock(&cpu_buffer->lock);   out: -	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);  	atomic_dec(&cpu_buffer->record_disabled); +	atomic_dec(&buffer->resize_disabled);  }  EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -3575,10 +4231,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)  		cpu_buffer = buffer->buffers[cpu];  		local_irq_save(flags);  		if (dolock) -			spin_lock(&cpu_buffer->reader_lock); +			raw_spin_lock(&cpu_buffer->reader_lock);  		ret = rb_per_cpu_empty(cpu_buffer);  		if (dolock) -			spin_unlock(&cpu_buffer->reader_lock); +			raw_spin_unlock(&cpu_buffer->reader_lock);  		local_irq_restore(flags);  		if (!ret) @@ -3609,10 +4265,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)  	cpu_buffer = buffer->buffers[cpu];  	local_irq_save(flags);  	if (dolock) -		spin_lock(&cpu_buffer->reader_lock); +		raw_spin_lock(&cpu_buffer->reader_lock);  	ret = rb_per_cpu_empty(cpu_buffer);  	if (dolock) -		spin_unlock(&cpu_buffer->reader_lock); +		raw_spin_unlock(&cpu_buffer->reader_lock);  	local_irq_restore(flags);  	return ret; @@ -3641,8 +4297,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,  	    !cpumask_test_cpu(cpu, buffer_b->cpumask))  		goto out; +	cpu_buffer_a = buffer_a->buffers[cpu]; +	cpu_buffer_b = buffer_b->buffers[cpu]; +  	/* At least make sure the two buffers are somewhat the same */ -	if (buffer_a->pages != buffer_b->pages) +	if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)  		goto out;  	ret = -EAGAIN; @@ -3656,9 +4315,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,  	if (atomic_read(&buffer_b->record_disabled))  		goto out; -	cpu_buffer_a = buffer_a->buffers[cpu]; -	cpu_buffer_b = buffer_b->buffers[cpu]; -  	if (atomic_read(&cpu_buffer_a->record_disabled))  		goto out; @@ -3700,6 +4356,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);  /**   * 
ring_buffer_alloc_read_page - allocate a page to read from buffer   * @buffer: the buffer to allocate for. + * @cpu: the cpu buffer to allocate.   *   * This function is used in conjunction with ring_buffer_read_page.   * When reading a full page from the ring buffer, these functions @@ -3712,16 +4369,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);   * Returns:   *  The page allocated, or NULL on error.   */ -void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) +void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)  {  	struct buffer_data_page *bpage; -	unsigned long addr; +	struct page *page; -	addr = __get_free_page(GFP_KERNEL); -	if (!addr) +	page = alloc_pages_node(cpu_to_node(cpu), +				GFP_KERNEL | __GFP_NORETRY, 0); +	if (!page)  		return NULL; -	bpage = (void *)addr; +	bpage = page_address(page);  	rb_init_page(bpage); @@ -3756,7 +4414,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);   * to swap with a page in the ring buffer.   *   * for example: - *	rpage = ring_buffer_alloc_read_page(buffer); + *	rpage = ring_buffer_alloc_read_page(buffer, cpu);   *	if (!rpage)   *		return error;   *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); @@ -3808,7 +4466,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  	if (!bpage)  		goto out; -	spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);  	reader = rb_get_reader_page(cpu_buffer);  	if (!reader) @@ -3853,6 +4511,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  		/* Need to copy one event at a time */  		do { +			/* We need the size of one event, because +			 * rb_advance_reader only advances by one event, +			 * whereas rb_event_ts_length may include the size of +			 * one or two events. +			 * We have already ensured there's enough space if this +			 * is a time extend. 
*/ +			size = rb_event_length(event);  			memcpy(bpage->data + pos, rpage->data + rpos, size);  			len -= size; @@ -3867,7 +4532,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  			event = rb_reader_event(cpu_buffer);  			/* Always keep the time extend and data together */  			size = rb_event_ts_length(event); -		} while (len > size); +		} while (len >= size);  		/* update bpage */  		local_set(&bpage->commit, pos); @@ -3878,6 +4543,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  	} else {  		/* update the entry counter */  		cpu_buffer->read += rb_page_entries(reader); +		cpu_buffer->read_bytes += BUF_PAGE_SIZE;  		/* swap the pages */  		rb_init_page(bpage); @@ -3924,84 +4590,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,  		memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);   out_unlock: -	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);   out:  	return ret;  }  EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_TRACING -static ssize_t -rb_simple_read(struct file *filp, char __user *ubuf, -	       size_t cnt, loff_t *ppos) -{ -	unsigned long *p = filp->private_data; -	char buf[64]; -	int r; - -	if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) -		r = sprintf(buf, "permanently disabled\n"); -	else -		r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); - -	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -rb_simple_write(struct file *filp, const char __user *ubuf, -		size_t cnt, loff_t *ppos) -{ -	unsigned long *p = filp->private_data; -	char buf[64]; -	unsigned long val; -	int ret; - -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; - -	ret = strict_strtoul(buf, 10, &val); -	if (ret < 0) -		return ret; - -	if (val) -		set_bit(RB_BUFFERS_ON_BIT, p); -	else -		clear_bit(RB_BUFFERS_ON_BIT, p); - -	(*ppos)++; - -	return cnt; -} - -static const struct file_operations rb_simple_fops = { -	.open		= tracing_open_generic, -	.read		= rb_simple_read, -	.write		= rb_simple_write, -	.llseek		= default_llseek, -}; - - -static __init int rb_init_debugfs(void) -{ -	struct dentry *d_tracer; - -	d_tracer = tracing_init_dentry(); - -	trace_create_file("tracing_on", 0644, d_tracer, -			    &ring_buffer_flags, &rb_simple_fops); - -	return 0; -} - -fs_initcall(rb_init_debugfs); -#endif -  #ifdef CONFIG_HOTPLUG_CPU  static int rb_cpu_notify(struct notifier_block *self,  			 unsigned long action, void *hcpu) @@ -4009,6 +4604,8 @@ static int rb_cpu_notify(struct notifier_block *self,  	struct ring_buffer *buffer =  		container_of(self, struct ring_buffer, cpu_notify);  	long cpu = (long)hcpu; +	int cpu_i, nr_pages_same; +	unsigned int nr_pages;  	switch (action) {  	case CPU_UP_PREPARE: @@ -4016,8 +4613,23 @@ static int rb_cpu_notify(struct notifier_block *self,  		if (cpumask_test_cpu(cpu, buffer->cpumask))  			return NOTIFY_OK; +		nr_pages = 0; +		nr_pages_same = 1; +		/* check if all cpu sizes are same */ +		for_each_buffer_cpu(buffer, cpu_i) { +			/* fill in the size from first enabled cpu */ +			if (nr_pages == 0) +				nr_pages = buffer->buffers[cpu_i]->nr_pages; +			if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { +				nr_pages_same = 0; +				break; +			} +		} +		/* allocate minimum pages, user can later expand it */ +		if (!nr_pages_same) +			nr_pages = 2;  		buffer->buffers[cpu] = -			rb_allocate_cpu_buffer(buffer, cpu); +			rb_allocate_cpu_buffer(buffer, nr_pages, cpu);  		if 
(!buffer->buffers[cpu]) {  			WARN(1, "failed to allocate ring buffer on CPU %ld\n",  			     cpu); @@ -4040,3 +4652,320 @@ static int rb_cpu_notify(struct notifier_block *self,  	return NOTIFY_OK;  }  #endif + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. + */ +static struct task_struct *rb_threads[NR_CPUS] __initdata; + +struct rb_test_data { +	struct ring_buffer	*buffer; +	unsigned long		events; +	unsigned long		bytes_written; +	unsigned long		bytes_alloc; +	unsigned long		bytes_dropped; +	unsigned long		events_nested; +	unsigned long		bytes_written_nested; +	unsigned long		bytes_alloc_nested; +	unsigned long		bytes_dropped_nested; +	int			min_size_nested; +	int			max_size_nested; +	int			max_size; +	int			min_size; +	int			cpu; +	int			cnt; +}; + +static struct rb_test_data rb_data[NR_CPUS] __initdata; + +/* 1 meg per cpu */ +#define RB_TEST_BUFFER_SIZE	1048576 + +static char rb_string[] __initdata = +	"abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" +	"?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" +	"!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; + +static bool rb_test_started __initdata; + +struct rb_item { +	int size; +	char str[]; +}; + +static __init int rb_write_something(struct rb_test_data *data, bool nested) +{ +	struct ring_buffer_event *event; +	struct rb_item *item; +	bool started; +	int event_len; +	int size; +	int len; +	int cnt; + +	/* Have nested writes different that what is written */ +	cnt = data->cnt + (nested ? 27 : 0); + +	/* Multiply cnt by ~e, to make some unique increment */ +	size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + +	len = size + sizeof(struct rb_item); + +	started = rb_test_started; +	/* read rb_test_started before checking buffer enabled */ +	smp_rmb(); + +	event = ring_buffer_lock_reserve(data->buffer, len); +	if (!event) { +		/* Ignore dropped events before test starts. 
*/ +		if (started) { +			if (nested) +				data->bytes_dropped += len; +			else +				data->bytes_dropped_nested += len; +		} +		return len; +	} + +	event_len = ring_buffer_event_length(event); + +	if (RB_WARN_ON(data->buffer, event_len < len)) +		goto out; + +	item = ring_buffer_event_data(event); +	item->size = size; +	memcpy(item->str, rb_string, size); + +	if (nested) { +		data->bytes_alloc_nested += event_len; +		data->bytes_written_nested += len; +		data->events_nested++; +		if (!data->min_size_nested || len < data->min_size_nested) +			data->min_size_nested = len; +		if (len > data->max_size_nested) +			data->max_size_nested = len; +	} else { +		data->bytes_alloc += event_len; +		data->bytes_written += len; +		data->events++; +		if (!data->min_size || len < data->min_size) +			data->max_size = len; +		if (len > data->max_size) +			data->max_size = len; +	} + + out: +	ring_buffer_unlock_commit(data->buffer, event); + +	return 0; +} + +static __init int rb_test(void *arg) +{ +	struct rb_test_data *data = arg; + +	while (!kthread_should_stop()) { +		rb_write_something(data, false); +		data->cnt++; + +		set_current_state(TASK_INTERRUPTIBLE); +		/* Now sleep between a min of 100-300us and a max of 1ms */ +		usleep_range(((data->cnt % 3) + 1) * 100, 1000); +	} + +	return 0; +} + +static __init void rb_ipi(void *ignore) +{ +	struct rb_test_data *data; +	int cpu = smp_processor_id(); + +	data = &rb_data[cpu]; +	rb_write_something(data, true); +} + +static __init int rb_hammer_test(void *arg) +{ +	while (!kthread_should_stop()) { + +		/* Send an IPI to all cpus to write data! */ +		smp_call_function(rb_ipi, NULL, 1); +		/* No sleep, but for non preempt, let others run */ +		schedule(); +	} + +	return 0; +} + +static __init int test_ringbuffer(void) +{ +	struct task_struct *rb_hammer; +	struct ring_buffer *buffer; +	int cpu; +	int ret = 0; + +	pr_info("Running ring buffer tests...\n"); + +	buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); +	if (WARN_ON(!buffer)) +		return 0; + +	/* Disable buffer so that threads can't write to it yet */ +	ring_buffer_record_off(buffer); + +	for_each_online_cpu(cpu) { +		rb_data[cpu].buffer = buffer; +		rb_data[cpu].cpu = cpu; +		rb_data[cpu].cnt = cpu; +		rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], +						 "rbtester/%d", cpu); +		if (WARN_ON(!rb_threads[cpu])) { +			pr_cont("FAILED\n"); +			ret = -1; +			goto out_free; +		} + +		kthread_bind(rb_threads[cpu], cpu); + 		wake_up_process(rb_threads[cpu]); +	} + +	/* Now create the rb hammer! */ +	rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); +	if (WARN_ON(!rb_hammer)) { +		pr_cont("FAILED\n"); +		ret = -1; +		goto out_free; +	} + +	ring_buffer_record_on(buffer); +	/* +	 * Show buffer is enabled before setting rb_test_started. +	 * Yes there's a small race window where events could be +	 * dropped and the thread wont catch it. But when a ring +	 * buffer gets enabled, there will always be some kind of +	 * delay before other CPUs see it. Thus, we don't care about +	 * those dropped events. We care about events dropped after +	 * the threads see that the buffer is active. +	 */ +	smp_wmb(); +	rb_test_started = true; + +	set_current_state(TASK_INTERRUPTIBLE); +	/* Just run for 10 seconds */; +	schedule_timeout(10 * HZ); + +	kthread_stop(rb_hammer); + + out_free: +	for_each_online_cpu(cpu) { +		if (!rb_threads[cpu]) +			break; +		kthread_stop(rb_threads[cpu]); +	} +	if (ret) { +		ring_buffer_free(buffer); +		return ret; +	} + +	/* Report! 
*/ +	pr_info("finished\n"); +	for_each_online_cpu(cpu) { +		struct ring_buffer_event *event; +		struct rb_test_data *data = &rb_data[cpu]; +		struct rb_item *item; +		unsigned long total_events; +		unsigned long total_dropped; +		unsigned long total_written; +		unsigned long total_alloc; +		unsigned long total_read = 0; +		unsigned long total_size = 0; +		unsigned long total_len = 0; +		unsigned long total_lost = 0; +		unsigned long lost; +		int big_event_size; +		int small_event_size; + +		ret = -1; + +		total_events = data->events + data->events_nested; +		total_written = data->bytes_written + data->bytes_written_nested; +		total_alloc = data->bytes_alloc + data->bytes_alloc_nested; +		total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + +		big_event_size = data->max_size + data->max_size_nested; +		small_event_size = data->min_size + data->min_size_nested; + +		pr_info("CPU %d:\n", cpu); +		pr_info("              events:    %ld\n", total_events); +		pr_info("       dropped bytes:    %ld\n", total_dropped); +		pr_info("       alloced bytes:    %ld\n", total_alloc); +		pr_info("       written bytes:    %ld\n", total_written); +		pr_info("       biggest event:    %d\n", big_event_size); +		pr_info("      smallest event:    %d\n", small_event_size); + +		if (RB_WARN_ON(buffer, total_dropped)) +			break; + +		ret = 0; + +		while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { +			total_lost += lost; +			item = ring_buffer_event_data(event); +			total_len += ring_buffer_event_length(event); +			total_size += item->size + sizeof(struct rb_item); +			if (memcmp(&item->str[0], rb_string, item->size) != 0) { +				pr_info("FAILED!\n"); +				pr_info("buffer had: %.*s\n", item->size, item->str); +				pr_info("expected:   %.*s\n", item->size, rb_string); +				RB_WARN_ON(buffer, 1); +				ret = -1; +				break; +			} +			total_read++; +		} +		if (ret) +			break; + +		ret = -1; + +		pr_info("         read events:   %ld\n", total_read); +		pr_info("         lost events:   %ld\n", total_lost); +		pr_info("        total events:   %ld\n", total_lost + total_read); +		pr_info("  recorded len bytes:   %ld\n", total_len); +		pr_info(" recorded size bytes:   %ld\n", total_size); +		if (total_lost) +			pr_info(" With dropped events, record len and size may not match\n" +				" alloced and written from above\n"); +		if (!total_lost) { +			if (RB_WARN_ON(buffer, total_len != total_alloc || +				       total_size != total_written)) +				break; +		} +		if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) +			break; + +		ret = 0; +	} +	if (!ret) +		pr_info("Ring buffer PASSED!\n"); + +	ring_buffer_free(buffer); +	return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 302f8a61463..0434ff1b808 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -40,8 +40,8 @@ static int write_iteration = 50;  module_param(write_iteration, uint, 0644);  MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); -static int producer_nice = 19; -static int consumer_nice = 19; +static int producer_nice = MAX_NICE; +static int consumer_nice = MAX_NICE;  static int producer_fifo = -1;  static int consumer_fifo = -1; @@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)  	int inc;  	int i; -	bpage = ring_buffer_alloc_read_page(buffer); +	bpage = ring_buffer_alloc_read_page(buffer, cpu);  	if (!bpage)  		
return EVENT_DROPPED; @@ -308,7 +308,7 @@ static void ring_buffer_producer(void)  	/* Let the user know that the test is running at low priority */  	if (producer_fifo < 0 && consumer_fifo < 0 && -	    producer_nice == 19 && consumer_nice == 19) +	    producer_nice == MAX_NICE && consumer_nice == MAX_NICE)  		trace_printk("WARNING!!! This test is running at lowest priority.\n");  	trace_printk("Time:     %lld (usecs)\n", time); diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c new file mode 100644 index 00000000000..4b3b5eaf94d --- /dev/null +++ b/kernel/trace/rpm-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/usb.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rpm.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82d9b8106cd..291397e6666 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,7 +1,7 @@  /*   * ring buffer based function tracer   * - * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>   * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>   *   * Originally taken from the RT patch by: @@ -9,7 +9,7 @@   *   * Based on code from the latency_tracer, that is:   *  Copyright (C) 2004-2006 Ingo Molnar - *  Copyright (C) 2004 William Lee Irwin III + *  Copyright (C) 2004 Nadia Yvette Chambers   */  #include <linux/ring_buffer.h>  #include <generated/utsrelease.h> @@ -17,7 +17,6 @@  #include <linux/writeback.h>  #include <linux/kallsyms.h>  #include <linux/seq_file.h> -#include <linux/smp_lock.h>  #include <linux/notifier.h>  #include <linux/irqflags.h>  #include <linux/debugfs.h> @@ -37,18 +36,18 @@  #include <linux/ctype.h>  #include <linux/init.h>  #include <linux/poll.h> +#include <linux/nmi.h>  #include <linux/fs.h> +#include <linux/sched/rt.h>  #include "trace.h"  #include "trace_output.h" -#define TRACE_BUFFER_FLAGS	(RB_FL_OVERWRITE) -  /*   * On boot up, the ring buffer is set to the minimum size, so that   * we do not waste memory on systems that are not using tracing.   */ -int ring_buffer_expanded; +bool ring_buffer_expanded;  /*   * We need to change this state when a selftest is running. @@ -74,12 +73,20 @@ static struct tracer_flags dummy_tracer_flags = {  	.opts = dummy_tracer_opt  }; -static int dummy_set_flag(u32 old_flags, u32 bit, int set) +static int +dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  {  	return 0;  }  /* + * To prevent the comm cache from being overwritten when no + * tracing is active, only save the comm when a trace event + * occurred. + */ +static DEFINE_PER_CPU(bool, trace_cmdline_save); + +/*   * Kill all tracing for good (never come back).   * It is initialized to 1 but will turn to zero if the initialization   * of the tracer is successful. 
But that is the only place that sets @@ -89,18 +96,6 @@ static int tracing_disabled = 1;  DEFINE_PER_CPU(int, ftrace_cpu_disabled); -static inline void ftrace_disable_cpu(void) -{ -	preempt_disable(); -	__this_cpu_inc(ftrace_cpu_disabled); -} - -static inline void ftrace_enable_cpu(void) -{ -	__this_cpu_dec(ftrace_cpu_disabled); -	preempt_enable(); -} -  cpumask_var_t __read_mostly	tracing_buffer_mask;  /* @@ -121,18 +116,23 @@ cpumask_var_t __read_mostly	tracing_buffer_mask;  enum ftrace_dump_mode ftrace_dump_on_oops; -static int tracing_set_tracer(const char *buf); +/* When set, tracing will stop when a WARN*() is hit */ +int __disable_trace_on_warning; + +static int tracing_set_tracer(struct trace_array *tr, const char *buf);  #define MAX_TRACER_SIZE		100  static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;  static char *default_bootup_tracer; +static bool allocate_snapshot; +  static int __init set_cmdline_ftrace(char *str)  { -	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); +	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);  	default_bootup_tracer = bootup_tracer_buf;  	/* We are using ftrace early, expand it */ -	ring_buffer_expanded = 1; +	ring_buffer_expanded = true;  	return 1;  }  __setup("ftrace=", set_cmdline_ftrace); @@ -153,6 +153,46 @@ static int __init set_ftrace_dump_on_oops(char *str)  }  __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init stop_trace_on_warning(char *str) +{ +	__disable_trace_on_warning = 1; +	return 1; +} +__setup("traceoff_on_warning=", stop_trace_on_warning); + +static int __init boot_alloc_snapshot(char *str) +{ +	allocate_snapshot = true; +	/* We also need the main ring buffer expanded */ +	ring_buffer_expanded = true; +	return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + + +static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_options __initdata; + +static int __init set_trace_boot_options(char *str) +{ +	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); +	trace_boot_options = trace_boot_options_buf; +	return 0; +} +__setup("trace_options=", set_trace_boot_options); + +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ +	strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); +	trace_boot_clock = trace_boot_clock_buf; +	return 0; +} +__setup("trace_clock=", set_trace_boot_clock); + +  unsigned long long ns2usecs(cycle_t nsec)  {  	nsec += 500; @@ -174,58 +214,104 @@ unsigned long long ns2usecs(cycle_t nsec)   */  static struct trace_array	global_trace; -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +LIST_HEAD(ftrace_trace_arrays); -int filter_current_check_discard(struct ring_buffer *buffer, -				 struct ftrace_event_call *call, void *rec, -				 struct ring_buffer_event *event) +int trace_array_get(struct trace_array *this_tr)  { -	return filter_check_discard(call, rec, buffer, event); +	struct trace_array *tr; +	int ret = -ENODEV; + +	mutex_lock(&trace_types_lock); +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr == this_tr) { +			tr->ref++; +			ret = 0; +			break; +		} +	} +	mutex_unlock(&trace_types_lock); + +	return ret;  } -EXPORT_SYMBOL_GPL(filter_current_check_discard); -cycle_t ftrace_now(int cpu) +static void __trace_array_put(struct trace_array *this_tr) +{ +	WARN_ON(!this_tr->ref); +	this_tr->ref--; +} + +void trace_array_put(struct trace_array *this_tr) +{ +	mutex_lock(&trace_types_lock); +	
__trace_array_put(this_tr); +	mutex_unlock(&trace_types_lock); +} + +int filter_check_discard(struct ftrace_event_file *file, void *rec, +			 struct ring_buffer *buffer, +			 struct ring_buffer_event *event) +{ +	if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && +	    !filter_match_preds(file->filter, rec)) { +		ring_buffer_discard_commit(buffer, event); +		return 1; +	} + +	return 0; +} +EXPORT_SYMBOL_GPL(filter_check_discard); + +int call_filter_check_discard(struct ftrace_event_call *call, void *rec, +			      struct ring_buffer *buffer, +			      struct ring_buffer_event *event) +{ +	if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && +	    !filter_match_preds(call->filter, rec)) { +		ring_buffer_discard_commit(buffer, event); +		return 1; +	} + +	return 0; +} +EXPORT_SYMBOL_GPL(call_filter_check_discard); + +static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)  {  	u64 ts;  	/* Early boot up does not have a buffer yet */ -	if (!global_trace.buffer) +	if (!buf->buffer)  		return trace_clock_local(); -	ts = ring_buffer_time_stamp(global_trace.buffer, cpu); -	ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); +	ts = ring_buffer_time_stamp(buf->buffer, cpu); +	ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);  	return ts;  } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array	max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); - -/* tracer_enabled is used to toggle activation of a tracer */ -static int			tracer_enabled = 1; +cycle_t ftrace_now(int cpu) +{ +	return buffer_ftrace_now(&global_trace.trace_buffer, cpu); +}  /** - * tracing_is_enabled - return tracer_enabled status + * tracing_is_enabled - Show if global_trace has been disabled   * - * This function is used by other tracers to know the status - * of the tracer_enabled flag.  Tracers may use this function - * to know if it should enable their features when starting - * up. See irqsoff tracer for an example (start_irqsoff_tracer). + * Shows if the global trace has been enabled or not. It uses the + * mirror flag "buffer_disabled" to be used in fast paths such as for + * the irqsoff tracer. But it may be inaccurate due to races. If you + * need to know the accurate state, use tracing_is_on() which is a little + * slower, but accurate.   */  int tracing_is_enabled(void)  { -	return tracer_enabled; +	/* +	 * For quick access (irqsoff uses this in fast path), just +	 * return the mirror variable of the state of the ring buffer. +	 * It's a little racy, but we don't really care. +	 */ +	smp_rmb(); +	return !global_trace.buffer_disabled;  }  /* @@ -245,13 +331,10 @@ static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;  /* trace_types holds a link list of available tracers. */  static struct tracer		*trace_types __read_mostly; -/* current_trace points to the tracer that is currently active */ -static struct tracer		*current_trace __read_mostly; -  /*   * trace_types_lock is used to protect the trace_types list.   
*/ -static DEFINE_MUTEX(trace_types_lock); +DEFINE_MUTEX(trace_types_lock);  /*   * serialize the access of the ring buffer @@ -281,13 +364,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);  static inline void trace_access_lock(int cpu)  { -	if (cpu == TRACE_PIPE_ALL_CPU) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		/* gain it for accessing the whole ring buffer. */  		down_write(&all_cpu_access_lock);  	} else {  		/* gain it for accessing a cpu ring buffer. */ -		/* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ +		/* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */  		down_read(&all_cpu_access_lock);  		/* Secondly block other access to this @cpu ring buffer. */ @@ -297,7 +380,7 @@ static inline void trace_access_lock(int cpu)  static inline void trace_access_unlock(int cpu)  { -	if (cpu == TRACE_PIPE_ALL_CPU) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		up_write(&all_cpu_access_lock);  	} else {  		mutex_unlock(&per_cpu(cpu_access_lock, cpu)); @@ -335,39 +418,337 @@ static inline void trace_access_lock_init(void)  #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); -  /* trace_flags holds trace_options default values */  unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |  	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | -	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; +	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | +	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; -static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +static void tracer_tracing_on(struct trace_array *tr) +{ +	if (tr->trace_buffer.buffer) +		ring_buffer_record_on(tr->trace_buffer.buffer); +	/* +	 * This flag is looked at when buffers haven't been allocated +	 * yet, or by some tracers (like irqsoff), that just want to +	 * know if the ring buffer has been disabled, but it can handle +	 * races of where it gets disabled but we still do a record. +	 * As the check is in the fast path of the tracers, it is more +	 * important to be fast than accurate. +	 */ +	tr->buffer_disabled = 0; +	/* Make the flag seen by readers */ +	smp_wmb(); +}  /** - * trace_wake_up - wake up tasks waiting for trace input + * tracing_on - enable tracing buffers   * - * Simply wakes up any task that is blocked on the trace_wait - * queue. These is used with trace_poll for tasks polling the trace. + * This function enables tracing buffers that may have been + * disabled with tracing_off.   */ -void trace_wake_up(void) +void tracing_on(void)  { -	int cpu; +	tracer_tracing_on(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_on); -	if (trace_flags & TRACE_ITER_BLOCK) +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip:	   The address of the caller + * @str:   The constant string to write + * @size:  The size of the string. 
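 *
 * (Editor's aside -- an illustrative note, not part of this commit:
 * this is the out-of-line helper behind the trace_puts() convenience
 * macro, so callers normally write something like
 *
 *	trace_puts("hit the slow path\n");
 *
 * rather than calling __trace_puts() directly; the macro is expected to
 * pick __trace_bputs() for built-in constant strings and fall back to
 * this function otherwise. The return value is the number of bytes
 * written, or 0 if no event could be reserved.)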
+ */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct print_entry *entry; +	unsigned long irq_flags; +	int alloc; +	int pc; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	pc = preempt_count(); + +	if (unlikely(tracing_selftest_running || tracing_disabled)) +		return 0; + +	alloc = sizeof(*entry) + size + 2; /* possible \n added */ + +	local_save_flags(irq_flags); +	buffer = global_trace.trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,  +					  irq_flags, pc); +	if (!event) +		return 0; + +	entry = ring_buffer_event_data(event); +	entry->ip = ip; + +	memcpy(&entry->buf, str, size); + +	/* Add a newline if necessary */ +	if (entry->buf[size - 1] != '\n') { +		entry->buf[size] = '\n'; +		entry->buf[size + 1] = '\0'; +	} else +		entry->buf[size] = '\0'; + +	__buffer_unlock_commit(buffer, event); +	ftrace_trace_stack(buffer, irq_flags, 4, pc); + +	return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip:	   The address of the caller + * @str:   The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct bputs_entry *entry; +	unsigned long irq_flags; +	int size = sizeof(struct bputs_entry); +	int pc; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	pc = preempt_count(); + +	if (unlikely(tracing_selftest_running || tracing_disabled)) +		return 0; + +	local_save_flags(irq_flags); +	buffer = global_trace.trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, +					  irq_flags, pc); +	if (!event) +		return 0; + +	entry = ring_buffer_event_data(event); +	entry->ip			= ip; +	entry->str			= str; + +	__buffer_unlock_commit(buffer, event); +	ftrace_trace_stack(buffer, irq_flags, 4, pc); + +	return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ +	struct trace_array *tr = &global_trace; +	struct tracer *tracer = tr->current_trace; +	unsigned long flags; + +	if (in_nmi()) { +		internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); +		internal_trace_puts("*** snapshot is being ignored        ***\n");  		return; +	} + +	if (!tr->allocated_snapshot) { +		internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); +		internal_trace_puts("*** stopping trace here!   
***\n"); +		tracing_off(); +		return; +	} + +	/* Note, snapshot can not be used when the tracer uses it */ +	if (tracer->use_max_tr) { +		internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); +		internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); +		return; +	} + +	local_irq_save(flags); +	update_max_tr(tr, current, smp_processor_id()); +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, +					struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ +	int ret; + +	if (!tr->allocated_snapshot) { + +		/* allocate spare buffer */ +		ret = resize_buffer_duplicate_size(&tr->max_buffer, +				   &tr->trace_buffer, RING_BUFFER_ALL_CPUS); +		if (ret < 0) +			return ret; + +		tr->allocated_snapshot = true; +	} + +	return 0; +} + +static void free_snapshot(struct trace_array *tr) +{ +	/* +	 * We don't free the ring buffer. instead, resize it because +	 * The max_tr ring buffer has some state (e.g. ring->clock) and +	 * we want preserve it. +	 */ +	ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); +	set_buffer_entries(&tr->max_buffer, 1); +	tracing_reset_online_cpus(&tr->max_buffer); +	tr->allocated_snapshot = false; +} + +/** + * tracing_alloc_snapshot - allocate snapshot buffer. + * + * This only allocates the snapshot buffer if it isn't already + * allocated - it doesn't also take a snapshot. + * + * This is meant to be used in cases where the snapshot buffer needs + * to be set up for events that can't sleep but need to be able to + * trigger a snapshot. + */ +int tracing_alloc_snapshot(void) +{ +	struct trace_array *tr = &global_trace; +	int ret; + +	ret = alloc_snapshot(tr); +	WARN_ON(ret < 0); + +	return ret; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. 
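 *
 * (Editor's aside -- an illustrative sketch, not part of this commit:
 * a hypothetical caller that wants to capture the trace around a rare
 * condition could call this once from a context that may sleep, or
 * split allocation and capture:
 *
 *	tracing_alloc_snapshot();	(early, where sleeping is allowed)
 *	...
 *	if (rare_condition)
 *		tracing_snapshot();	(later; refuses to run from NMI)
 *
 * and then read the captured data back through the "snapshot" file in
 * the tracing directory.)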
+ */ +void tracing_snapshot_alloc(void) +{ +	int ret; + +	ret = tracing_alloc_snapshot(); +	if (ret < 0) +		return; + +	tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#else +void tracing_snapshot(void) +{ +	WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +int tracing_alloc_snapshot(void) +{ +	WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); +	return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); +void tracing_snapshot_alloc(void) +{ +	/* Give warning */ +	tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#endif /* CONFIG_TRACER_SNAPSHOT */ + +static void tracer_tracing_off(struct trace_array *tr) +{ +	if (tr->trace_buffer.buffer) +		ring_buffer_record_off(tr->trace_buffer.buffer);  	/* -	 * The runqueue_is_locked() can fail, but this is the best we -	 * have for now: +	 * This flag is looked at when buffers haven't been allocated +	 * yet, or by some tracers (like irqsoff), that just want to +	 * know if the ring buffer has been disabled, but it can handle +	 * races of where it gets disabled but we still do a record. +	 * As the check is in the fast path of the tracers, it is more +	 * important to be fast than accurate.  	 */ -	cpu = get_cpu(); -	if (!runqueue_is_locked(cpu)) -		wake_up(&trace_wait); -	put_cpu(); +	tr->buffer_disabled = 1; +	/* Make the flag seen by readers */ +	smp_wmb();  } +/** + * tracing_off - turn off tracing buffers + * + * This function stops the tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ +	tracer_tracing_off(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_off); + +void disable_trace_on_warning(void) +{ +	if (__disable_trace_on_warning) +		tracing_off(); +} + +/** + * tracer_tracing_is_on - show real state of ring buffer enabled + * @tr : the trace array to know if ring buffer is enabled + * + * Shows real state of the ring buffer if it is enabled or not. + */ +static int tracer_tracing_is_on(struct trace_array *tr) +{ +	if (tr->trace_buffer.buffer) +		return ring_buffer_record_is_on(tr->trace_buffer.buffer); +	return !tr->buffer_disabled; +} + +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ +	return tracer_tracing_is_on(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_is_on); +  static int __init set_buf_size(char *str)  {  	unsigned long buf_size; @@ -385,15 +766,15 @@ __setup("trace_buf_size=", set_buf_size);  static int __init set_tracing_thresh(char *str)  { -	unsigned long threshhold; +	unsigned long threshold;  	int ret;  	if (!str)  		return 0; -	ret = strict_strtoul(str, 0, &threshhold); +	ret = kstrtoul(str, 0, &threshold);  	if (ret < 0)  		return 0; -	tracing_thresh = threshhold * 1000; +	tracing_thresh = threshold * 1000;  	return 1;  }  __setup("tracing_thresh=", set_tracing_thresh); @@ -426,19 +807,27 @@ static const char *trace_options[] = {  	"sleep-time",  	"graph-time",  	"record-cmd", +	"overwrite", +	"disable_on_free", +	"irq-info", +	"markers", +	"function-trace",  	NULL  };  static struct {  	u64 (*func)(void);  	const char *name; +	int in_ns;		/* is this clock in nanoseconds? 
*/  } trace_clocks[] = { -	{ trace_clock_local,	"local" }, -	{ trace_clock_global,	"global" }, +	{ trace_clock_local,	"local",	1 }, +	{ trace_clock_global,	"global",	1 }, +	{ trace_clock_counter,	"counter",	0 }, +	{ trace_clock_jiffies,	"uptime",	0 }, +	{ trace_clock,		"perf",		1 }, +	ARCH_TRACE_CLOCKS  }; -int trace_clock_id; -  /*   * trace_parser_get_init - gets the buffer for trace parser   */ @@ -533,9 +922,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,  	if (isspace(ch)) {  		parser->buffer[parser->idx] = 0;  		parser->cont = false; -	} else { +	} else if (parser->idx < parser->size - 1) {  		parser->cont = true;  		parser->buffer[parser->idx++] = ch; +	} else { +		ret = -EINVAL; +		goto out;  	}  	*ppos += read; @@ -572,7 +964,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)  static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)  {  	int len; -	void *ret;  	if (s->len <= s->readpos)  		return -EBUSY; @@ -580,35 +971,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)  	len = s->len - s->readpos;  	if (cnt > len)  		cnt = len; -	ret = memcpy(buf, s->buffer + s->readpos, cnt); -	if (!ret) -		return -EFAULT; +	memcpy(buf, s->buffer + s->readpos, cnt);  	s->readpos += cnt;  	return cnt;  } -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. - */ -static arch_spinlock_t ftrace_max_lock = -	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -  unsigned long __read_mostly	tracing_thresh;  #ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly	tracing_max_latency; -  /*   * Copy the new maximum trace into the separate maximum-trace   * structure. (this way the maximum trace is permanently saved, @@ -617,20 +988,29 @@ unsigned long __read_mostly	tracing_max_latency;  static void  __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct trace_array_cpu *data = tr->data[cpu]; -	struct trace_array_cpu *max_data; +	struct trace_buffer *trace_buf = &tr->trace_buffer; +	struct trace_buffer *max_buf = &tr->max_buffer; +	struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); +	struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); -	max_tr.cpu = cpu; -	max_tr.time_start = data->preempt_timestamp; +	max_buf->cpu = cpu; +	max_buf->time_start = data->preempt_timestamp; -	max_data = max_tr.data[cpu]; -	max_data->saved_latency = tracing_max_latency; +	max_data->saved_latency = tr->max_latency;  	max_data->critical_start = data->critical_start;  	max_data->critical_end = data->critical_end;  	memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);  	max_data->pid = tsk->pid; -	max_data->uid = task_uid(tsk); +	/* +	 * If tsk == current, then use current_uid(), as that does not use +	 * RCU. The irq tracer can be called out of RCU scope. 
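	 * (Editor's note, not part of the original comment: current_uid()
	 * goes through current_cred(), which needs no rcu_read_lock(),
	 * whereas task_uid() on an arbitrary task dereferences that task's
	 * credentials under RCU -- hence the tsk == current special case
	 * below.)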
+	 */ +	if (tsk == current) +		max_data->uid = current_uid(); +	else +		max_data->uid = task_uid(tsk); +  	max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;  	max_data->policy = tsk->policy;  	max_data->rt_priority = tsk->rt_priority; @@ -651,23 +1031,27 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  void  update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)  { -	struct ring_buffer *buf = tr->buffer; +	struct ring_buffer *buf; -	if (trace_stop_count) +	if (tr->stop_count)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (!current_trace->use_max_tr) { -		WARN_ON_ONCE(1); + +	if (!tr->allocated_snapshot) { +		/* Only the nop tracer should hit this when disabling */ +		WARN_ON_ONCE(tr->current_trace != &nop_trace);  		return;  	} -	arch_spin_lock(&ftrace_max_lock); -	tr->buffer = max_tr.buffer; -	max_tr.buffer = buf; +	arch_spin_lock(&tr->max_lock); + +	buf = tr->trace_buffer.buffer; +	tr->trace_buffer.buffer = tr->max_buffer.buffer; +	tr->max_buffer.buffer = buf;  	__update_max_tr(tr, tsk, cpu); -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&tr->max_lock);  }  /** @@ -683,20 +1067,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  {  	int ret; -	if (trace_stop_count) +	if (tr->stop_count)  		return;  	WARN_ON_ONCE(!irqs_disabled()); -	if (!current_trace->use_max_tr) { -		WARN_ON_ONCE(1); +	if (!tr->allocated_snapshot) { +		/* Only the nop tracer should hit this when disabling */ +		WARN_ON_ONCE(tr->current_trace != &nop_trace);  		return;  	} -	arch_spin_lock(&ftrace_max_lock); - -	ftrace_disable_cpu(); +	arch_spin_lock(&tr->max_lock); -	ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); +	ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);  	if (ret == -EBUSY) {  		/* @@ -705,19 +1088,92 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)  		 * the max trace buffer (no one writes directly to it)  		 * and flag that it failed.  		 */ -		trace_array_printk(&max_tr, _THIS_IP_, +		trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,  			"Failed to swap buffers due to commit in progress\n");  	} -	ftrace_enable_cpu(); -  	WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);  	__update_max_tr(tr, tsk, cpu); -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&tr->max_lock);  }  #endif /* CONFIG_TRACER_MAX_TRACE */ +static int wait_on_pipe(struct trace_iterator *iter) +{ +	/* Iterators are static, they should be filled or empty */ +	if (trace_buffer_iter(iter, iter->cpu_file)) +		return 0; + +	return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ +	struct trace_array *tr = &global_trace; +	struct tracer *saved_tracer = tr->current_trace; +	int ret; + +	if (!type->selftest || tracing_selftest_disabled) +		return 0; + +	/* +	 * Run a selftest on this tracer. +	 * Here we reset the trace buffer, and set the current +	 * tracer to be this tracer. The tracer can then run some +	 * internal tracing to verify that everything is in order. +	 * If we fail, we do not register this tracer. 
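	 *
	 * (Editor's aside -- an illustrative sketch, not part of this
	 * commit: the hook exercised here is the ->selftest() member of
	 * struct tracer; a hypothetical tracer would wire it up roughly as
	 *
	 *	static struct tracer my_tracer __read_mostly = {
	 *		.name		= "my_tracer",
	 *		.init		= my_tracer_init,
	 *		.reset		= my_tracer_reset,
	 *	#ifdef CONFIG_FTRACE_STARTUP_TEST
	 *		.selftest	= trace_selftest_startup_my_tracer,
	 *	#endif
	 *	};
	 *
	 * before handing it to register_tracer(), which calls into this
	 * function.)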
+	 */ +	tracing_reset_online_cpus(&tr->trace_buffer); + +	tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (type->use_max_tr) { +		/* If we expanded the buffers, make sure the max is expanded too */ +		if (ring_buffer_expanded) +			ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, +					   RING_BUFFER_ALL_CPUS); +		tr->allocated_snapshot = true; +	} +#endif + +	/* the test is responsible for initializing and enabling */ +	pr_info("Testing tracer %s: ", type->name); +	ret = type->selftest(type, tr); +	/* the test is responsible for resetting too */ +	tr->current_trace = saved_tracer; +	if (ret) { +		printk(KERN_CONT "FAILED!\n"); +		/* Add the warning after printing 'FAILED' */ +		WARN_ON(1); +		return -1; +	} +	/* Only reset on passing, to avoid touching corrupted buffers */ +	tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (type->use_max_tr) { +		tr->allocated_snapshot = false; + +		/* Shrink the max buffer again */ +		if (ring_buffer_expanded) +			ring_buffer_resize(tr->max_buffer.buffer, 1, +					   RING_BUFFER_ALL_CPUS); +	} +#endif + +	printk(KERN_CONT "PASSED\n"); +	return 0; +} +#else +static inline int run_tracer_selftest(struct tracer *type) +{ +	return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */ +  /**   * register_tracer - register a tracer with the ftrace system.   * @type - the plugin for the tracer @@ -725,8 +1181,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)   * Register a new plugin tracer.   */  int register_tracer(struct tracer *type) -__releases(kernel_lock) -__acquires(kernel_lock)  {  	struct tracer *t;  	int ret = 0; @@ -762,40 +1216,10 @@ __acquires(kernel_lock)  	else  		if (!type->flags->opts)  			type->flags->opts = dummy_tracer_opt; -	if (!type->wait_pipe) -		type->wait_pipe = default_wait_pipe; - - -#ifdef CONFIG_FTRACE_STARTUP_TEST -	if (type->selftest && !tracing_selftest_disabled) { -		struct tracer *saved_tracer = current_trace; -		struct trace_array *tr = &global_trace; -		/* -		 * Run a selftest on this tracer. -		 * Here we reset the trace buffer, and set the current -		 * tracer to be this tracer. The tracer can then run some -		 * internal tracing to verify that everything is in order. -		 * If we fail, we do not register this tracer. -		 */ -		tracing_reset_online_cpus(tr); - -		current_trace = type; -		/* the test is responsible for initializing and enabling */ -		pr_info("Testing tracer %s: ", type->name); -		ret = type->selftest(type, tr); -		/* the test is responsible for resetting too */ -		current_trace = saved_tracer; -		if (ret) { -			printk(KERN_CONT "FAILED!\n"); -			goto out; -		} -		/* Only reset on passing, to avoid touching corrupted buffers */ -		tracing_reset_online_cpus(tr); - -		printk(KERN_CONT "PASSED\n"); -	} -#endif +	ret = run_tracer_selftest(type); +	if (ret < 0) +		goto out;  	type->next = trace_types;  	trace_types = type; @@ -812,10 +1236,10 @@ __acquires(kernel_lock)  	printk(KERN_INFO "Starting tracer '%s'\n", type->name);  	/* Do we want this tracer to start on bootup? */ -	tracing_set_tracer(type->name); +	tracing_set_tracer(&global_trace, type->name);  	default_bootup_tracer = NULL;  	/* disable other selftests, since this will break it. 
*/ -	tracing_selftest_disabled = 1; +	tracing_selftest_disabled = true;  #ifdef CONFIG_FTRACE_STARTUP_TEST  	printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",  	       type->name); @@ -825,116 +1249,126 @@ __acquires(kernel_lock)  	return ret;  } -void unregister_tracer(struct tracer *type) +void tracing_reset(struct trace_buffer *buf, int cpu)  { -	struct tracer **t; +	struct ring_buffer *buffer = buf->buffer; -	mutex_lock(&trace_types_lock); -	for (t = &trace_types; *t; t = &(*t)->next) { -		if (*t == type) -			goto found; -	} -	pr_info("Tracer %s not registered\n", type->name); -	goto out; - - found: -	*t = (*t)->next; - -	if (type == current_trace && tracer_enabled) { -		tracer_enabled = 0; -		tracing_stop(); -		if (current_trace->stop) -			current_trace->stop(&global_trace); -		current_trace = &nop_trace; -	} -out: -	mutex_unlock(&trace_types_lock); -} - -static void __tracing_reset(struct ring_buffer *buffer, int cpu) -{ -	ftrace_disable_cpu(); -	ring_buffer_reset_cpu(buffer, cpu); -	ftrace_enable_cpu(); -} - -void tracing_reset(struct trace_array *tr, int cpu) -{ -	struct ring_buffer *buffer = tr->buffer; +	if (!buffer) +		return;  	ring_buffer_record_disable(buffer);  	/* Make sure all commits have finished */  	synchronize_sched(); -	__tracing_reset(buffer, cpu); +	ring_buffer_reset_cpu(buffer, cpu);  	ring_buffer_record_enable(buffer);  } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf)  { -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = buf->buffer;  	int cpu; +	if (!buffer) +		return; +  	ring_buffer_record_disable(buffer);  	/* Make sure all commits have finished */  	synchronize_sched(); -	tr->time_start = ftrace_now(tr->cpu); +	buf->time_start = buffer_ftrace_now(buf, buf->cpu);  	for_each_online_cpu(cpu) -		__tracing_reset(buffer, cpu); +		ring_buffer_reset_cpu(buffer, cpu);  	ring_buffer_record_enable(buffer);  } -void tracing_reset_current(int cpu) +/* Must have trace_types_lock held */ +void tracing_reset_all_online_cpus(void)  { -	tracing_reset(&global_trace, cpu); -} +	struct trace_array *tr; -void tracing_reset_current_online_cpus(void) -{ -	tracing_reset_online_cpus(&global_trace); +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE +		tracing_reset_online_cpus(&tr->max_buffer); +#endif +	}  } -#define SAVED_CMDLINES 128 +#define SAVED_CMDLINES_DEFAULT 128  #define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx;  static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { +	unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +	unsigned *map_cmdline_to_pid; +	unsigned cmdline_num; +	int cmdline_idx; +	char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd;  /* temporary disable recording */  static atomic_t trace_record_cmdline_disabled __read_mostly; -static void trace_init_cmdlines(void) +static inline char *get_saved_cmdlines(int idx)  { -	memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); -	memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); -	cmdline_idx = 0; +	return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];  } -int is_tracing_stopped(void) +static inline void set_cmdline(int idx, const char 
*cmdline)  { -	return trace_stop_count; +	memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);  } -/** - * ftrace_off_permanent - disable all ftrace code permanently - * - * This should only be called when a serious anomally has - * been detected.  This will turn off the function tracing, - * ring buffers, and other tracing utilites. It takes no - * locks and can be called from any context. - */ -void ftrace_off_permanent(void) +static int allocate_cmdlines_buffer(unsigned int val, +				    struct saved_cmdlines_buffer *s)  { -	tracing_disabled = 1; -	ftrace_stop(); -	tracing_off_permanent(); +	s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), +					GFP_KERNEL); +	if (!s->map_cmdline_to_pid) +		return -ENOMEM; + +	s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); +	if (!s->saved_cmdlines) { +		kfree(s->map_cmdline_to_pid); +		return -ENOMEM; +	} + +	s->cmdline_idx = 0; +	s->cmdline_num = val; +	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, +	       sizeof(s->map_pid_to_cmdline)); +	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, +	       val * sizeof(*s->map_cmdline_to_pid)); + +	return 0; +} + +static int trace_create_savedcmd(void) +{ +	int ret; + +	savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); +	if (!savedcmd) +		return -ENOMEM; + +	ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); +	if (ret < 0) { +		kfree(savedcmd); +		savedcmd = NULL; +		return -ENOMEM; +	} + +	return 0; +} + +int is_tracing_stopped(void) +{ +	return global_trace.stop_count;  }  /** @@ -951,32 +1385,64 @@ void tracing_start(void)  	if (tracing_disabled)  		return; -	spin_lock_irqsave(&tracing_start_lock, flags); -	if (--trace_stop_count) { -		if (trace_stop_count < 0) { +	raw_spin_lock_irqsave(&global_trace.start_lock, flags); +	if (--global_trace.stop_count) { +		if (global_trace.stop_count < 0) {  			/* Someone screwed up their debugging */  			WARN_ON_ONCE(1); -			trace_stop_count = 0; +			global_trace.stop_count = 0;  		}  		goto out;  	}  	/* Prevent the buffers from switching */ -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&global_trace.max_lock); -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	if (buffer)  		ring_buffer_record_enable(buffer); -	buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	buffer = global_trace.max_buffer.buffer;  	if (buffer)  		ring_buffer_record_enable(buffer); +#endif -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&global_trace.max_lock); -	ftrace_start();   out: -	spin_unlock_irqrestore(&tracing_start_lock, flags); +	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ +	struct ring_buffer *buffer; +	unsigned long flags; + +	if (tracing_disabled) +		return; + +	/* If global, we need to also start the max tracer */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return tracing_start(); + +	raw_spin_lock_irqsave(&tr->start_lock, flags); + +	if (--tr->stop_count) { +		if (tr->stop_count < 0) { +			/* Someone screwed up their debugging */ +			WARN_ON_ONCE(1); +			tr->stop_count = 0; +		} +		goto out; +	} + +	buffer = tr->trace_buffer.buffer; +	if (buffer) +		ring_buffer_record_enable(buffer); + + out: +	raw_spin_unlock_irqrestore(&tr->start_lock, flags);  }  /** @@ -990,36 +1456,58 @@ void tracing_stop(void)  	struct ring_buffer *buffer;  	unsigned long flags; -	ftrace_stop(); -	spin_lock_irqsave(&tracing_start_lock, flags); -	if (trace_stop_count++) +	raw_spin_lock_irqsave(&global_trace.start_lock, flags); +	if 
(global_trace.stop_count++)  		goto out;  	/* Prevent the buffers from switching */ -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&global_trace.max_lock); -	buffer = global_trace.buffer; +	buffer = global_trace.trace_buffer.buffer;  	if (buffer)  		ring_buffer_record_disable(buffer); -	buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	buffer = global_trace.max_buffer.buffer;  	if (buffer)  		ring_buffer_record_disable(buffer); +#endif + +	arch_spin_unlock(&global_trace.max_lock); -	arch_spin_unlock(&ftrace_max_lock); + out: +	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ +	struct ring_buffer *buffer; +	unsigned long flags; + +	/* If global, we need to also stop the max tracer */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return tracing_stop(); + +	raw_spin_lock_irqsave(&tr->start_lock, flags); +	if (tr->stop_count++) +		goto out; + +	buffer = tr->trace_buffer.buffer; +	if (buffer) +		ring_buffer_record_disable(buffer);   out: -	spin_unlock_irqrestore(&tracing_start_lock, flags); +	raw_spin_unlock_irqrestore(&tr->start_lock, flags);  }  void trace_stop_cmdline_recording(void); -static void trace_save_cmdline(struct task_struct *tsk) +static int trace_save_cmdline(struct task_struct *tsk)  {  	unsigned pid, idx;  	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) -		return; +		return 0;  	/*  	 * It's not the end of the world if we don't get @@ -1028,11 +1516,11 @@ static void trace_save_cmdline(struct task_struct *tsk)  	 * so if we miss here, then better luck next time.  	 */  	if (!arch_spin_trylock(&trace_cmdline_lock)) -		return; +		return 0; -	idx = map_pid_to_cmdline[tsk->pid]; +	idx = savedcmd->map_pid_to_cmdline[tsk->pid];  	if (idx == NO_CMDLINE_MAP) { -		idx = (cmdline_idx + 1) % SAVED_CMDLINES; +		idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;  		/*  		 * Check whether the cmdline buffer at idx has a pid @@ -1040,22 +1528,24 @@ static void trace_save_cmdline(struct task_struct *tsk)  		 * need to clear the map_pid_to_cmdline. Otherwise we  		 * would read the new comm for the old pid.  		 
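		 * (Editor's aside, a concrete example of the two maps: if
		 * slot 5 previously cached the comm of pid 1234 and is now
		 * being reused for pid 5678, the stale
		 * map_pid_to_cmdline[1234] == 5 entry is cleared first, so a
		 * later lookup of pid 1234 reports "<...>" rather than
		 * pid 5678's comm.)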
*/ -		pid = map_cmdline_to_pid[idx]; +		pid = savedcmd->map_cmdline_to_pid[idx];  		if (pid != NO_CMDLINE_MAP) -			map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; +			savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; -		map_cmdline_to_pid[idx] = tsk->pid; -		map_pid_to_cmdline[tsk->pid] = idx; +		savedcmd->map_cmdline_to_pid[idx] = tsk->pid; +		savedcmd->map_pid_to_cmdline[tsk->pid] = idx; -		cmdline_idx = idx; +		savedcmd->cmdline_idx = idx;  	} -	memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); +	set_cmdline(idx, tsk->comm);  	arch_spin_unlock(&trace_cmdline_lock); + +	return 1;  } -void trace_find_cmdline(int pid, char comm[]) +static void __trace_find_cmdline(int pid, char comm[])  {  	unsigned map; @@ -1074,13 +1564,19 @@ void trace_find_cmdline(int pid, char comm[])  		return;  	} -	preempt_disable(); -	arch_spin_lock(&trace_cmdline_lock); -	map = map_pid_to_cmdline[pid]; +	map = savedcmd->map_pid_to_cmdline[pid];  	if (map != NO_CMDLINE_MAP) -		strcpy(comm, saved_cmdlines[map]); +		strcpy(comm, get_saved_cmdlines(map));  	else  		strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ +	preempt_disable(); +	arch_spin_lock(&trace_cmdline_lock); + +	__trace_find_cmdline(pid, comm);  	arch_spin_unlock(&trace_cmdline_lock);  	preempt_enable(); @@ -1088,11 +1584,14 @@ void trace_find_cmdline(int pid, char comm[])  void tracing_record_cmdline(struct task_struct *tsk)  { -	if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || -	    !tracing_is_on()) +	if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) +		return; + +	if (!__this_cpu_read(trace_cmdline_save))  		return; -	trace_save_cmdline(tsk); +	if (trace_save_cmdline(tsk)) +		__this_cpu_write(trace_cmdline_save, false);  }  void @@ -1103,7 +1602,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  	entry->preempt_count		= pc & 0xff;  	entry->pid			= (tsk) ? tsk->pid : 0; -	entry->lock_depth		= (tsk) ? tsk->lock_depth : 0;  	entry->flags =  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT  		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1112,7 +1610,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  #endif  		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |  		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | -		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); +		(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | +		(test_preempt_need_resched() ? 
TRACE_FLAG_PREEMPT_RESCHED : 0);  }  EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -1135,34 +1634,66 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,  	return event;  } +void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ +	__this_cpu_write(trace_cmdline_save, true); +	ring_buffer_unlock_commit(buffer, event); +} +  static inline void  __trace_buffer_unlock_commit(struct ring_buffer *buffer,  			     struct ring_buffer_event *event, -			     unsigned long flags, int pc, -			     int wake) +			     unsigned long flags, int pc)  { -	ring_buffer_unlock_commit(buffer, event); +	__buffer_unlock_commit(buffer, event);  	ftrace_trace_stack(buffer, flags, 6, pc);  	ftrace_trace_userstack(buffer, flags, pc); - -	if (wake) -		trace_wake_up();  }  void trace_buffer_unlock_commit(struct ring_buffer *buffer,  				struct ring_buffer_event *event,  				unsigned long flags, int pc)  { -	__trace_buffer_unlock_commit(buffer, event, flags, pc, 1); +	__trace_buffer_unlock_commit(buffer, event, flags, pc);  } +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); + +static struct ring_buffer *temp_buffer; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, +			  struct ftrace_event_file *ftrace_file, +			  int type, unsigned long len, +			  unsigned long flags, int pc) +{ +	struct ring_buffer_event *entry; + +	*current_rb = ftrace_file->tr->trace_buffer.buffer; +	entry = trace_buffer_lock_reserve(*current_rb, +					 type, len, flags, pc); +	/* +	 * If tracing is off, but we have triggers enabled +	 * we still need to look at the event data. Use the temp_buffer +	 * to store the trace event for the tigger to use. It's recusive +	 * safe and will not be recorded anywhere. +	 */ +	if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { +		*current_rb = temp_buffer; +		entry = trace_buffer_lock_reserve(*current_rb, +						  type, len, flags, pc); +	} +	return entry; +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);  struct ring_buffer_event *  trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,  				  int type, unsigned long len,  				  unsigned long flags, int pc)  { -	*current_rb = global_trace.buffer; +	*current_rb = global_trace.trace_buffer.buffer;  	return trace_buffer_lock_reserve(*current_rb,  					 type, len, flags, pc);  } @@ -1172,17 +1703,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,  					struct ring_buffer_event *event,  					unsigned long flags, int pc)  { -	__trace_buffer_unlock_commit(buffer, event, flags, pc, 1); +	__trace_buffer_unlock_commit(buffer, event, flags, pc);  }  EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); -void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, -				       struct ring_buffer_event *event, -				       unsigned long flags, int pc) +void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, +				     struct ring_buffer_event *event, +				     unsigned long flags, int pc, +				     struct pt_regs *regs)  { -	__trace_buffer_unlock_commit(buffer, event, flags, pc, 0); +	__buffer_unlock_commit(buffer, event); + +	ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); +	ftrace_trace_userstack(buffer, flags, pc);  } -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);  void trace_current_buffer_discard_commit(struct ring_buffer *buffer,  					 struct ring_buffer_event *event) @@ -1197,7 +1732,7 @@ trace_function(struct trace_array *tr,  	       int 
pc)  {  	struct ftrace_event_call *call = &event_function; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct ftrace_entry *entry; @@ -1213,44 +1748,108 @@ trace_function(struct trace_array *tr,  	entry->ip			= ip;  	entry->parent_ip		= parent_ip; -	if (!filter_check_discard(call, entry, buffer, event)) -		ring_buffer_unlock_commit(buffer, event); -} - -void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, -       unsigned long ip, unsigned long parent_ip, unsigned long flags, -       int pc) -{ -	if (likely(!atomic_read(&data->disabled))) -		trace_function(tr, ip, parent_ip, flags, pc); +	if (!call_filter_check_discard(call, entry, buffer, event)) +		__buffer_unlock_commit(buffer, event);  }  #ifdef CONFIG_STACKTRACE + +#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +struct ftrace_stack { +	unsigned long		calls[FTRACE_STACK_MAX_ENTRIES]; +}; + +static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(int, ftrace_stack_reserve); +  static void __ftrace_trace_stack(struct ring_buffer *buffer,  				 unsigned long flags, -				 int skip, int pc) +				 int skip, int pc, struct pt_regs *regs)  {  	struct ftrace_event_call *call = &event_kernel_stack;  	struct ring_buffer_event *event;  	struct stack_entry *entry;  	struct stack_trace trace; +	int use_stack; +	int size = FTRACE_STACK_ENTRIES; + +	trace.nr_entries	= 0; +	trace.skip		= skip; + +	/* +	 * Since events can happen in NMIs there's no safe way to +	 * use the per cpu ftrace_stacks. We reserve it and if an interrupt +	 * or NMI comes in, it will just have to use the default +	 * FTRACE_STACK_SIZE. +	 */ +	preempt_disable_notrace(); + +	use_stack = __this_cpu_inc_return(ftrace_stack_reserve); +	/* +	 * We don't need any atomic variables, just a barrier. +	 * If an interrupt comes in, we don't care, because it would +	 * have exited and put the counter back to what we want. +	 * We just need a barrier to keep gcc from moving things +	 * around. 
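	 * (Editor's aside: use_stack == 1 means this is the first user of
	 * the per-cpu ftrace_stack on this CPU, so it may fill the large
	 * FTRACE_STACK_MAX_ENTRIES array; a nested interrupt or NMI sees a
	 * higher count and falls back to the small FTRACE_STACK_ENTRIES
	 * array reserved directly inside the ring buffer event below.)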
+	 */ +	barrier(); +	if (use_stack == 1) { +		trace.entries		= this_cpu_ptr(ftrace_stack.calls); +		trace.max_entries	= FTRACE_STACK_MAX_ENTRIES; + +		if (regs) +			save_stack_trace_regs(regs, &trace); +		else +			save_stack_trace(&trace); + +		if (trace.nr_entries > size) +			size = trace.nr_entries; +	} else +		/* From now on, use_stack is a boolean */ +		use_stack = 0; + +	size *= sizeof(unsigned long);  	event = trace_buffer_lock_reserve(buffer, TRACE_STACK, -					  sizeof(*entry), flags, pc); +					  sizeof(*entry) + size, flags, pc);  	if (!event) -		return; -	entry	= ring_buffer_event_data(event); -	memset(&entry->caller, 0, sizeof(entry->caller)); +		goto out; +	entry = ring_buffer_event_data(event); -	trace.nr_entries	= 0; -	trace.max_entries	= FTRACE_STACK_ENTRIES; -	trace.skip		= skip; -	trace.entries		= entry->caller; +	memset(&entry->caller, 0, size); + +	if (use_stack) +		memcpy(&entry->caller, trace.entries, +		       trace.nr_entries * sizeof(unsigned long)); +	else { +		trace.max_entries	= FTRACE_STACK_ENTRIES; +		trace.entries		= entry->caller; +		if (regs) +			save_stack_trace_regs(regs, &trace); +		else +			save_stack_trace(&trace); +	} + +	entry->size = trace.nr_entries; + +	if (!call_filter_check_discard(call, entry, buffer, event)) +		__buffer_unlock_commit(buffer, event); + + out: +	/* Again, don't let gcc optimize things here */ +	barrier(); +	__this_cpu_dec(ftrace_stack_reserve); +	preempt_enable_notrace(); -	save_stack_trace(&trace); -	if (!filter_check_discard(call, entry, buffer, event)) -		ring_buffer_unlock_commit(buffer, event); +} + +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, +			     int skip, int pc, struct pt_regs *regs) +{ +	if (!(trace_flags & TRACE_ITER_STACKTRACE)) +		return; + +	__ftrace_trace_stack(buffer, flags, skip, pc, regs);  }  void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, @@ -1259,19 +1858,20 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,  	if (!(trace_flags & TRACE_ITER_STACKTRACE))  		return; -	__ftrace_trace_stack(buffer, flags, skip, pc); +	__ftrace_trace_stack(buffer, flags, skip, pc, NULL);  }  void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,  		   int pc)  { -	__ftrace_trace_stack(tr->buffer, flags, skip, pc); +	__ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);  }  /**   * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers)   */ -void trace_dump_stack(void) +void trace_dump_stack(int skip)  {  	unsigned long flags; @@ -1280,10 +1880,17 @@ void trace_dump_stack(void)  	local_save_flags(flags); -	/* skipping 3 traces, seems to get us at the caller of this function */ -	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); +	/* +	 * Skip 3 more, seems to get us at the caller of +	 * this function. +	 */ +	skip += 3; +	__ftrace_trace_stack(global_trace.trace_buffer.buffer, +			     flags, skip, preempt_count(), NULL);  } +static DEFINE_PER_CPU(int, user_stack_count); +  void  ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  { @@ -1302,10 +1909,20 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  	if (unlikely(in_nmi()))  		return; +	/* +	 * prevent recursion, since the user stack tracing may +	 * trigger other kernel events. 
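	 * (Editor's aside: the per-cpu user_stack_count below simply makes
	 * a nested invocation on this CPU bail out early, so an event
	 * raised while a user stack is already being captured cannot
	 * recurse into a second capture.)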
+	 */ +	preempt_disable(); +	if (__this_cpu_read(user_stack_count)) +		goto out; + +	__this_cpu_inc(user_stack_count); +  	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,  					  sizeof(*entry), flags, pc);  	if (!event) -		return; +		goto out_drop_count;  	entry	= ring_buffer_event_data(event);  	entry->tgid		= current->tgid; @@ -1317,8 +1934,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  	trace.entries		= entry->caller;  	save_stack_trace_user(&trace); -	if (!filter_check_discard(call, entry, buffer, event)) -		ring_buffer_unlock_commit(buffer, event); +	if (!call_filter_check_discard(call, entry, buffer, event)) +		__buffer_unlock_commit(buffer, event); + + out_drop_count: +	__this_cpu_dec(user_stack_count); + out: +	preempt_enable();  }  #ifdef UNUSED @@ -1330,25 +1952,161 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)  #endif /* CONFIG_STACKTRACE */ +/* created for use with alloc_percpu */ +struct trace_buffer_struct { +	char buffer[TRACE_BUF_SIZE]; +}; + +static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct *trace_percpu_sirq_buffer; +static struct trace_buffer_struct *trace_percpu_irq_buffer; +static struct trace_buffer_struct *trace_percpu_nmi_buffer; + +/* + * The buffer used is dependent on the context. There is a per cpu + * buffer for normal context, softirq contex, hard irq context and + * for NMI context. Thise allows for lockless recording. + * + * Note, if the buffers failed to be allocated, then this returns NULL + */ +static char *get_trace_buf(void) +{ +	struct trace_buffer_struct *percpu_buffer; + +	/* +	 * If we have allocated per cpu buffers, then we do not +	 * need to do any locking. +	 */ +	if (in_nmi()) +		percpu_buffer = trace_percpu_nmi_buffer; +	else if (in_irq()) +		percpu_buffer = trace_percpu_irq_buffer; +	else if (in_softirq()) +		percpu_buffer = trace_percpu_sirq_buffer; +	else +		percpu_buffer = trace_percpu_buffer; + +	if (!percpu_buffer) +		return NULL; + +	return this_cpu_ptr(&percpu_buffer->buffer[0]); +} + +static int alloc_percpu_trace_buffer(void) +{ +	struct trace_buffer_struct *buffers; +	struct trace_buffer_struct *sirq_buffers; +	struct trace_buffer_struct *irq_buffers; +	struct trace_buffer_struct *nmi_buffers; + +	buffers = alloc_percpu(struct trace_buffer_struct); +	if (!buffers) +		goto err_warn; + +	sirq_buffers = alloc_percpu(struct trace_buffer_struct); +	if (!sirq_buffers) +		goto err_sirq; + +	irq_buffers = alloc_percpu(struct trace_buffer_struct); +	if (!irq_buffers) +		goto err_irq; + +	nmi_buffers = alloc_percpu(struct trace_buffer_struct); +	if (!nmi_buffers) +		goto err_nmi; + +	trace_percpu_buffer = buffers; +	trace_percpu_sirq_buffer = sirq_buffers; +	trace_percpu_irq_buffer = irq_buffers; +	trace_percpu_nmi_buffer = nmi_buffers; + +	return 0; + + err_nmi: +	free_percpu(irq_buffers); + err_irq: +	free_percpu(sirq_buffers); + err_sirq: +	free_percpu(buffers); + err_warn: +	WARN(1, "Could not allocate percpu trace_printk buffer"); +	return -ENOMEM; +} + +static int buffers_allocated; + +void trace_printk_init_buffers(void) +{ +	if (buffers_allocated) +		return; + +	if (alloc_percpu_trace_buffer()) +		return; + +	/* trace_printk() is for debug use only. Don't use it in production. 
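	 * (Editor's aside -- illustrative only: the intended use is ad-hoc
	 * debugging, e.g. temporarily adding a line such as
	 *
	 *	trace_printk("ctx=%p state=%d\n", ctx, state);
	 *
	 * with ctx/state being whatever the hypothetical caller wants to
	 * dump; the output lands in the ring buffer read via the "trace"
	 * and "trace_pipe" files rather than in the kernel log.)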
*/ + +	pr_warning("\n**********************************************************\n"); +	pr_warning("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n"); +	pr_warning("**                                                      **\n"); +	pr_warning("** trace_printk() being used. Allocating extra memory.  **\n"); +	pr_warning("**                                                      **\n"); +	pr_warning("** This means that this is a DEBUG kernel and it is     **\n"); +	pr_warning("** unsafe for produciton use.                           **\n"); +	pr_warning("**                                                      **\n"); +	pr_warning("** If you see this message and you are not debugging    **\n"); +	pr_warning("** the kernel, report this immediately to your vendor!  **\n"); +	pr_warning("**                                                      **\n"); +	pr_warning("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n"); +	pr_warning("**********************************************************\n"); + +	/* Expand the buffers to set size */ +	tracing_update_buffers(); + +	buffers_allocated = 1; + +	/* +	 * trace_printk_init_buffers() can be called by modules. +	 * If that happens, then we need to start cmdline recording +	 * directly here. If the global_trace.buffer is already +	 * allocated here, then this was called by module code. +	 */ +	if (global_trace.trace_buffer.buffer) +		tracing_start_cmdline_record(); +} + +void trace_printk_start_comm(void) +{ +	/* Start tracing comms if trace printk is set */ +	if (!buffers_allocated) +		return; +	tracing_start_cmdline_record(); +} + +static void trace_printk_start_stop_comm(int enabled) +{ +	if (!buffers_allocated) +		return; + +	if (enabled) +		tracing_start_cmdline_record(); +	else +		tracing_stop_cmdline_record(); +} +  /**   * trace_vbprintk - write binary msg to tracing buffer   *   */  int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  { -	static arch_spinlock_t trace_buf_lock = -		(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -	static u32 trace_buf[TRACE_BUF_SIZE]; -  	struct ftrace_event_call *call = &event_bprint;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	struct trace_array *tr = &global_trace; -	struct trace_array_cpu *data;  	struct bprint_entry *entry;  	unsigned long flags; -	int disable; -	int cpu, len = 0, size, pc; +	char *tbuffer; +	int len = 0, size, pc;  	if (unlikely(tracing_selftest_running || tracing_disabled))  		return 0; @@ -1358,43 +2116,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	pc = preempt_count();  	preempt_disable_notrace(); -	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disable = atomic_inc_return(&data->disabled); -	if (unlikely(disable != 1)) +	tbuffer = get_trace_buf(); +	if (!tbuffer) { +		len = 0;  		goto out; +	} -	/* Lockdep uses trace_printk for lock tracing */ -	local_irq_save(flags); -	arch_spin_lock(&trace_buf_lock); -	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); +	len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); -	if (len > TRACE_BUF_SIZE || len < 0) -		goto out_unlock; +	if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) +		goto out; +	local_save_flags(flags);  	size = sizeof(*entry) + sizeof(u32) * len; -	buffer = tr->buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,  					  flags, pc);  	if (!event) -		goto out_unlock; +		goto out;  	entry = ring_buffer_event_data(event);  	entry->ip			= ip;  	entry->fmt			= fmt; -	
memcpy(entry->buf, trace_buf, sizeof(u32) * len); -	if (!filter_check_discard(call, entry, buffer, event)) { -		ring_buffer_unlock_commit(buffer, event); +	memcpy(entry->buf, tbuffer, sizeof(u32) * len); +	if (!call_filter_check_discard(call, entry, buffer, event)) { +		__buffer_unlock_commit(buffer, event);  		ftrace_trace_stack(buffer, flags, 6, pc);  	} -out_unlock: -	arch_spin_unlock(&trace_buf_lock); -	local_irq_restore(flags); -  out: -	atomic_dec_return(&data->disabled);  	preempt_enable_notrace();  	unpause_graph_tracing(); @@ -1402,80 +2153,95 @@ out:  }  EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, -		       unsigned long ip, const char *fmt, ...) -{ -	int ret; -	va_list ap; - -	if (!(trace_flags & TRACE_ITER_PRINTK)) -		return 0; - -	va_start(ap, fmt); -	ret = trace_array_vprintk(tr, ip, fmt, ap); -	va_end(ap); -	return ret; -} - -int trace_array_vprintk(struct trace_array *tr, -			unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, +		      unsigned long ip, const char *fmt, va_list args)  { -	static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; -	static char trace_buf[TRACE_BUF_SIZE]; -  	struct ftrace_event_call *call = &event_print;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer; -	struct trace_array_cpu *data; -	int cpu, len = 0, size, pc; +	int len = 0, size, pc;  	struct print_entry *entry; -	unsigned long irq_flags; -	int disable; +	unsigned long flags; +	char *tbuffer;  	if (tracing_disabled || tracing_selftest_running)  		return 0; +	/* Don't pollute graph traces with trace_vprintk internals */ +	pause_graph_tracing(); +  	pc = preempt_count();  	preempt_disable_notrace(); -	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disable = atomic_inc_return(&data->disabled); -	if (unlikely(disable != 1)) + +	tbuffer = get_trace_buf(); +	if (!tbuffer) { +		len = 0;  		goto out; +	} -	pause_graph_tracing(); -	raw_local_irq_save(irq_flags); -	arch_spin_lock(&trace_buf_lock); -	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); +	len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); +	if (len > TRACE_BUF_SIZE) +		goto out; +	local_save_flags(flags);  	size = sizeof(*entry) + len + 1; -	buffer = tr->buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, -					  irq_flags, pc); +					  flags, pc);  	if (!event) -		goto out_unlock; +		goto out;  	entry = ring_buffer_event_data(event);  	entry->ip = ip; -	memcpy(&entry->buf, trace_buf, len); +	memcpy(&entry->buf, tbuffer, len);  	entry->buf[len] = '\0'; -	if (!filter_check_discard(call, entry, buffer, event)) { -		ring_buffer_unlock_commit(buffer, event); -		ftrace_trace_stack(buffer, irq_flags, 6, pc); +	if (!call_filter_check_discard(call, entry, buffer, event)) { +		__buffer_unlock_commit(buffer, event); +		ftrace_trace_stack(buffer, flags, 6, pc);  	} - - out_unlock: -	arch_spin_unlock(&trace_buf_lock); -	raw_local_irq_restore(irq_flags); -	unpause_graph_tracing();   out: -	atomic_dec_return(&data->disabled);  	preempt_enable_notrace(); +	unpause_graph_tracing();  	return len;  } +int trace_array_vprintk(struct trace_array *tr, +			unsigned long ip, const char *fmt, va_list args) +{ +	return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, +		       unsigned long ip, const char *fmt, ...) 
+{ +	int ret; +	va_list ap; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	va_start(ap, fmt); +	ret = trace_array_vprintk(tr, ip, fmt, ap); +	va_end(ap); +	return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, +			   unsigned long ip, const char *fmt, ...) +{ +	int ret; +	va_list ap; + +	if (!(trace_flags & TRACE_ITER_PRINTK)) +		return 0; + +	va_start(ap, fmt); +	ret = __trace_array_vprintk(buffer, ip, fmt, ap); +	va_end(ap); +	return ret; +} +  int trace_vprintk(unsigned long ip, const char *fmt, va_list args)  {  	return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1484,14 +2250,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk);  static void trace_iterator_increment(struct trace_iterator *iter)  { -	/* Don't allow ftrace to trace into the ring buffers */ -	ftrace_disable_cpu(); +	struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);  	iter->idx++; -	if (iter->buffer_iter[iter->cpu]) -		ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - -	ftrace_enable_cpu(); +	if (buf_iter) +		ring_buffer_read(buf_iter, NULL);  }  static struct trace_entry * @@ -1499,39 +2262,40 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,  		unsigned long *lost_events)  {  	struct ring_buffer_event *event; -	struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - -	/* Don't allow ftrace to trace into the ring buffers */ -	ftrace_disable_cpu(); +	struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);  	if (buf_iter)  		event = ring_buffer_iter_peek(buf_iter, ts);  	else -		event = ring_buffer_peek(iter->tr->buffer, cpu, ts, +		event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,  					 lost_events); -	ftrace_enable_cpu(); - -	return event ? ring_buffer_event_data(event) : NULL; +	if (event) { +		iter->ent_size = ring_buffer_event_length(event); +		return ring_buffer_event_data(event); +	} +	iter->ent_size = 0; +	return NULL;  }  static struct trace_entry *  __find_next_entry(struct trace_iterator *iter, int *ent_cpu,  		  unsigned long *missing_events, u64 *ent_ts)  { -	struct ring_buffer *buffer = iter->tr->buffer; +	struct ring_buffer *buffer = iter->trace_buffer->buffer;  	struct trace_entry *ent, *next = NULL;  	unsigned long lost_events = 0, next_lost = 0;  	int cpu_file = iter->cpu_file;  	u64 next_ts = 0, ts;  	int next_cpu = -1; +	int next_size = 0;  	int cpu;  	/*  	 * If we are in a per_cpu trace file, don't bother by iterating over  	 * all cpu and peek directly.  	 
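	 * (Editor's aside: iter->cpu_file is RING_BUFFER_ALL_CPUS when the
	 * reader opened the top-level trace/trace_pipe files, and a
	 * specific CPU number when it came in through a per_cpu/cpuN/
	 * file, which is why the single-CPU case below can peek that one
	 * buffer directly.)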
*/ -	if (cpu_file > TRACE_PIPE_ALL_CPU) { +	if (cpu_file > RING_BUFFER_ALL_CPUS) {  		if (ring_buffer_empty_cpu(buffer, cpu_file))  			return NULL;  		ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); @@ -1556,9 +2320,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,  			next_cpu = cpu;  			next_ts = ts;  			next_lost = lost_events; +			next_size = iter->ent_size;  		}  	} +	iter->ent_size = next_size; +  	if (ent_cpu)  		*ent_cpu = next_cpu; @@ -1592,11 +2359,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)  static void trace_consume(struct trace_iterator *iter)  { -	/* Don't allow ftrace to trace into the ring buffers */ -	ftrace_disable_cpu(); -	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, +	ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,  			    &iter->lost_events); -	ftrace_enable_cpu();  }  static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1628,18 +2392,17 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)  void tracing_iter_reset(struct trace_iterator *iter, int cpu)  { -	struct trace_array *tr = iter->tr;  	struct ring_buffer_event *event;  	struct ring_buffer_iter *buf_iter;  	unsigned long entries = 0;  	u64 ts; -	tr->data[cpu]->skipped_entries = 0; +	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; -	if (!iter->buffer_iter[cpu]) +	buf_iter = trace_buffer_iter(iter, cpu); +	if (!buf_iter)  		return; -	buf_iter = iter->buffer_iter[cpu];  	ring_buffer_iter_reset(buf_iter);  	/* @@ -1648,13 +2411,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  	 * by the timestamp being before the start of the buffer.  	 */  	while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { -		if (ts >= iter->tr->time_start) +		if (ts >= iter->trace_buffer->time_start)  			break;  		entries++;  		ring_buffer_read(buf_iter, NULL);  	} -	tr->data[cpu]->skipped_entries = entries; +	per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;  }  /* @@ -1664,37 +2427,42 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)  static void *s_start(struct seq_file *m, loff_t *pos)  {  	struct trace_iterator *iter = m->private; -	static struct tracer *old_tracer; +	struct trace_array *tr = iter->tr;  	int cpu_file = iter->cpu_file;  	void *p = NULL;  	loff_t l = 0;  	int cpu; -	/* copy the tracer to avoid using a global lock all around */ +	/* +	 * copy the tracer to avoid using a global lock all around. +	 * iter->trace is a copy of current_trace, the pointer to the +	 * name may be used instead of a strcmp(), as iter->trace->name +	 * will point to the same string as current_trace->name. 
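
The check in s_start() relies on pointer identity rather than strcmp(): because *iter->trace is a structure copy of *tr->current_trace, both hold the very same name pointer until current_trace is replaced with a different tracer. A small userspace sketch of why the pointer test is enough (the structures here are illustrative stand-ins, not the kernel's):

    #include <stdio.h>

    struct tracer {
            const char *name;
    };

    int main(void)
    {
            struct tracer nop = { "nop" }, function = { "function" };
            struct tracer *current_trace = &nop;

            /* The iterator keeps a structure copy, as __tracing_open() does. */
            struct tracer iter_copy = *current_trace;

            /* Same tracer still installed: the copy holds the very same
             * string pointer, so the comparison reports no switch. */
            printf("switched? %s\n",
                   iter_copy.name != current_trace->name ? "yes" : "no");

            /* Installing a different tracer changes the name pointer,
             * so no strcmp() is needed to notice the switch. */
            current_trace = &function;
            printf("switched? %s\n",
                   iter_copy.name != current_trace->name ? "yes" : "no");
            return 0;
    }
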
+	 */  	mutex_lock(&trace_types_lock); -	if (unlikely(old_tracer != current_trace && current_trace)) { -		old_tracer = current_trace; -		*iter->trace = *current_trace; -	} +	if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock); -	atomic_inc(&trace_record_cmdline_disabled); +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->trace->use_max_tr) +		return ERR_PTR(-EBUSY); +#endif + +	if (!iter->snapshot) +		atomic_inc(&trace_record_cmdline_disabled);  	if (*pos != iter->pos) {  		iter->ent = NULL;  		iter->cpu = 0;  		iter->idx = -1; -		ftrace_disable_cpu(); - -		if (cpu_file == TRACE_PIPE_ALL_CPU) { +		if (cpu_file == RING_BUFFER_ALL_CPUS) {  			for_each_tracing_cpu(cpu)  				tracing_iter_reset(iter, cpu);  		} else  			tracing_iter_reset(iter, cpu_file); -		ftrace_enable_cpu(); -  		iter->leftover = 0;  		for (p = iter; p && l < *pos; p = s_next(m, p, &l))  			; @@ -1721,11 +2489,46 @@ static void s_stop(struct seq_file *m, void *p)  {  	struct trace_iterator *iter = m->private; -	atomic_dec(&trace_record_cmdline_disabled); +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->trace->use_max_tr) +		return; +#endif + +	if (!iter->snapshot) +		atomic_dec(&trace_record_cmdline_disabled); +  	trace_access_unlock(iter->cpu_file);  	trace_event_read_unlock();  } +static void +get_total_entries(struct trace_buffer *buf, +		  unsigned long *total, unsigned long *entries) +{ +	unsigned long count; +	int cpu; + +	*total = 0; +	*entries = 0; + +	for_each_tracing_cpu(cpu) { +		count = ring_buffer_entries_cpu(buf->buffer, cpu); +		/* +		 * If this buffer has skipped entries, then we hold all +		 * entries for the trace and we need to ignore the +		 * ones before the time stamp. 
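
get_total_entries() folds the per-CPU counters into two figures: "entries" counts what is still present in the buffers, while "total" additionally counts overwritten events (overruns), except on CPUs whose buffers had entries skipped at reset time, where the skipped entries are subtracted and no overrun is added. A standalone sketch of that arithmetic with made-up counters; these are the two numbers printed just below by print_event_info() as "entries-in-buffer/entries-written":

    #include <stdio.h>

    struct cpu_stats {
            unsigned long entries;  /* events currently in the ring buffer */
            unsigned long overrun;  /* events lost to overwrite */
            unsigned long skipped;  /* entries before the reset timestamp */
    };

    int main(void)
    {
            /* Hypothetical counters for a three-CPU system. */
            struct cpu_stats cpu[3] = {
                    { .entries = 100, .overrun = 20, .skipped =  0 },
                    { .entries =  50, .overrun =  0, .skipped = 10 },
                    { .entries = 200, .overrun =  5, .skipped =  0 },
            };
            unsigned long total = 0, entries = 0;

            for (int i = 0; i < 3; i++) {
                    unsigned long count = cpu[i].entries;

                    if (cpu[i].skipped) {
                            /* Skipped entries are held but ignored, and
                             * no overrun is added: total == entries. */
                            count -= cpu[i].skipped;
                            total += count;
                    } else {
                            total += count + cpu[i].overrun;
                    }
                    entries += count;
            }
            printf("entries-in-buffer/entries-written: %lu/%lu\n",
                   entries, total);
            return 0;
    }
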
+		 */ +		if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { +			count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; +			/* total is the same as the entries */ +			*total += count; +		} else +			*total += count + +				ring_buffer_overrun_cpu(buf->buffer, cpu); +		*entries += count; +	} +} +  static void print_lat_help_header(struct seq_file *m)  {  	seq_puts(m, "#                  _------=> CPU#            \n"); @@ -1733,52 +2536,55 @@ static void print_lat_help_header(struct seq_file *m)  	seq_puts(m, "#                | / _----=> need-resched    \n");  	seq_puts(m, "#                || / _---=> hardirq/softirq \n");  	seq_puts(m, "#                ||| / _--=> preempt-depth   \n"); -	seq_puts(m, "#                |||| /_--=> lock-depth       \n"); -	seq_puts(m, "#                |||||/     delay             \n"); -	seq_puts(m, "#  cmd     pid   |||||| time  |   caller      \n"); -	seq_puts(m, "#     \\   /      ||||||   \\   |   /           \n"); +	seq_puts(m, "#                |||| /     delay             \n"); +	seq_puts(m, "#  cmd     pid   ||||| time  |   caller      \n"); +	seq_puts(m, "#     \\   /      |||||  \\    |   /           \n"); +} + +static void print_event_info(struct trace_buffer *buf, struct seq_file *m) +{ +	unsigned long total; +	unsigned long entries; + +	get_total_entries(buf, &total, &entries); +	seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n", +		   entries, total, num_online_cpus()); +	seq_puts(m, "#\n");  } -static void print_func_help_header(struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)  { -	seq_puts(m, "#           TASK-PID    CPU#    TIMESTAMP  FUNCTION\n"); +	print_event_info(buf, m); +	seq_puts(m, "#           TASK-PID   CPU#      TIMESTAMP  FUNCTION\n");  	seq_puts(m, "#              | |       |          |         |\n");  } +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) +{ +	print_event_info(buf, m); +	seq_puts(m, "#                              _-----=> irqs-off\n"); +	seq_puts(m, "#                             / _----=> need-resched\n"); +	seq_puts(m, "#                            | / _---=> hardirq/softirq\n"); +	seq_puts(m, "#                            || / _--=> preempt-depth\n"); +	seq_puts(m, "#                            ||| /     delay\n"); +	seq_puts(m, "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"); +	seq_puts(m, "#              | |       |   ||||       |         |\n"); +}  void  print_trace_header(struct seq_file *m, struct trace_iterator *iter)  {  	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); -	struct trace_array *tr = iter->tr; -	struct trace_array_cpu *data = tr->data[tr->cpu]; -	struct tracer *type = current_trace; -	unsigned long entries = 0; -	unsigned long total = 0; -	unsigned long count; +	struct trace_buffer *buf = iter->trace_buffer; +	struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); +	struct tracer *type = iter->trace; +	unsigned long entries; +	unsigned long total;  	const char *name = "preemption"; -	int cpu; -	if (type) -		name = type->name; +	name = type->name; - -	for_each_tracing_cpu(cpu) { -		count = ring_buffer_entries_cpu(tr->buffer, cpu); -		/* -		 * If this buffer has skipped entries, then we hold all -		 * entries for the trace and we need to ignore the -		 * ones before the time stamp. 
-		 */ -		if (tr->data[cpu]->skipped_entries) { -			count -= tr->data[cpu]->skipped_entries; -			/* total is the same as the entries */ -			total += count; -		} else -			total += count + -				ring_buffer_overrun_cpu(tr->buffer, cpu); -		entries += count; -	} +	get_total_entries(buf, &total, &entries);  	seq_printf(m, "# %s latency trace v1.1.5 on %s\n",  		   name, UTS_RELEASE); @@ -1789,7 +2595,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)  		   nsecs_to_usecs(data->saved_latency),  		   entries,  		   total, -		   tr->cpu, +		   buf->cpu,  #if defined(CONFIG_PREEMPT_NONE)  		   "server",  #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -1809,7 +2615,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)  	seq_puts(m, "#    -----------------\n");  	seq_printf(m, "#    | task: %.16s-%d "  		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", -		   data->comm, data->pid, data->uid, data->nice, +		   data->comm, data->pid, +		   from_kuid_munged(seq_user_ns(m), data->uid), data->nice,  		   data->policy, data->rt_priority);  	seq_puts(m, "#    -----------------\n"); @@ -1839,7 +2646,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)  	if (cpumask_test_cpu(iter->cpu, iter->started))  		return; -	if (iter->tr->data[iter->cpu]->skipped_entries) +	if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)  		return;  	cpumask_set_cpu(iter->cpu, iter->started); @@ -1958,27 +2765,30 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)  int trace_empty(struct trace_iterator *iter)  { +	struct ring_buffer_iter *buf_iter;  	int cpu;  	/* If we are looking at one CPU buffer, only check that one */ -	if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { +	if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {  		cpu = iter->cpu_file; -		if (iter->buffer_iter[cpu]) { -			if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) +		buf_iter = trace_buffer_iter(iter, cpu); +		if (buf_iter) { +			if (!ring_buffer_iter_empty(buf_iter))  				return 0;  		} else { -			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))  				return 0;  		}  		return 1;  	}  	for_each_tracing_cpu(cpu) { -		if (iter->buffer_iter[cpu]) { -			if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) +		buf_iter = trace_buffer_iter(iter, cpu); +		if (buf_iter) { +			if (!ring_buffer_iter_empty(buf_iter))  				return 0;  		} else { -			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) +			if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))  				return 0;  		}  	} @@ -1991,9 +2801,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)  {  	enum print_line_t ret; -	if (iter->lost_events) -		trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", -				 iter->cpu, iter->lost_events); +	if (iter->lost_events && +	    !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", +				 iter->cpu, iter->lost_events)) +		return TRACE_TYPE_PARTIAL_LINE;  	if (iter->trace && iter->trace->print_line) {  		ret = iter->trace->print_line(iter); @@ -2001,6 +2812,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)  			return ret;  	} +	if (iter->ent->type == TRACE_BPUTS && +			trace_flags & TRACE_ITER_PRINTK && +			trace_flags & TRACE_ITER_PRINTK_MSGONLY) +		return trace_print_bputs_msg_only(iter); +  	if (iter->ent->type == TRACE_BPRINT &&  			trace_flags & TRACE_ITER_PRINTK &&  			trace_flags & TRACE_ITER_PRINTK_MSGONLY) @@ -2023,10 +2839,28 @@ enum print_line_t print_trace_line(struct 
trace_iterator *iter)  	return print_trace_fmt(iter);  } +void trace_latency_header(struct seq_file *m) +{ +	struct trace_iterator *iter = m->private; + +	/* print nothing if the buffers are empty */ +	if (trace_empty(iter)) +		return; + +	if (iter->iter_flags & TRACE_FILE_LAT_FMT) +		print_trace_header(m, iter); + +	if (!(trace_flags & TRACE_ITER_VERBOSE)) +		print_lat_help_header(m); +} +  void trace_default_header(struct seq_file *m)  {  	struct trace_iterator *iter = m->private; +	if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) +		return; +  	if (iter->iter_flags & TRACE_FILE_LAT_FMT) {  		/* print nothing if the buffers are empty */  		if (trace_empty(iter)) @@ -2035,11 +2869,67 @@ void trace_default_header(struct seq_file *m)  		if (!(trace_flags & TRACE_ITER_VERBOSE))  			print_lat_help_header(m);  	} else { -		if (!(trace_flags & TRACE_ITER_VERBOSE)) -			print_func_help_header(m); +		if (!(trace_flags & TRACE_ITER_VERBOSE)) { +			if (trace_flags & TRACE_ITER_IRQ_INFO) +				print_func_help_header_irq(iter->trace_buffer, m); +			else +				print_func_help_header(iter->trace_buffer, m); +		}  	}  } +static void test_ftrace_alive(struct seq_file *m) +{ +	if (!ftrace_is_dead()) +		return; +	seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); +	seq_printf(m, "#          MAY BE MISSING FUNCTION EVENTS\n"); +} + +#ifdef CONFIG_TRACER_MAX_TRACE +static void show_snapshot_main_help(struct seq_file *m) +{ +	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); +	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); +	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n"); +	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); +	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n"); +	seq_printf(m, "#                       is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ +	seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP +	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); +	seq_printf(m, "#                      Takes a snapshot of the main buffer for this cpu.\n"); +#else +	seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); +	seq_printf(m, "#                     Must use main snapshot file to allocate.\n"); +#endif +	seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); +	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n"); +	seq_printf(m, "#                       is not a '0' or '1')\n"); +} + +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ +	if (iter->tr->allocated_snapshot) +		seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); +	else +		seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + +	seq_printf(m, "# Snapshot commands:\n"); +	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +		show_snapshot_main_help(m); +	else +		show_snapshot_percpu_help(m); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif +  static int s_show(struct seq_file *m, void *v)  {  	struct trace_iterator *iter = v; @@ -2049,8 +2939,11 @@ static int s_show(struct seq_file *m, void *v)  		if (iter->tr) {  			seq_printf(m, "# tracer: %s\n", iter->trace->name);  			
seq_puts(m, "#\n"); +			test_ftrace_alive(m);  		} -		if (iter->trace && iter->trace->print_header) +		if (iter->snapshot && trace_empty(iter)) +			print_snapshot_help(m, iter); +		else if (iter->trace && iter->trace->print_header)  			iter->trace->print_header(m);  		else  			trace_default_header(m); @@ -2081,6 +2974,17 @@ static int s_show(struct seq_file *m, void *v)  	return 0;  } +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. + */ +static inline int tracing_get_cpu(struct inode *inode) +{ +	if (inode->i_cdev) /* See trace_create_cpu_file() */ +		return (long)inode->i_cdev - 1; +	return RING_BUFFER_ALL_CPUS; +} +  static const struct seq_operations tracer_seq_ops = {  	.start		= s_start,  	.next		= s_next, @@ -2089,21 +2993,24 @@ static const struct seq_operations tracer_seq_ops = {  };  static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file) +__tracing_open(struct inode *inode, struct file *file, bool snapshot)  { -	long cpu_file = (long) inode->i_private; -	void *fail_ret = ERR_PTR(-ENOMEM); +	struct trace_array *tr = inode->i_private;  	struct trace_iterator *iter; -	struct seq_file *m; -	int cpu, ret; +	int cpu;  	if (tracing_disabled)  		return ERR_PTR(-ENODEV); -	iter = kzalloc(sizeof(*iter), GFP_KERNEL); +	iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));  	if (!iter)  		return ERR_PTR(-ENOMEM); +	iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), +				    GFP_KERNEL); +	if (!iter->buffer_iter) +		goto release; +  	/*  	 * We make a copy of the current tracer to avoid concurrent  	 * changes on it while we are reading. @@ -2113,35 +3020,45 @@ __tracing_open(struct inode *inode, struct file *file)  	if (!iter->trace)  		goto fail; -	if (current_trace) -		*iter->trace = *current_trace; +	*iter->trace = *tr->current_trace;  	if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))  		goto fail; -	if (current_trace && current_trace->print_max) -		iter->tr = &max_tr; +	iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE +	/* Currently only the top directory has a snapshot */ +	if (tr->current_trace->print_max || snapshot) +		iter->trace_buffer = &tr->max_buffer;  	else -		iter->tr = &global_trace; +#endif +		iter->trace_buffer = &tr->trace_buffer; +	iter->snapshot = snapshot;  	iter->pos = -1; +	iter->cpu_file = tracing_get_cpu(inode);  	mutex_init(&iter->mutex); -	iter->cpu_file = cpu_file;  	/* Notify the tracer early; before we stop tracing. */  	if (iter->trace && iter->trace->open)  		iter->trace->open(iter);  	/* Annotate start of buffers if we had overruns */ -	if (ring_buffer_overruns(iter->tr->buffer)) +	if (ring_buffer_overruns(iter->trace_buffer->buffer))  		iter->iter_flags |= TRACE_FILE_ANNOTATE; -	/* stop the trace while dumping */ -	tracing_stop(); +	/* Output in nanoseconds only if we are using a clock in nanoseconds. 
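
tracing_get_cpu(), used a few lines above to set iter->cpu_file, packs the CPU number into inode->i_cdev: a NULL i_cdev still means "all CPUs" for the top-level files, so the per-CPU files presumably store cpu + 1 (the referenced trace_create_cpu_file() is outside this hunk) and the helper undoes that offset. A minimal sketch of the +1/-1 encoding, with a plain pointer-sized slot standing in for i_cdev:

    #include <stdio.h>

    #define RING_BUFFER_ALL_CPUS -1

    /* A pointer-sized slot modelling inode->i_cdev; NULL means "no CPU". */
    static int decode_cpu(void *slot)
    {
            if (slot)
                    return (int)((long)slot - 1);
            return RING_BUFFER_ALL_CPUS;
    }

    static void *encode_cpu(int cpu)
    {
            return (void *)((long)cpu + 1);  /* CPU 0 becomes 1, never NULL */
    }

    int main(void)
    {
            void *top_level = NULL;            /* e.g. the main "trace" file */
            void *cpu0_file = encode_cpu(0);
            void *cpu2_file = encode_cpu(2);

            printf("top level -> %d\n", decode_cpu(top_level)); /* -1: all CPUs */
            printf("cpu0 file -> %d\n", decode_cpu(cpu0_file)); /*  0 */
            printf("cpu2 file -> %d\n", decode_cpu(cpu2_file)); /*  2 */
            return 0;
    }
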
*/ +	if (trace_clocks[tr->clock_id].in_ns) +		iter->iter_flags |= TRACE_FILE_TIME_IN_NS; -	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { +	/* stop the trace while dumping if we are not opening "snapshot" */ +	if (!iter->snapshot) +		tracing_stop_tr(tr); + +	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) {  			iter->buffer_iter[cpu] = -				ring_buffer_read_prepare(iter->tr->buffer, cpu); +				ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);  		}  		ring_buffer_read_prepare_sync();  		for_each_tracing_cpu(cpu) { @@ -2151,38 +3068,23 @@ __tracing_open(struct inode *inode, struct file *file)  	} else {  		cpu = iter->cpu_file;  		iter->buffer_iter[cpu] = -			ring_buffer_read_prepare(iter->tr->buffer, cpu); +			ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);  		ring_buffer_read_prepare_sync();  		ring_buffer_read_start(iter->buffer_iter[cpu]);  		tracing_iter_reset(iter, cpu);  	} -	ret = seq_open(file, &tracer_seq_ops); -	if (ret < 0) { -		fail_ret = ERR_PTR(ret); -		goto fail_buffer; -	} - -	m = file->private_data; -	m->private = iter; -  	mutex_unlock(&trace_types_lock);  	return iter; - fail_buffer: -	for_each_tracing_cpu(cpu) { -		if (iter->buffer_iter[cpu]) -			ring_buffer_read_finish(iter->buffer_iter[cpu]); -	} -	free_cpumask_var(iter->started); -	tracing_start();   fail:  	mutex_unlock(&trace_types_lock);  	kfree(iter->trace); -	kfree(iter); - -	return fail_ret; +	kfree(iter->buffer_iter); +release: +	seq_release_private(inode, file); +	return ERR_PTR(-ENOMEM);  }  int tracing_open_generic(struct inode *inode, struct file *filp) @@ -2194,18 +3096,46 @@ int tracing_open_generic(struct inode *inode, struct file *filp)  	return 0;  } +bool tracing_is_disabled(void) +{ +	return (tracing_disabled) ? true: false; +} + +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. 
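
tracing_open_generic_tr() below takes a reference on the trace_array with trace_array_get() for every successful open, and the release helpers further down drop it again with trace_array_put(), so an instance cannot go away while one of its files is still open. A rough userspace model of that get/put pairing; the counter and the refusal on a deleted array are illustrative, not the kernel implementation:

    #include <stdio.h>

    struct array_model {
            int refcount;
            int deleted;    /* set once the instance is being removed */
    };

    static int model_get(struct array_model *tr)
    {
            if (tr->deleted)
                    return -1;      /* like trace_array_get() refusing */
            tr->refcount++;
            return 0;
    }

    static void model_put(struct array_model *tr)
    {
            tr->refcount--;
    }

    static int file_open(struct array_model *tr)
    {
            if (model_get(tr) < 0)
                    return -1;      /* open fails, no reference taken */
            /* ... set up per-file state ... */
            return 0;
    }

    static void file_release(struct array_model *tr)
    {
            /* ... tear down per-file state ... */
            model_put(tr);          /* every successful open is paired */
    }

    int main(void)
    {
            struct array_model tr = { 0, 0 };

            if (file_open(&tr) == 0) {
                    printf("open ok, refcount=%d\n", tr.refcount);
                    file_release(&tr);
            }
            printf("after release, refcount=%d\n", tr.refcount);
            return 0;
    }
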
+ */ +static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +{ +	struct trace_array *tr = inode->i_private; + +	if (tracing_disabled) +		return -ENODEV; + +	if (trace_array_get(tr) < 0) +		return -ENODEV; + +	filp->private_data = inode->i_private; + +	return 0; +} +  static int tracing_release(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private;  	struct seq_file *m = file->private_data;  	struct trace_iterator *iter;  	int cpu; -	if (!(file->f_mode & FMODE_READ)) +	if (!(file->f_mode & FMODE_READ)) { +		trace_array_put(tr);  		return 0; +	} +	/* Writes do not use seq_file */  	iter = m->private; -  	mutex_lock(&trace_types_lock); +  	for_each_tracing_cpu(cpu) {  		if (iter->buffer_iter[cpu])  			ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2214,65 +3144,119 @@ static int tracing_release(struct inode *inode, struct file *file)  	if (iter->trace && iter->trace->close)  		iter->trace->close(iter); -	/* reenable tracing if it was previously enabled */ -	tracing_start(); +	if (!iter->snapshot) +		/* reenable tracing if it was previously enabled */ +		tracing_start_tr(tr); + +	__trace_array_put(tr); +  	mutex_unlock(&trace_types_lock); -	seq_release(inode, file);  	mutex_destroy(&iter->mutex);  	free_cpumask_var(iter->started);  	kfree(iter->trace); -	kfree(iter); +	kfree(iter->buffer_iter); +	seq_release_private(inode, file); +  	return 0;  } +static int tracing_release_generic_tr(struct inode *inode, struct file *file) +{ +	struct trace_array *tr = inode->i_private; + +	trace_array_put(tr); +	return 0; +} + +static int tracing_single_release_tr(struct inode *inode, struct file *file) +{ +	struct trace_array *tr = inode->i_private; + +	trace_array_put(tr); + +	return single_release(inode, file); +} +  static int tracing_open(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private;  	struct trace_iterator *iter;  	int ret = 0; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	/* If this file was open for write, then erase contents */ -	if ((file->f_mode & FMODE_WRITE) && -	    (file->f_flags & O_TRUNC)) { -		long cpu = (long) inode->i_private; +	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { +		int cpu = tracing_get_cpu(inode); -		if (cpu == TRACE_PIPE_ALL_CPU) -			tracing_reset_online_cpus(&global_trace); +		if (cpu == RING_BUFFER_ALL_CPUS) +			tracing_reset_online_cpus(&tr->trace_buffer);  		else -			tracing_reset(&global_trace, cpu); +			tracing_reset(&tr->trace_buffer, cpu);  	}  	if (file->f_mode & FMODE_READ) { -		iter = __tracing_open(inode, file); +		iter = __tracing_open(inode, file, false);  		if (IS_ERR(iter))  			ret = PTR_ERR(iter);  		else if (trace_flags & TRACE_ITER_LATENCY_FMT)  			iter->iter_flags |= TRACE_FILE_LAT_FMT;  	} + +	if (ret < 0) +		trace_array_put(tr); +  	return ret;  } +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is. 
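
trace_ok_for_array() and get_tracer_for_array() below implement the filtering this comment describes: the global tracer list is walked, but tracers that neither belong to the top-level array nor set allow_instances are skipped, and t_start()/t_next() feed the result to an instance's available_tracers listing. A small standalone sketch of that filtered walk, with made-up tracer entries:

    #include <stdio.h>
    #include <stdbool.h>

    struct tracer_model {
            const char *name;
            bool allow_instances;
            struct tracer_model *next;
    };

    /* Skip to the next tracer a (non top-level) instance may use. */
    static struct tracer_model *next_ok(struct tracer_model *t, bool global)
    {
            while (t && !(global || t->allow_instances))
                    t = t->next;
            return t;
    }

    int main(void)
    {
            struct tracer_model wakeup   = { "wakeup",   false, NULL };
            struct tracer_model function = { "function", true,  &wakeup };
            struct tracer_model nop      = { "nop",      true,  &function };

            /* What an instance would list: prints nop and function;
             * wakeup is filtered out in this made-up setup. */
            for (struct tracer_model *t = next_ok(&nop, false); t;
                 t = next_ok(t->next, false))
                    printf("%s\n", t->name);
            return 0;
    }
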
+ */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ +	return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ +	while (t && !trace_ok_for_array(t, tr)) +		t = t->next; + +	return t; +} +  static void *  t_next(struct seq_file *m, void *v, loff_t *pos)  { +	struct trace_array *tr = m->private;  	struct tracer *t = v;  	(*pos)++;  	if (t) -		t = t->next; +		t = get_tracer_for_array(tr, t->next);  	return t;  }  static void *t_start(struct seq_file *m, loff_t *pos)  { +	struct trace_array *tr = m->private;  	struct tracer *t;  	loff_t l = 0;  	mutex_lock(&trace_types_lock); -	for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) -		; + +	t = get_tracer_for_array(tr, trace_types); +	for (; t && l < *pos; t = t_next(m, t, &l)) +			;  	return t;  } @@ -2307,10 +3291,21 @@ static const struct seq_operations show_traces_seq_ops = {  static int show_traces_open(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private; +	struct seq_file *m; +	int ret; +  	if (tracing_disabled)  		return -ENODEV; -	return seq_open(file, &show_traces_seq_ops); +	ret = seq_open(file, &show_traces_seq_ops); +	if (ret) +		return ret; + +	m = file->private_data; +	m->private = tr; + +	return 0;  }  static ssize_t @@ -2320,11 +3315,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,  	return count;  } +loff_t tracing_lseek(struct file *file, loff_t offset, int whence) +{ +	int ret; + +	if (file->f_mode & FMODE_READ) +		ret = seq_lseek(file, offset, whence); +	else +		file->f_pos = ret = 0; + +	return ret; +} +  static const struct file_operations tracing_fops = {  	.open		= tracing_open,  	.read		= seq_read,  	.write		= tracing_write_stub, -	.llseek		= seq_lseek, +	.llseek		= tracing_lseek,  	.release	= tracing_release,  }; @@ -2336,11 +3343,6 @@ static const struct file_operations show_traces_fops = {  };  /* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_var_t tracing_cpumask; - -/*   * The tracer itself will not take this lock, but still we want   * to provide a consistent cpumask to user-space:   */ @@ -2356,11 +3358,12 @@ static ssize_t  tracing_cpumask_read(struct file *filp, char __user *ubuf,  		     size_t count, loff_t *ppos)  { +	struct trace_array *tr = file_inode(filp)->i_private;  	int len;  	mutex_lock(&tracing_cpumask_update_lock); -	len = cpumask_scnprintf(mask_str, count, tracing_cpumask); +	len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask);  	if (count - len < 2) {  		count = -EINVAL;  		goto out_err; @@ -2378,8 +3381,9 @@ static ssize_t  tracing_cpumask_write(struct file *filp, const char __user *ubuf,  		      size_t count, loff_t *ppos)  { -	int err, cpu; +	struct trace_array *tr = file_inode(filp)->i_private;  	cpumask_var_t tracing_cpumask_new; +	int err, cpu;  	if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))  		return -ENOMEM; @@ -2391,25 +3395,27 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,  	mutex_lock(&tracing_cpumask_update_lock);  	local_irq_disable(); -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&tr->max_lock);  	for_each_tracing_cpu(cpu) {  		/*  		 * Increase/decrease the disabled counter if we are  		 * about to flip a bit in the cpumask:  		 */ -		if (cpumask_test_cpu(cpu, tracing_cpumask) && +		if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&  				!cpumask_test_cpu(cpu, 
tracing_cpumask_new)) { -			atomic_inc(&global_trace.data[cpu]->disabled); +			atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); +			ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);  		} -		if (!cpumask_test_cpu(cpu, tracing_cpumask) && +		if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&  				cpumask_test_cpu(cpu, tracing_cpumask_new)) { -			atomic_dec(&global_trace.data[cpu]->disabled); +			atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); +			ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);  		}  	} -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&tr->max_lock);  	local_irq_enable(); -	cpumask_copy(tracing_cpumask, tracing_cpumask_new); +	cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);  	mutex_unlock(&tracing_cpumask_update_lock);  	free_cpumask_var(tracing_cpumask_new); @@ -2423,21 +3429,23 @@ err_unlock:  }  static const struct file_operations tracing_cpumask_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.read		= tracing_cpumask_read,  	.write		= tracing_cpumask_write, +	.release	= tracing_release_generic_tr,  	.llseek		= generic_file_llseek,  };  static int tracing_trace_options_show(struct seq_file *m, void *v)  {  	struct tracer_opt *trace_opts; +	struct trace_array *tr = m->private;  	u32 tracer_flags;  	int i;  	mutex_lock(&trace_types_lock); -	tracer_flags = current_trace->flags->val; -	trace_opts = current_trace->flags->opts; +	tracer_flags = tr->current_trace->flags->val; +	trace_opts = tr->current_trace->flags->opts;  	for (i = 0; trace_options[i]; i++) {  		if (trace_flags & (1 << i)) @@ -2457,13 +3465,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)  	return 0;  } -static int __set_tracer_option(struct tracer *trace, +static int __set_tracer_option(struct trace_array *tr,  			       struct tracer_flags *tracer_flags,  			       struct tracer_opt *opts, int neg)  { +	struct tracer *trace = tr->current_trace;  	int ret; -	ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); +	ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);  	if (ret)  		return ret; @@ -2475,8 +3484,9 @@ static int __set_tracer_option(struct tracer *trace,  }  /* Try to assign a tracer specific option */ -static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)  { +	struct tracer *trace = tr->current_trace;  	struct tracer_flags *tracer_flags = trace->flags;  	struct tracer_opt *opts = NULL;  	int i; @@ -2485,18 +3495,31 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)  		opts = &tracer_flags->opts[i];  		if (strcmp(cmp, opts->name) == 0) -			return __set_tracer_option(trace, trace->flags, -						   opts, neg); +			return __set_tracer_option(tr, trace->flags, opts, neg);  	}  	return -EINVAL;  } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ +	if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) +		return -1; + +	return 0; +} + +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)  {  	/* do nothing if flag is already set */  	if (!!(trace_flags & mask) == !!enabled) -		return; +		return 0; + +	/* Give the tracer a chance to approve the change */ +	if (tr->current_trace->flag_changed) +		if (tr->current_trace->flag_changed(tr, mask, !!enabled)) +			return -EINVAL;  	if (enabled)  		
trace_flags |= mask; @@ -2505,47 +3528,72 @@ static void set_tracer_flags(unsigned int mask, int enabled)  	if (mask == TRACE_ITER_RECORD_CMD)  		trace_event_enable_cmd_record(enabled); + +	if (mask == TRACE_ITER_OVERWRITE) { +		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE +		ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#endif +	} + +	if (mask == TRACE_ITER_PRINTK) +		trace_printk_start_stop_comm(enabled); + +	return 0;  } -static ssize_t -tracing_trace_options_write(struct file *filp, const char __user *ubuf, -			size_t cnt, loff_t *ppos) +static int trace_set_options(struct trace_array *tr, char *option)  { -	char buf[64];  	char *cmp;  	int neg = 0; -	int ret; +	int ret = -ENODEV;  	int i; -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; -	cmp = strstrip(buf); +	cmp = strstrip(option);  	if (strncmp(cmp, "no", 2) == 0) {  		neg = 1;  		cmp += 2;  	} +	mutex_lock(&trace_types_lock); +  	for (i = 0; trace_options[i]; i++) {  		if (strcmp(cmp, trace_options[i]) == 0) { -			set_tracer_flags(1 << i, !neg); +			ret = set_tracer_flag(tr, 1 << i, !neg);  			break;  		}  	}  	/* If no option could be set, test the specific tracer options */ -	if (!trace_options[i]) { -		mutex_lock(&trace_types_lock); -		ret = set_tracer_option(current_trace, cmp, neg); -		mutex_unlock(&trace_types_lock); -		if (ret) -			return ret; -	} +	if (!trace_options[i]) +		ret = set_tracer_option(tr, cmp, neg); + +	mutex_unlock(&trace_types_lock); + +	return ret; +} + +static ssize_t +tracing_trace_options_write(struct file *filp, const char __user *ubuf, +			size_t cnt, loff_t *ppos) +{ +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private; +	char buf[64]; +	int ret; + +	if (cnt >= sizeof(buf)) +		return -EINVAL; + +	if (copy_from_user(&buf, ubuf, cnt)) +		return -EFAULT; + +	buf[cnt] = 0; + +	ret = trace_set_options(tr, buf); +	if (ret < 0) +		return ret;  	*ppos += cnt; @@ -2554,35 +3602,156 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,  static int tracing_trace_options_open(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private; +	int ret; +  	if (tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_trace_options_show, NULL); + +	if (trace_array_get(tr) < 0) +		return -ENODEV; + +	ret = single_open(file, tracing_trace_options_show, inode->i_private); +	if (ret < 0) +		trace_array_put(tr); + +	return ret;  }  static const struct file_operations tracing_iter_fops = {  	.open		= tracing_trace_options_open,  	.read		= seq_read,  	.llseek		= seq_lseek, -	.release	= single_release, +	.release	= tracing_single_release_tr,  	.write		= tracing_trace_options_write,  };  static const char readme_msg[] =  	"tracing mini-HOWTO:\n\n" -	"# mount -t debugfs nodev /sys/kernel/debug\n\n" -	"# cat /sys/kernel/debug/tracing/available_tracers\n" -	"wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" -	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"nop\n" -	"# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" -	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"sched_switch\n" -	"# cat /sys/kernel/debug/tracing/trace_options\n" -	"noprint-parent nosym-offset nosym-addr noverbose\n" -	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" -	"# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" -	"# cat /sys/kernel/debug/tracing/trace > 
/tmp/trace.txt\n" -	"# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" +	"# echo 0 > tracing_on : quick way to disable tracing\n" +	"# echo 1 > tracing_on : quick way to re-enable tracing\n\n" +	" Important files:\n" +	"  trace\t\t\t- The static contents of the buffer\n" +	"\t\t\t  To clear the buffer write into this file: echo > trace\n" +	"  trace_pipe\t\t- A consuming read to see the contents of the buffer\n" +	"  current_tracer\t- function and latency tracers\n" +	"  available_tracers\t- list of configured tracers for current_tracer\n" +	"  buffer_size_kb\t- view and modify size of per cpu buffer\n" +	"  buffer_total_size_kb  - view total size of all cpu buffers\n\n" +	"  trace_clock\t\t-change the clock used to order events\n" +	"       local:   Per cpu clock but may not be synced across CPUs\n" +	"      global:   Synced across CPUs but slows tracing down.\n" +	"     counter:   Not a clock, but just an increment\n" +	"      uptime:   Jiffy counter from time of boot\n" +	"        perf:   Same clock that perf events use\n" +#ifdef CONFIG_X86_64 +	"     x86-tsc:   TSC cycle counter\n" +#endif +	"\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n" +	"  tracing_cpumask\t- Limit which CPUs to trace\n" +	"  instances\t\t- Make sub-buffers with: mkdir instances/foo\n" +	"\t\t\t  Remove sub-buffer with rmdir\n" +	"  trace_options\t\t- Set format or modify how tracing happens\n" +	"\t\t\t  Disable an option by adding a suffix 'no' to the\n" +	"\t\t\t  option name\n" +	"  saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" +#ifdef CONFIG_DYNAMIC_FTRACE +	"\n  available_filter_functions - list of functions that can be filtered on\n" +	"  set_ftrace_filter\t- echo function name in here to only trace these\n" +	"\t\t\t  functions\n" +	"\t     accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" +	"\t     modules: Can select a group via module\n" +	"\t      Format: :mod:<module-name>\n" +	"\t     example: echo :mod:ext3 > set_ftrace_filter\n" +	"\t    triggers: a command to perform when function is hit\n" +	"\t      Format: <function>:<trigger>[:count]\n" +	"\t     trigger: traceon, traceoff\n" +	"\t\t      enable_event:<system>:<event>\n" +	"\t\t      disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE +	"\t\t      stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"\t\t      snapshot\n" +#endif +	"\t\t      dump\n" +	"\t\t      cpudump\n" +	"\t     example: echo do_fault:traceoff > set_ftrace_filter\n" +	"\t              echo do_trap:traceoff:3 > set_ftrace_filter\n" +	"\t     The first one will disable tracing every time do_fault is hit\n" +	"\t     The second will disable tracing at most 3 times when do_trap is hit\n" +	"\t       The first time do trap is hit and it disables tracing, the\n" +	"\t       counter will decrement to 2. If tracing is already disabled,\n" +	"\t       the counter will not decrement. 
It only decrements when the\n" +	"\t       trigger did work\n" +	"\t     To remove trigger without count:\n" +	"\t       echo '!<function>:<trigger> > set_ftrace_filter\n" +	"\t     To remove trigger with a count:\n" +	"\t       echo '!<function>:<trigger>:0 > set_ftrace_filter\n" +	"  set_ftrace_notrace\t- echo function name in here to never trace.\n" +	"\t    accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" +	"\t    modules: Can select a group via module command :mod:\n" +	"\t    Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER +	"  set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" +	"\t\t    (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	"  set_graph_function\t- Trace the nested calls of a function (function_graph)\n" +	"  max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"\n  snapshot\t\t- Like 'trace' but shows the content of the static\n" +	"\t\t\t  snapshot buffer. Read the contents for more\n" +	"\t\t\t  information\n" +#endif +#ifdef CONFIG_STACK_TRACER +	"  stack_trace\t\t- Shows the max stack trace when active\n" +	"  stack_max_size\t- Shows current max stack size that was traced\n" +	"\t\t\t  Write into this file to reset the max size (trigger a\n" +	"\t\t\t  new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE +	"  stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" +	"\t\t\t  traces\n" +#endif +#endif /* CONFIG_STACK_TRACER */ +	"  events/\t\t- Directory containing all trace event subsystems:\n" +	"      enable\t\t- Write 0/1 to enable/disable tracing of all events\n" +	"  events/<system>/\t- Directory containing all trace events for <system>:\n" +	"      enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" +	"\t\t\t  events\n" +	"      filter\t\t- If set, only events passing filter are traced\n" +	"  events/<system>/<event>/\t- Directory containing control files for\n" +	"\t\t\t  <event>:\n" +	"      enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" +	"      filter\t\t- If set, only events passing filter are traced\n" +	"      trigger\t\t- If set, a command to perform when event is hit\n" +	"\t    Format: <trigger>[:count][if <filter>]\n" +	"\t   trigger: traceon, traceoff\n" +	"\t            enable_event:<system>:<event>\n" +	"\t            disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE +	"\t\t    stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT +	"\t\t    snapshot\n" +#endif +	"\t   example: echo traceoff > events/block/block_unplug/trigger\n" +	"\t            echo traceoff:3 > events/block/block_unplug/trigger\n" +	"\t            echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" +	"\t                  events/block/block_unplug/trigger\n" +	"\t   The first disables tracing every time block_unplug is hit.\n" +	"\t   The second disables tracing the first 3 times block_unplug is hit.\n" +	"\t   The third enables the kmalloc event the first 3 times block_unplug\n" +	"\t     is hit and has value of greater than 1 for the 'nr_rq' event field.\n" +	"\t   Like function triggers, the counter is only decremented if it\n" +	"\t    enabled or disabled tracing.\n" +	"\t   To remove a trigger without a count:\n" +	"\t     echo '!<trigger> > <system>/<event>/trigger\n" +	"\t   To remove a trigger with a count:\n" +	"\t     echo '!<trigger>:0 > <system>/<event>/trigger\n" +	"\t   Filters can be ignored when removing a trigger.\n"  ;  static ssize_t @@ -2599,124 
+3768,165 @@ static const struct file_operations tracing_readme_fops = {  	.llseek		= generic_file_llseek,  }; -static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, -				size_t cnt, loff_t *ppos) +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)  { -	char *buf_comm; -	char *file_buf; -	char *buf; -	int len = 0; -	int pid; -	int i; +	unsigned int *ptr = v; -	file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); -	if (!file_buf) -		return -ENOMEM; +	if (*pos || m->count) +		ptr++; -	buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); -	if (!buf_comm) { -		kfree(file_buf); -		return -ENOMEM; +	(*pos)++; + +	for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; +	     ptr++) { +		if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) +			continue; + +		return ptr;  	} -	buf = file_buf; +	return NULL; +} -	for (i = 0; i < SAVED_CMDLINES; i++) { -		int r; +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ +	void *v; +	loff_t l = 0; -		pid = map_cmdline_to_pid[i]; -		if (pid == -1 || pid == NO_CMDLINE_MAP) -			continue; +	preempt_disable(); +	arch_spin_lock(&trace_cmdline_lock); -		trace_find_cmdline(pid, buf_comm); -		r = sprintf(buf, "%d %s\n", pid, buf_comm); -		buf += r; -		len += r; +	v = &savedcmd->map_cmdline_to_pid[0]; +	while (l <= *pos) { +		v = saved_cmdlines_next(m, v, &l); +		if (!v) +			return NULL;  	} -	len = simple_read_from_buffer(ubuf, cnt, ppos, -				      file_buf, len); +	return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ +	arch_spin_unlock(&trace_cmdline_lock); +	preempt_enable(); +} -	kfree(file_buf); -	kfree(buf_comm); +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ +	char buf[TASK_COMM_LEN]; +	unsigned int *pid = v; -	return len; +	__trace_find_cmdline(*pid, buf); +	seq_printf(m, "%d %s\n", *pid, buf); +	return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { +	.start		= saved_cmdlines_start, +	.next		= saved_cmdlines_next, +	.stop		= saved_cmdlines_stop, +	.show		= saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ +	if (tracing_disabled) +		return -ENODEV; + +	return seq_open(filp, &tracing_saved_cmdlines_seq_ops);  }  static const struct file_operations tracing_saved_cmdlines_fops = { -    .open       = tracing_open_generic, -    .read       = tracing_saved_cmdlines_read, -    .llseek	= generic_file_llseek, +	.open		= tracing_saved_cmdlines_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release,  };  static ssize_t -tracing_ctrl_read(struct file *filp, char __user *ubuf, -		  size_t cnt, loff_t *ppos) +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, +				 size_t cnt, loff_t *ppos)  {  	char buf[64];  	int r; -	r = sprintf(buf, "%u\n", tracer_enabled); +	arch_spin_lock(&trace_cmdline_lock); +	r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); +	arch_spin_unlock(&trace_cmdline_lock); +  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);  } +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ +	kfree(s->saved_cmdlines); +	kfree(s->map_cmdline_to_pid); +	kfree(s); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ +	struct saved_cmdlines_buffer *s, *savedcmd_temp; + +	s = kmalloc(sizeof(*s), GFP_KERNEL); +	if (!s) +		return -ENOMEM; + +	if (allocate_cmdlines_buffer(val, s) < 0) { +		kfree(s); +		return -ENOMEM; +	} + +	arch_spin_lock(&trace_cmdline_lock); +	
savedcmd_temp = savedcmd; +	savedcmd = s; +	arch_spin_unlock(&trace_cmdline_lock); +	free_saved_cmdlines_buffer(savedcmd_temp); + +	return 0; +} +  static ssize_t -tracing_ctrl_write(struct file *filp, const char __user *ubuf, -		   size_t cnt, loff_t *ppos) +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, +				  size_t cnt, loff_t *ppos)  { -	struct trace_array *tr = filp->private_data; -	char buf[64];  	unsigned long val;  	int ret; -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret) +		return ret; -	buf[cnt] = 0; +	/* must have at least 1 entry or less than PID_MAX_DEFAULT */ +	if (!val || val > PID_MAX_DEFAULT) +		return -EINVAL; -	ret = strict_strtoul(buf, 10, &val); +	ret = tracing_resize_saved_cmdlines((unsigned int)val);  	if (ret < 0)  		return ret; -	val = !!val; - -	mutex_lock(&trace_types_lock); -	if (tracer_enabled ^ val) { -		if (val) { -			tracer_enabled = 1; -			if (current_trace->start) -				current_trace->start(tr); -			tracing_start(); -		} else { -			tracer_enabled = 0; -			tracing_stop(); -			if (current_trace->stop) -				current_trace->stop(tr); -		} -	} -	mutex_unlock(&trace_types_lock); -  	*ppos += cnt;  	return cnt;  } +static const struct file_operations tracing_saved_cmdlines_size_fops = { +	.open		= tracing_open_generic, +	.read		= tracing_saved_cmdlines_size_read, +	.write		= tracing_saved_cmdlines_size_write, +}; +  static ssize_t  tracing_set_trace_read(struct file *filp, char __user *ubuf,  		       size_t cnt, loff_t *ppos)  { +	struct trace_array *tr = filp->private_data;  	char buf[MAX_TRACER_SIZE+2];  	int r;  	mutex_lock(&trace_types_lock); -	if (current_trace) -		r = sprintf(buf, "%s\n", current_trace->name); -	else -		r = sprintf(buf, "\n"); +	r = sprintf(buf, "%s\n", tr->current_trace->name);  	mutex_unlock(&trace_types_lock);  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -2724,11 +3934,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,  int tracer_init(struct tracer *t, struct trace_array *tr)  { -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	return t->init(tr);  } -static int tracing_resize_ring_buffer(unsigned long size) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) +{ +	int cpu; + +	for_each_tracing_cpu(cpu) +		per_cpu_ptr(buf->data, cpu)->entries = val; +} + +#ifdef CONFIG_TRACER_MAX_TRACE +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, +					struct trace_buffer *size_buf, int cpu_id) +{ +	int cpu, ret = 0; + +	if (cpu_id == RING_BUFFER_ALL_CPUS) { +		for_each_tracing_cpu(cpu) { +			ret = ring_buffer_resize(trace_buf->buffer, +				 per_cpu_ptr(size_buf->data, cpu)->entries, cpu); +			if (ret < 0) +				break; +			per_cpu_ptr(trace_buf->data, cpu)->entries = +				per_cpu_ptr(size_buf->data, cpu)->entries; +		} +	} else { +		ret = ring_buffer_resize(trace_buf->buffer, +				 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); +		if (ret == 0) +			per_cpu_ptr(trace_buf->data, cpu_id)->entries = +				per_cpu_ptr(size_buf->data, cpu_id)->entries; +	} + +	return ret; +} +#endif /* CONFIG_TRACER_MAX_TRACE */ + +static int __tracing_resize_ring_buffer(struct trace_array *tr, +					unsigned long size, int cpu)  {  	int ret; @@ -2737,21 +3984,25 @@ static int tracing_resize_ring_buffer(unsigned long size)  	 * we use the size that 
was given, and we can forget about  	 * expanding it later.  	 */ -	ring_buffer_expanded = 1; +	ring_buffer_expanded = true; -	ret = ring_buffer_resize(global_trace.buffer, size); +	/* May be called before buffers are initialized */ +	if (!tr->trace_buffer.buffer) +		return 0; + +	ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);  	if (ret < 0)  		return ret; -	if (!current_trace->use_max_tr) +#ifdef CONFIG_TRACER_MAX_TRACE +	if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || +	    !tr->current_trace->use_max_tr)  		goto out; -	ret = ring_buffer_resize(max_tr.buffer, size); +	ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);  	if (ret < 0) { -		int r; - -		r = ring_buffer_resize(global_trace.buffer, -				       global_trace.entries); +		int r = resize_buffer_duplicate_size(&tr->trace_buffer, +						     &tr->trace_buffer, cpu);  		if (r < 0) {  			/*  			 * AARGH! We are left with different @@ -2773,9 +4024,43 @@ static int tracing_resize_ring_buffer(unsigned long size)  		return ret;  	} -	max_tr.entries = size; +	if (cpu == RING_BUFFER_ALL_CPUS) +		set_buffer_entries(&tr->max_buffer, size); +	else +		per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; +   out: -	global_trace.entries = size; +#endif /* CONFIG_TRACER_MAX_TRACE */ + +	if (cpu == RING_BUFFER_ALL_CPUS) +		set_buffer_entries(&tr->trace_buffer, size); +	else +		per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; + +	return ret; +} + +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, +					  unsigned long size, int cpu_id) +{ +	int ret = size; + +	mutex_lock(&trace_types_lock); + +	if (cpu_id != RING_BUFFER_ALL_CPUS) { +		/* make sure, this cpu is enabled in the mask */ +		if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { +			ret = -EINVAL; +			goto out; +		} +	} + +	ret = __tracing_resize_ring_buffer(tr, size, cpu_id); +	if (ret < 0) +		ret = -ENOMEM; + +out: +	mutex_unlock(&trace_types_lock);  	return ret;  } @@ -2797,7 +4082,8 @@ int tracing_update_buffers(void)  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) -		ret = tracing_resize_ring_buffer(trace_buf_size); +		ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, +						RING_BUFFER_ALL_CPUS);  	mutex_unlock(&trace_types_lock);  	return ret; @@ -2806,22 +4092,42 @@ int tracing_update_buffers(void)  struct trace_option_dentry;  static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer);  static void  destroy_trace_option_files(struct trace_option_dentry *topts); -static int tracing_set_tracer(const char *buf) +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. 
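
The resize path above, __tracing_resize_ring_buffer() with either one CPU or RING_BUFFER_ALL_CPUS, is what user space reaches through the buffer_size_kb file mentioned in the README text earlier; writing 0 is rejected further down with -EINVAL since a buffer needs at least one entry. A short userspace example, assuming debugfs is mounted at /sys/kernel/debug as in the other examples in this file (root privileges are normally required):

    #include <stdio.h>

    /* Hypothetical helper: write a string into a tracing control file. */
    static int write_str(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f);       /* 0 on success, EOF on error */
    }

    int main(void)
    {
            const char *path = "/sys/kernel/debug/tracing/buffer_size_kb";
            char line[64] = "";
            FILE *f;

            /* Ask for 4096 KiB per CPU; "0" would be rejected. */
            if (write_str(path, "4096") != 0) {
                    perror(path);
                    return 1;
            }

            f = fopen(path, "r");
            if (f && fgets(line, sizeof(line), f))
                    printf("buffer_size_kb is now: %s", line);
            if (f)
                    fclose(f);
            return 0;
    }

A per-CPU size presumably takes the same route with that CPU's id instead of RING_BUFFER_ALL_CPUS, since tracing_entries_read()/write() above take the CPU from the inode.
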
+ */ +static void tracing_set_nop(struct trace_array *tr) +{ +	if (tr->current_trace == &nop_trace) +		return; +	 +	tr->current_trace->enabled--; + +	if (tr->current_trace->reset) +		tr->current_trace->reset(tr); + +	tr->current_trace = &nop_trace; +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf)  {  	static struct trace_option_dentry *topts; -	struct trace_array *tr = &global_trace;  	struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE +	bool had_max_tr; +#endif  	int ret = 0;  	mutex_lock(&trace_types_lock);  	if (!ring_buffer_expanded) { -		ret = tracing_resize_ring_buffer(trace_buf_size); +		ret = __tracing_resize_ring_buffer(tr, trace_buf_size, +						RING_BUFFER_ALL_CPUS);  		if (ret < 0)  			goto out;  		ret = 0; @@ -2835,32 +4141,53 @@ static int tracing_set_tracer(const char *buf)  		ret = -EINVAL;  		goto out;  	} -	if (t == current_trace) +	if (t == tr->current_trace)  		goto out; +	/* Some tracers are only allowed for the top level buffer */ +	if (!trace_ok_for_array(t, tr)) { +		ret = -EINVAL; +		goto out; +	} +  	trace_branch_disable(); -	if (current_trace && current_trace->reset) -		current_trace->reset(tr); -	if (current_trace && current_trace->use_max_tr) { + +	tr->current_trace->enabled--; + +	if (tr->current_trace->reset) +		tr->current_trace->reset(tr); + +	/* Current trace needs to be nop_trace before synchronize_sched */ +	tr->current_trace = &nop_trace; + +#ifdef CONFIG_TRACER_MAX_TRACE +	had_max_tr = tr->allocated_snapshot; + +	if (had_max_tr && !t->use_max_tr) {  		/* -		 * We don't free the ring buffer. instead, resize it because -		 * The max_tr ring buffer has some state (e.g. ring->clock) and -		 * we want preserve it. +		 * We need to make sure that the update_max_tr sees that +		 * current_trace changed to nop_trace to keep it from +		 * swapping the buffers after we resize it. +		 * The update_max_tr is called from interrupts disabled +		 * so a synchronized_sched() is sufficient.  		 
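
tracing_set_tracer() is driven from tracing_set_trace_write() further down, that is, by writing a tracer name into the current_tracer file listed in the README text; unknown names and tracers not allowed for the instance come back as -EINVAL. A short userspace example that selects the nop tracer and reads the setting back, with the same debugfs mount assumption as above:

    #include <stdio.h>

    int main(void)
    {
            const char *path = "/sys/kernel/debug/tracing/current_tracer";
            char name[64] = "";
            FILE *f;

            /* Write the tracer name; a trailing newline is fine, the
             * write handler strips trailing whitespace before the lookup. */
            f = fopen(path, "w");
            if (!f || fputs("nop\n", f) == EOF || fclose(f) != 0) {
                    perror(path);
                    return 1;
            }

            /* Read back what tracing_set_trace_read() reports. */
            f = fopen(path, "r");
            if (f && fgets(name, sizeof(name), f))
                    printf("current_tracer: %s", name);
            if (f)
                    fclose(f);
            return 0;
    }
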
*/ -		ring_buffer_resize(max_tr.buffer, 1); -		max_tr.entries = 1; +		synchronize_sched(); +		free_snapshot(tr); +	} +#endif +	/* Currently, only the top instance has options */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { +		destroy_trace_option_files(topts); +		topts = create_trace_option_files(tr, t);  	} -	destroy_trace_option_files(topts); - -	current_trace = t; -	topts = create_trace_option_files(current_trace); -	if (current_trace->use_max_tr) { -		ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); +#ifdef CONFIG_TRACER_MAX_TRACE +	if (t->use_max_tr && !had_max_tr) { +		ret = alloc_snapshot(tr);  		if (ret < 0)  			goto out; -		max_tr.entries = global_trace.entries;  	} +#endif  	if (t->init) {  		ret = tracer_init(t, tr); @@ -2868,6 +4195,8 @@ static int tracing_set_tracer(const char *buf)  			goto out;  	} +	tr->current_trace = t; +	tr->current_trace->enabled++;  	trace_branch_enable(tr);   out:  	mutex_unlock(&trace_types_lock); @@ -2879,6 +4208,7 @@ static ssize_t  tracing_set_trace_write(struct file *filp, const char __user *ubuf,  			size_t cnt, loff_t *ppos)  { +	struct trace_array *tr = filp->private_data;  	char buf[MAX_TRACER_SIZE+1];  	int i;  	size_t ret; @@ -2898,7 +4228,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,  	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)  		buf[i] = 0; -	err = tracing_set_tracer(buf); +	err = tracing_set_tracer(tr, buf);  	if (err)  		return err; @@ -2927,20 +4257,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,  		      size_t cnt, loff_t *ppos)  {  	unsigned long *ptr = filp->private_data; -	char buf[64];  	unsigned long val;  	int ret; -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; - -	ret = strict_strtoul(buf, 10, &val); -	if (ret < 0) +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret)  		return ret;  	*ptr = val * 1000; @@ -2950,19 +4271,23 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,  static int tracing_open_pipe(struct inode *inode, struct file *filp)  { -	long cpu_file = (long) inode->i_private; +	struct trace_array *tr = inode->i_private;  	struct trace_iterator *iter;  	int ret = 0;  	if (tracing_disabled)  		return -ENODEV; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	mutex_lock(&trace_types_lock);  	/* create a buffer to store the information to pass to userspace */  	iter = kzalloc(sizeof(*iter), GFP_KERNEL);  	if (!iter) {  		ret = -ENOMEM; +		__trace_array_put(tr);  		goto out;  	} @@ -2975,8 +4300,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  		ret = -ENOMEM;  		goto fail;  	} -	if (current_trace) -		*iter->trace = *current_trace; +	*iter->trace = *tr->current_trace;  	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {  		ret = -ENOMEM; @@ -2989,8 +4313,13 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  	if (trace_flags & TRACE_ITER_LATENCY_FMT)  		iter->iter_flags |= TRACE_FILE_LAT_FMT; -	iter->cpu_file = cpu_file; -	iter->tr = &global_trace; +	/* Output in nanoseconds only if we are using a clock in nanoseconds. 
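
tracing_open_pipe() here backs the trace_pipe file, the consuming interface from the README text: whatever read() returns is removed from the buffer, and an empty buffer makes the read wait (or, for a non-blocking open, fail with -EAGAIN, as the check in tracing_wait_pipe() below suggests). A minimal reader, with the usual debugfs path assumption:

    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);

            if (fd < 0) {
                    perror("trace_pipe");
                    return 1;
            }

            /* Consuming read: whatever is returned here is gone from the
             * buffer; read() waits until there is something to hand back. */
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    fwrite(buf, 1, (size_t)n, stdout);

            close(fd);
            return 0;
    }
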
*/ +	if (trace_clocks[tr->clock_id].in_ns) +		iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + +	iter->tr = tr; +	iter->trace_buffer = &tr->trace_buffer; +	iter->cpu_file = tracing_get_cpu(inode);  	mutex_init(&iter->mutex);  	filp->private_data = iter; @@ -3005,6 +4334,7 @@ out:  fail:  	kfree(iter->trace);  	kfree(iter); +	__trace_array_put(tr);  	mutex_unlock(&trace_types_lock);  	return ret;  } @@ -3012,6 +4342,7 @@ fail:  static int tracing_release_pipe(struct inode *inode, struct file *file)  {  	struct trace_iterator *iter = file->private_data; +	struct trace_array *tr = inode->i_private;  	mutex_lock(&trace_types_lock); @@ -3025,66 +4356,41 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)  	kfree(iter->trace);  	kfree(iter); +	trace_array_put(tr); +  	return 0;  }  static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)  { -	struct trace_iterator *iter = filp->private_data; +	/* Iterators are static, they should be filled or empty */ +	if (trace_buffer_iter(iter, iter->cpu_file)) +		return POLLIN | POLLRDNORM; -	if (trace_flags & TRACE_ITER_BLOCK) { +	if (trace_flags & TRACE_ITER_BLOCK)  		/*  		 * Always select as readable when in blocking mode  		 */  		return POLLIN | POLLRDNORM; -	} else { -		if (!trace_empty(iter)) -			return POLLIN | POLLRDNORM; -		poll_wait(filp, &trace_wait, poll_table); -		if (!trace_empty(iter)) -			return POLLIN | POLLRDNORM; - -		return 0; -	} +	else +		return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, +					     filp, poll_table);  } - -void default_wait_pipe(struct trace_iterator *iter) +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table)  { -	DEFINE_WAIT(wait); - -	prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); - -	if (trace_empty(iter)) -		schedule(); - -	finish_wait(&trace_wait, &wait); -} +	struct trace_iterator *iter = filp->private_data; -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - *  1) the current tracer might hold the runqueue lock when it wakes up - *     a reader, hence a deadlock (sched, function, and function graph tracers) - *  2) the function tracers, trace all functions, we don't want - *     the overhead of calling wake_up and friends - *     (and tracing them too) - * - *     Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ -	set_current_state(TASK_INTERRUPTIBLE); -	/* sleep for 100 msecs, and try again. */ -	schedule_timeout(HZ / 10); +	return trace_poll(iter, filp, poll_table);  }  /* Must be called with trace_types_lock mutex held. */  static int tracing_wait_pipe(struct file *filp)  {  	struct trace_iterator *iter = filp->private_data; +	int ret;  	while (trace_empty(iter)) { @@ -3092,15 +4398,6 @@ static int tracing_wait_pipe(struct file *filp)  			return -EAGAIN;  		} -		mutex_unlock(&iter->mutex); - -		iter->trace->wait_pipe(iter); - -		mutex_lock(&iter->mutex); - -		if (signal_pending(current)) -			return -EINTR; -  		/*  		 * We block until we read something and tracing is disabled.  		 * We still block if tracing is disabled, but we have never @@ -3110,8 +4407,20 @@ static int tracing_wait_pipe(struct file *filp)  		 *  		 * iter->pos will be 0 if we haven't read anything.  		 
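
With tracing_poll_pipe() now deferring to ring_buffer_poll_wait(), a reader can multiplex trace_pipe with poll() instead of blocking inside read(); combined with O_NONBLOCK and the -EAGAIN path above, this gives an ordinary event loop. A sketch of that usage, under the same path assumption as the previous example:

    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <errno.h>
    #include <poll.h>

    int main(void)
    {
            char buf[4096];
            int fd = open("/sys/kernel/debug/tracing/trace_pipe",
                          O_RDONLY | O_NONBLOCK);

            if (fd < 0) {
                    perror("trace_pipe");
                    return 1;
            }

            for (;;) {
                    struct pollfd pfd = { .fd = fd, .events = POLLIN };

                    /* Sleep until the ring buffer reports readable data. */
                    if (poll(&pfd, 1, -1) < 0) {
                            perror("poll");
                            break;
                    }

                    ssize_t n = read(fd, buf, sizeof(buf));
                    if (n > 0)
                            fwrite(buf, 1, (size_t)n, stdout);
                    else if (n < 0 && errno != EAGAIN)
                            break;  /* EAGAIN only means "try again" */
            }

            close(fd);
            return 0;
    }
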
*/ -		if (!tracer_enabled && iter->pos) +		if (!tracing_is_on() && iter->pos)  			break; + +		mutex_unlock(&iter->mutex); + +		ret = wait_on_pipe(iter); + +		mutex_lock(&iter->mutex); + +		if (ret) +			return ret; + +		if (signal_pending(current)) +			return -EINTR;  	}  	return 1; @@ -3125,7 +4434,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  		  size_t cnt, loff_t *ppos)  {  	struct trace_iterator *iter = filp->private_data; -	static struct tracer *old_tracer; +	struct trace_array *tr = iter->tr;  	ssize_t sret;  	/* return any leftover data */ @@ -3137,10 +4446,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(old_tracer != current_trace && current_trace)) { -		old_tracer = current_trace; -		*iter->trace = *current_trace; -	} +	if (unlikely(iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock);  	/* @@ -3173,6 +4480,7 @@ waitagain:  	memset(&iter->seq, 0,  	       sizeof(struct trace_iterator) -  	       offsetof(struct trace_iterator, seq)); +	cpumask_clear(iter->started);  	iter->pos = -1;  	trace_event_read_lock(); @@ -3192,6 +4500,14 @@ waitagain:  		if (iter->seq.len >= cnt)  			break; + +		/* +		 * Setting the full flag means we reached the trace_seq buffer +		 * size and we should leave by partial output condition above. +		 * One of the trace_seq_* functions is not used properly. +		 */ +		WARN_ONCE(iter->seq.full, "full flag set for trace type %d", +			  iter->ent->type);  	}  	trace_access_unlock(iter->cpu_file);  	trace_event_read_unlock(); @@ -3202,7 +4518,7 @@ waitagain:  		trace_seq_init(&iter->seq);  	/* -	 * If there was nothing to send to user, inspite of consuming trace +	 * If there was nothing to send to user, in spite of consuming trace  	 * entries, go back to wait for more entries.  	 */  	if (sret == -EBUSY) @@ -3214,12 +4530,6 @@ out:  	return sret;  } -static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, -				     struct pipe_buffer *buf) -{ -	__free_page(buf->page); -} -  static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,  				     unsigned int idx)  { @@ -3228,10 +4538,8 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,  static const struct pipe_buf_operations tracing_pipe_buf_ops = {  	.can_merge		= 0, -	.map			= generic_pipe_buf_map, -	.unmap			= generic_pipe_buf_unmap,  	.confirm		= generic_pipe_buf_confirm, -	.release		= tracing_pipe_buf_release, +	.release		= generic_pipe_buf_release,  	.steal			= generic_pipe_buf_steal,  	.get			= generic_pipe_buf_get,  }; @@ -3283,11 +4591,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  		.pages		= pages_def,  		.partial	= partial_def,  		.nr_pages	= 0, /* This gets updated below. 
*/ +		.nr_pages_max	= PIPE_DEF_BUFFERS,  		.flags		= flags,  		.ops		= &tracing_pipe_buf_ops,  		.spd_release	= tracing_spd_release_pipe,  	}; -	static struct tracer *old_tracer; +	struct trace_array *tr = iter->tr;  	ssize_t ret;  	size_t rem;  	unsigned int i; @@ -3297,10 +4606,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  	/* copy the tracer to avoid using a global lock all around */  	mutex_lock(&trace_types_lock); -	if (unlikely(old_tracer != current_trace && current_trace)) { -		old_tracer = current_trace; -		*iter->trace = *current_trace; -	} +	if (unlikely(iter->trace->name != tr->current_trace->name)) +		*iter->trace = *tr->current_trace;  	mutex_unlock(&trace_types_lock);  	mutex_lock(&iter->mutex); @@ -3325,7 +4632,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  	trace_access_lock(iter->cpu_file);  	/* Fill as many pages as possible. */ -	for (i = 0, rem = len; i < pipe->buffers && rem; i++) { +	for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {  		spd.pages[i] = alloc_page(GFP_KERNEL);  		if (!spd.pages[i])  			break; @@ -3354,7 +4661,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  	ret = splice_to_pipe(pipe, &spd);  out: -	splice_shrink_spd(pipe, &spd); +	splice_shrink_spd(&spd);  	return ret;  out_err: @@ -3366,156 +4673,294 @@ static ssize_t  tracing_entries_read(struct file *filp, char __user *ubuf,  		     size_t cnt, loff_t *ppos)  { -	struct trace_array *tr = filp->private_data; -	char buf[96]; -	int r; +	struct inode *inode = file_inode(filp); +	struct trace_array *tr = inode->i_private; +	int cpu = tracing_get_cpu(inode); +	char buf[64]; +	int r = 0; +	ssize_t ret;  	mutex_lock(&trace_types_lock); -	if (!ring_buffer_expanded) -		r = sprintf(buf, "%lu (expanded: %lu)\n", -			    tr->entries >> 10, -			    trace_buf_size >> 10); -	else -		r = sprintf(buf, "%lu\n", tr->entries >> 10); + +	if (cpu == RING_BUFFER_ALL_CPUS) { +		int cpu, buf_size_same; +		unsigned long size; + +		size = 0; +		buf_size_same = 1; +		/* check if all cpu sizes are same */ +		for_each_tracing_cpu(cpu) { +			/* fill in the size from first enabled cpu */ +			if (size == 0) +				size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; +			if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { +				buf_size_same = 0; +				break; +			} +		} + +		if (buf_size_same) { +			if (!ring_buffer_expanded) +				r = sprintf(buf, "%lu (expanded: %lu)\n", +					    size >> 10, +					    trace_buf_size >> 10); +			else +				r = sprintf(buf, "%lu\n", size >> 10); +		} else +			r = sprintf(buf, "X\n"); +	} else +		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10); +  	mutex_unlock(&trace_types_lock); -	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +	return ret;  }  static ssize_t  tracing_entries_write(struct file *filp, const char __user *ubuf,  		      size_t cnt, loff_t *ppos)  { +	struct inode *inode = file_inode(filp); +	struct trace_array *tr = inode->i_private;  	unsigned long val; -	char buf[64]; -	int ret, cpu; - -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; +	int ret; -	ret = strict_strtoul(buf, 10, &val); -	if (ret < 0) +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret)  		return ret;  	/* must have at least 1 entry */  	if (!val)  		return -EINVAL; -	mutex_lock(&trace_types_lock); - -	tracing_stop(); - -	/* disable all cpu buffers */ -	
for_each_tracing_cpu(cpu) { -		if (global_trace.data[cpu]) -			atomic_inc(&global_trace.data[cpu]->disabled); -		if (max_tr.data[cpu]) -			atomic_inc(&max_tr.data[cpu]->disabled); -	} -  	/* value is in KB */  	val <<= 10; - -	if (val != global_trace.entries) { -		ret = tracing_resize_ring_buffer(val); -		if (ret < 0) { -			cnt = ret; -			goto out; -		} -	} +	ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); +	if (ret < 0) +		return ret;  	*ppos += cnt; -	/* If check pages failed, return ENOMEM */ -	if (tracing_disabled) -		cnt = -ENOMEM; - out: +	return cnt; +} + +static ssize_t +tracing_total_entries_read(struct file *filp, char __user *ubuf, +				size_t cnt, loff_t *ppos) +{ +	struct trace_array *tr = filp->private_data; +	char buf[64]; +	int r, cpu; +	unsigned long size = 0, expanded_size = 0; + +	mutex_lock(&trace_types_lock);  	for_each_tracing_cpu(cpu) { -		if (global_trace.data[cpu]) -			atomic_dec(&global_trace.data[cpu]->disabled); -		if (max_tr.data[cpu]) -			atomic_dec(&max_tr.data[cpu]->disabled); +		size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; +		if (!ring_buffer_expanded) +			expanded_size += trace_buf_size >> 10;  	} - -	tracing_start(); +	if (ring_buffer_expanded) +		r = sprintf(buf, "%lu\n", size); +	else +		r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);  	mutex_unlock(&trace_types_lock); +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_free_buffer_write(struct file *filp, const char __user *ubuf, +			  size_t cnt, loff_t *ppos) +{ +	/* +	 * There is no need to read what the user has written, this function +	 * is just to make sure that there is no error when "echo" is used +	 */ + +	*ppos += cnt; +  	return cnt;  } -static int mark_printk(const char *fmt, ...) +static int +tracing_free_buffer_release(struct inode *inode, struct file *filp)  { -	int ret; -	va_list args; -	va_start(args, fmt); -	ret = trace_vprintk(0, fmt, args); -	va_end(args); -	return ret; +	struct trace_array *tr = inode->i_private; + +	/* disable tracing ? */ +	if (trace_flags & TRACE_ITER_STOP_ON_FREE) +		tracer_tracing_off(tr); +	/* resize the ring buffer to 0 */ +	tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); + +	trace_array_put(tr); + +	return 0;  }  static ssize_t  tracing_mark_write(struct file *filp, const char __user *ubuf,  					size_t cnt, loff_t *fpos)  { -	char *buf; -	size_t written; +	unsigned long addr = (unsigned long)ubuf; +	struct trace_array *tr = filp->private_data; +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	struct print_entry *entry; +	unsigned long irq_flags; +	struct page *pages[2]; +	void *map_page[2]; +	int nr_pages = 1; +	ssize_t written; +	int offset; +	int size; +	int len; +	int ret; +	int i;  	if (tracing_disabled)  		return -EINVAL; +	if (!(trace_flags & TRACE_ITER_MARKERS)) +		return -EINVAL; +  	if (cnt > TRACE_BUF_SIZE)  		cnt = TRACE_BUF_SIZE; -	buf = kmalloc(cnt + 2, GFP_KERNEL); -	if (buf == NULL) -		return -ENOMEM; +	/* +	 * Userspace is injecting traces into the kernel trace buffer. +	 * We want to be as non intrusive as possible. +	 * To do so, we do not want to allocate any special buffers +	 * or take any locks, but instead write the userspace data +	 * straight into the ring buffer. +	 * +	 * First we need to pin the userspace buffer into memory, +	 * which, most likely it is, because it just referenced it. +	 * But there's no guarantee that it is. 
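(As a side note on the arithmetic used further down: whether the user buffer needs one pinned page or two depends only on addr and cnt. A stand-alone sketch of the same check, assuming a 4096-byte page purely for illustration:)

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/*
 * Return how many pages the write touches: 2 when addr and addr + cnt
 * land on different pages, otherwise 1 -- the same test applied below
 * before calling get_user_pages_fast().
 */
static int pages_needed(unsigned long addr, unsigned long cnt)
{
	if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
		return 2;
	return 1;
}

int main(void)
{
	/* 100 bytes starting 50 bytes before a page boundary span two pages. */
	printf("%d\n", pages_needed(2 * PAGE_SIZE - 50, 100)); /* prints 2 */
	/* The same 100 bytes starting exactly on a page boundary fit in one. */
	printf("%d\n", pages_needed(2 * PAGE_SIZE, 100));      /* prints 1 */
	return 0;
}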
By using get_user_pages_fast() +	 * and kmap_atomic/kunmap_atomic() we can get access to the +	 * pages directly. We then write the data directly into the +	 * ring buffer. +	 */ +	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); -	if (copy_from_user(buf, ubuf, cnt)) { -		kfree(buf); -		return -EFAULT; +	/* check if we cross pages */ +	if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) +		nr_pages = 2; + +	offset = addr & (PAGE_SIZE - 1); +	addr &= PAGE_MASK; + +	ret = get_user_pages_fast(addr, nr_pages, 0, pages); +	if (ret < nr_pages) { +		while (--ret >= 0) +			put_page(pages[ret]); +		written = -EFAULT; +		goto out; +	} + +	for (i = 0; i < nr_pages; i++) +		map_page[i] = kmap_atomic(pages[i]); + +	local_save_flags(irq_flags); +	size = sizeof(*entry) + cnt + 2; /* possible \n added */ +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, +					  irq_flags, preempt_count()); +	if (!event) { +		/* Ring buffer disabled, return as if not open for write */ +		written = -EBADF; +		goto out_unlock;  	} -	if (buf[cnt-1] != '\n') { -		buf[cnt] = '\n'; -		buf[cnt+1] = '\0'; + +	entry = ring_buffer_event_data(event); +	entry->ip = _THIS_IP_; + +	if (nr_pages == 2) { +		len = PAGE_SIZE - offset; +		memcpy(&entry->buf, map_page[0] + offset, len); +		memcpy(&entry->buf[len], map_page[1], cnt - len);  	} else -		buf[cnt] = '\0'; +		memcpy(&entry->buf, map_page[0] + offset, cnt); -	written = mark_printk("%s", buf); -	kfree(buf); -	*fpos += written; +	if (entry->buf[cnt - 1] != '\n') { +		entry->buf[cnt] = '\n'; +		entry->buf[cnt + 1] = '\0'; +	} else +		entry->buf[cnt] = '\0'; + +	__buffer_unlock_commit(buffer, event); -	/* don't tell userspace we wrote more - it might confuse them */ -	if (written > cnt) -		written = cnt; +	written = cnt; + +	*fpos += written; + out_unlock: +	for (i = 0; i < nr_pages; i++){ +		kunmap_atomic(map_page[i]); +		put_page(pages[i]); +	} + out:  	return written;  }  static int tracing_clock_show(struct seq_file *m, void *v)  { +	struct trace_array *tr = m->private;  	int i;  	for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)  		seq_printf(m,  			"%s%s%s%s", i ? " " : "", -			i == trace_clock_id ? "[" : "", trace_clocks[i].name, -			i == trace_clock_id ? "]" : ""); +			i == tr->clock_id ? "[" : "", trace_clocks[i].name, +			i == tr->clock_id ? "]" : "");  	seq_putc(m, '\n');  	return 0;  } +static int tracing_set_clock(struct trace_array *tr, const char *clockstr) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { +		if (strcmp(trace_clocks[i].name, clockstr) == 0) +			break; +	} +	if (i == ARRAY_SIZE(trace_clocks)) +		return -EINVAL; + +	mutex_lock(&trace_types_lock); + +	tr->clock_id = i; + +	ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); + +	/* +	 * New clock may not be consistent with the previous clock. +	 * Reset the buffer so that it doesn't have incomparable timestamps. 
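(From user space this clock-switch path is driven through the trace_clock file. A hedged sketch of that interaction, assuming the usual /sys/kernel/debug/tracing mount point and the "global" clock as the example target:)

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define CLOCK_FILE "/sys/kernel/debug/tracing/trace_clock"

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd;

	/* Reading lists the available clocks; the active one is in brackets. */
	fd = open(CLOCK_FILE, O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	close(fd);

	/* Writing a clock name switches it and, as noted above, resets the buffers. */
	fd = open(CLOCK_FILE, O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "global", strlen("global")) < 0)
		perror("write trace_clock");
	close(fd);
	return 0;
}

(Reading prints what tracing_clock_show() formats; writing a name lands in tracing_set_clock().)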
+	 */ +	tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) +		ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); +	tracing_reset_online_cpus(&tr->max_buffer); +#endif + +	mutex_unlock(&trace_types_lock); + +	return 0; +} +  static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  				   size_t cnt, loff_t *fpos)  { +	struct seq_file *m = filp->private_data; +	struct trace_array *tr = m->private;  	char buf[64];  	const char *clockstr; -	int i; +	int ret;  	if (cnt >= sizeof(buf))  		return -EINVAL; @@ -3527,35 +4972,204 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  	clockstr = strstrip(buf); -	for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { -		if (strcmp(trace_clocks[i].name, clockstr) == 0) -			break; +	ret = tracing_set_clock(tr, clockstr); +	if (ret) +		return ret; + +	*fpos += cnt; + +	return cnt; +} + +static int tracing_clock_open(struct inode *inode, struct file *file) +{ +	struct trace_array *tr = inode->i_private; +	int ret; + +	if (tracing_disabled) +		return -ENODEV; + +	if (trace_array_get(tr)) +		return -ENODEV; + +	ret = single_open(file, tracing_clock_show, inode->i_private); +	if (ret < 0) +		trace_array_put(tr); + +	return ret; +} + +struct ftrace_buffer_info { +	struct trace_iterator	iter; +	void			*spare; +	unsigned int		read; +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ +	struct trace_array *tr = inode->i_private; +	struct trace_iterator *iter; +	struct seq_file *m; +	int ret = 0; + +	if (trace_array_get(tr) < 0) +		return -ENODEV; + +	if (file->f_mode & FMODE_READ) { +		iter = __tracing_open(inode, file, true); +		if (IS_ERR(iter)) +			ret = PTR_ERR(iter); +	} else { +		/* Writes still need the seq_file to hold the private data */ +		ret = -ENOMEM; +		m = kzalloc(sizeof(*m), GFP_KERNEL); +		if (!m) +			goto out; +		iter = kzalloc(sizeof(*iter), GFP_KERNEL); +		if (!iter) { +			kfree(m); +			goto out; +		} +		ret = 0; + +		iter->tr = tr; +		iter->trace_buffer = &tr->max_buffer; +		iter->cpu_file = tracing_get_cpu(inode); +		m->private = iter; +		file->private_data = m;  	} -	if (i == ARRAY_SIZE(trace_clocks)) -		return -EINVAL; +out: +	if (ret < 0) +		trace_array_put(tr); -	trace_clock_id = i; +	return ret; +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, +		       loff_t *ppos) +{ +	struct seq_file *m = filp->private_data; +	struct trace_iterator *iter = m->private; +	struct trace_array *tr = iter->tr; +	unsigned long val; +	int ret; + +	ret = tracing_update_buffers(); +	if (ret < 0) +		return ret; + +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret) +		return ret;  	mutex_lock(&trace_types_lock); -	ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); -	if (max_tr.buffer) -		ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); +	if (tr->current_trace->use_max_tr) { +		ret = -EBUSY; +		goto out; +	} + +	switch (val) { +	case 0: +		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { +			ret = -EINVAL; +			break; +		} +		if (tr->allocated_snapshot) +			free_snapshot(tr); +		break; +	case 1: +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP +		if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { +			ret = -EINVAL; +			break; +		} +#endif +		if (!tr->allocated_snapshot) { +			ret = alloc_snapshot(tr); +			if (ret < 0) +	
			break; +		} +		local_irq_disable(); +		/* Now, we're going to swap */ +		if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +			update_max_tr(tr, current, smp_processor_id()); +		else +			update_max_tr_single(tr, current, iter->cpu_file); +		local_irq_enable(); +		break; +	default: +		if (tr->allocated_snapshot) { +			if (iter->cpu_file == RING_BUFFER_ALL_CPUS) +				tracing_reset_online_cpus(&tr->max_buffer); +			else +				tracing_reset(&tr->max_buffer, iter->cpu_file); +		} +		break; +	} +	if (ret >= 0) { +		*ppos += cnt; +		ret = cnt; +	} +out:  	mutex_unlock(&trace_types_lock); +	return ret; +} -	*fpos += cnt; +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ +	struct seq_file *m = file->private_data; +	int ret; -	return cnt; +	ret = tracing_release(inode, file); + +	if (file->f_mode & FMODE_READ) +		return ret; + +	/* If write only, the seq_file is just a stub */ +	if (m) +		kfree(m->private); +	kfree(m); + +	return 0;  } -static int tracing_clock_open(struct inode *inode, struct file *file) +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, +				    size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, +		   struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp)  { -	if (tracing_disabled) -		return -ENODEV; -	return single_open(file, tracing_clock_show, NULL); +	struct ftrace_buffer_info *info; +	int ret; + +	ret = tracing_buffers_open(inode, filp); +	if (ret < 0) +		return ret; + +	info = filp->private_data; + +	if (info->iter.trace->use_max_tr) { +		tracing_buffers_release(inode, filp); +		return -EBUSY; +	} + +	info->iter.snapshot = true; +	info->iter.trace_buffer = &info->iter.tr->max_buffer; + +	return ret;  } +#endif /* CONFIG_TRACER_SNAPSHOT */ + +  static const struct file_operations tracing_max_lat_fops = {  	.open		= tracing_open_generic,  	.read		= tracing_max_lat_read, @@ -3563,13 +5177,6 @@ static const struct file_operations tracing_max_lat_fops = {  	.llseek		= generic_file_llseek,  }; -static const struct file_operations tracing_ctrl_fops = { -	.open		= tracing_open_generic, -	.read		= tracing_ctrl_read, -	.write		= tracing_ctrl_write, -	.llseek		= generic_file_llseek, -}; -  static const struct file_operations set_tracer_fops = {  	.open		= tracing_open_generic,  	.read		= tracing_set_trace_read, @@ -3587,54 +5194,106 @@ static const struct file_operations tracing_pipe_fops = {  };  static const struct file_operations tracing_entries_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.read		= tracing_entries_read,  	.write		= tracing_entries_write,  	.llseek		= generic_file_llseek, +	.release	= tracing_release_generic_tr, +}; + +static const struct file_operations tracing_total_entries_fops = { +	.open		= tracing_open_generic_tr, +	.read		= tracing_total_entries_read, +	.llseek		= generic_file_llseek, +	.release	= tracing_release_generic_tr, +}; + +static const struct file_operations tracing_free_buffer_fops = { +	.open		= tracing_open_generic_tr, +	.write		= tracing_free_buffer_write, +	.release	= tracing_free_buffer_release,  };  static const struct file_operations tracing_mark_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.write		= tracing_mark_write,  	.llseek		= 
generic_file_llseek, +	.release	= tracing_release_generic_tr,  };  static const struct file_operations trace_clock_fops = {  	.open		= tracing_clock_open,  	.read		= seq_read,  	.llseek		= seq_lseek, -	.release	= single_release, +	.release	= tracing_single_release_tr,  	.write		= tracing_clock_write,  }; -struct ftrace_buffer_info { -	struct trace_array	*tr; -	void			*spare; -	int			cpu; -	unsigned int		read; +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct file_operations snapshot_fops = { +	.open		= tracing_snapshot_open, +	.read		= seq_read, +	.write		= tracing_snapshot_write, +	.llseek		= tracing_lseek, +	.release	= tracing_snapshot_release,  }; +static const struct file_operations snapshot_raw_fops = { +	.open		= snapshot_raw_open, +	.read		= tracing_buffers_read, +	.release	= tracing_buffers_release, +	.splice_read	= tracing_buffers_splice_read, +	.llseek		= no_llseek, +}; + +#endif /* CONFIG_TRACER_SNAPSHOT */ +  static int tracing_buffers_open(struct inode *inode, struct file *filp)  { -	int cpu = (int)(long)inode->i_private; +	struct trace_array *tr = inode->i_private;  	struct ftrace_buffer_info *info; +	int ret;  	if (tracing_disabled)  		return -ENODEV; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	info = kzalloc(sizeof(*info), GFP_KERNEL); -	if (!info) +	if (!info) { +		trace_array_put(tr);  		return -ENOMEM; +	} -	info->tr	= &global_trace; -	info->cpu	= cpu; -	info->spare	= NULL; +	mutex_lock(&trace_types_lock); + +	info->iter.tr		= tr; +	info->iter.cpu_file	= tracing_get_cpu(inode); +	info->iter.trace	= tr->current_trace; +	info->iter.trace_buffer = &tr->trace_buffer; +	info->spare		= NULL;  	/* Force reading ring buffer for first read */ -	info->read	= (unsigned int)-1; +	info->read		= (unsigned int)-1;  	filp->private_data = info; -	return nonseekable_open(inode, filp); +	mutex_unlock(&trace_types_lock); + +	ret = nonseekable_open(inode, filp); +	if (ret < 0) +		trace_array_put(tr); + +	return ret; +} + +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ +	struct ftrace_buffer_info *info = filp->private_data; +	struct trace_iterator *iter = &info->iter; + +	return trace_poll(iter, filp, poll_table);  }  static ssize_t @@ -3642,56 +5301,101 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,  		     size_t count, loff_t *ppos)  {  	struct ftrace_buffer_info *info = filp->private_data; +	struct trace_iterator *iter = &info->iter;  	ssize_t ret; -	size_t size; +	ssize_t size;  	if (!count)  		return 0; +	mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->tr->current_trace->use_max_tr) { +		size = -EBUSY; +		goto out_unlock; +	} +#endif +  	if (!info->spare) -		info->spare = ring_buffer_alloc_read_page(info->tr->buffer); +		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, +							  iter->cpu_file); +	size = -ENOMEM;  	if (!info->spare) -		return -ENOMEM; +		goto out_unlock;  	/* Do we have previous read data to read? 
*/  	if (info->read < PAGE_SIZE)  		goto read; -	info->read = 0; - -	trace_access_lock(info->cpu); -	ret = ring_buffer_read_page(info->tr->buffer, + again: +	trace_access_lock(iter->cpu_file); +	ret = ring_buffer_read_page(iter->trace_buffer->buffer,  				    &info->spare,  				    count, -				    info->cpu, 0); -	trace_access_unlock(info->cpu); -	if (ret < 0) -		return 0; +				    iter->cpu_file, 0); +	trace_access_unlock(iter->cpu_file); -read: +	if (ret < 0) { +		if (trace_empty(iter)) { +			if ((filp->f_flags & O_NONBLOCK)) { +				size = -EAGAIN; +				goto out_unlock; +			} +			mutex_unlock(&trace_types_lock); +			ret = wait_on_pipe(iter); +			mutex_lock(&trace_types_lock); +			if (ret) { +				size = ret; +				goto out_unlock; +			} +			if (signal_pending(current)) { +				size = -EINTR; +				goto out_unlock; +			} +			goto again; +		} +		size = 0; +		goto out_unlock; +	} + +	info->read = 0; + read:  	size = PAGE_SIZE - info->read;  	if (size > count)  		size = count;  	ret = copy_to_user(ubuf, info->spare + info->read, size); -	if (ret == size) -		return -EFAULT; +	if (ret == size) { +		size = -EFAULT; +		goto out_unlock; +	}  	size -= ret;  	*ppos += size;  	info->read += size; + out_unlock: +	mutex_unlock(&trace_types_lock); +  	return size;  }  static int tracing_buffers_release(struct inode *inode, struct file *file)  {  	struct ftrace_buffer_info *info = file->private_data; +	struct trace_iterator *iter = &info->iter; + +	mutex_lock(&trace_types_lock); + +	__trace_array_put(iter->tr);  	if (info->spare) -		ring_buffer_free_read_page(info->tr->buffer, info->spare); +		ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);  	kfree(info); +	mutex_unlock(&trace_types_lock); +  	return 0;  } @@ -3714,12 +5418,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,  	buf->private = 0;  } -static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, -				 struct pipe_buffer *buf) -{ -	return 1; -} -  static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,  				struct pipe_buffer *buf)  { @@ -3731,11 +5429,9 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,  /* Pipe buffer operations for a buffer. 
*/  static const struct pipe_buf_operations buffer_pipe_buf_ops = {  	.can_merge		= 0, -	.map			= generic_pipe_buf_map, -	.unmap			= generic_pipe_buf_unmap,  	.confirm		= generic_pipe_buf_confirm,  	.release		= buffer_pipe_buf_release, -	.steal			= buffer_pipe_buf_steal, +	.steal			= generic_pipe_buf_steal,  	.get			= buffer_pipe_buf_get,  }; @@ -3762,30 +5458,41 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			    unsigned int flags)  {  	struct ftrace_buffer_info *info = file->private_data; +	struct trace_iterator *iter = &info->iter;  	struct partial_page partial_def[PIPE_DEF_BUFFERS];  	struct page *pages_def[PIPE_DEF_BUFFERS];  	struct splice_pipe_desc spd = {  		.pages		= pages_def,  		.partial	= partial_def, +		.nr_pages_max	= PIPE_DEF_BUFFERS,  		.flags		= flags,  		.ops		= &buffer_pipe_buf_ops,  		.spd_release	= buffer_spd_release,  	};  	struct buffer_ref *ref;  	int entries, size, i; -	size_t ret; +	ssize_t ret; -	if (splice_grow_spd(pipe, &spd)) -		return -ENOMEM; +	mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE +	if (iter->snapshot && iter->tr->current_trace->use_max_tr) { +		ret = -EBUSY; +		goto out; +	} +#endif + +	if (splice_grow_spd(pipe, &spd)) { +		ret = -ENOMEM; +		goto out; +	}  	if (*ppos & (PAGE_SIZE - 1)) { -		WARN_ONCE(1, "Ftrace: previous read must page-align\n");  		ret = -EINVAL;  		goto out;  	}  	if (len & (PAGE_SIZE - 1)) { -		WARN_ONCE(1, "Ftrace: splice_read should page-align\n");  		if (len < PAGE_SIZE) {  			ret = -EINVAL;  			goto out; @@ -3793,10 +5500,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		len &= PAGE_MASK;  	} -	trace_access_lock(info->cpu); -	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + again: +	trace_access_lock(iter->cpu_file); +	entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); -	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { +	for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {  		struct page *page;  		int r; @@ -3805,18 +5513,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  			break;  		ref->ref = 1; -		ref->buffer = info->tr->buffer; -		ref->page = ring_buffer_alloc_read_page(ref->buffer); +		ref->buffer = iter->trace_buffer->buffer; +		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);  		if (!ref->page) {  			kfree(ref);  			break;  		}  		r = ring_buffer_read_page(ref->buffer, &ref->page, -					  len, info->cpu, 1); +					  len, iter->cpu_file, 1);  		if (r < 0) { -			ring_buffer_free_read_page(ref->buffer, -						   ref->page); +			ring_buffer_free_read_page(ref->buffer, ref->page);  			kfree(ref);  			break;  		} @@ -3838,31 +5545,42 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  		spd.nr_pages++;  		*ppos += PAGE_SIZE; -		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); +		entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);  	} -	trace_access_unlock(info->cpu); +	trace_access_unlock(iter->cpu_file);  	spd.nr_pages = i;  	/* did we read anything? 
*/  	if (!spd.nr_pages) { -		if (flags & SPLICE_F_NONBLOCK) +		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {  			ret = -EAGAIN; -		else -			ret = 0; -		/* TODO: block */ -		goto out; +			goto out; +		} +		mutex_unlock(&trace_types_lock); +		ret = wait_on_pipe(iter); +		mutex_lock(&trace_types_lock); +		if (ret) +			goto out; +		if (signal_pending(current)) { +			ret = -EINTR; +			goto out; +		} +		goto again;  	}  	ret = splice_to_pipe(pipe, &spd); -	splice_shrink_spd(pipe, &spd); +	splice_shrink_spd(&spd);  out: +	mutex_unlock(&trace_types_lock); +  	return ret;  }  static const struct file_operations tracing_buffers_fops = {  	.open		= tracing_buffers_open,  	.read		= tracing_buffers_read, +	.poll		= tracing_buffers_poll,  	.release	= tracing_buffers_release,  	.splice_read	= tracing_buffers_splice_read,  	.llseek		= no_llseek, @@ -3872,10 +5590,14 @@ static ssize_t  tracing_stats_read(struct file *filp, char __user *ubuf,  		   size_t count, loff_t *ppos)  { -	unsigned long cpu = (unsigned long)filp->private_data; -	struct trace_array *tr = &global_trace; +	struct inode *inode = file_inode(filp); +	struct trace_array *tr = inode->i_private; +	struct trace_buffer *trace_buf = &tr->trace_buffer; +	int cpu = tracing_get_cpu(inode);  	struct trace_seq *s;  	unsigned long cnt; +	unsigned long long t; +	unsigned long usec_rem;  	s = kmalloc(sizeof(*s), GFP_KERNEL);  	if (!s) @@ -3883,15 +5605,43 @@ tracing_stats_read(struct file *filp, char __user *ubuf,  	trace_seq_init(s); -	cnt = ring_buffer_entries_cpu(tr->buffer, cpu); +	cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "entries: %ld\n", cnt); -	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); +	cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "overrun: %ld\n", cnt); -	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); +	cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);  	trace_seq_printf(s, "commit overrun: %ld\n", cnt); +	cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); +	trace_seq_printf(s, "bytes: %ld\n", cnt); + +	if (trace_clocks[tr->clock_id].in_ns) { +		/* local or global for trace_clock */ +		t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); +		usec_rem = do_div(t, USEC_PER_SEC); +		trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", +								t, usec_rem); + +		t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); +		usec_rem = do_div(t, USEC_PER_SEC); +		trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); +	} else { +		/* counter or tsc mode for trace_clock */ +		trace_seq_printf(s, "oldest event ts: %llu\n", +				ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); + +		trace_seq_printf(s, "now ts: %llu\n", +				ring_buffer_time_stamp(trace_buf->buffer, cpu)); +	} + +	cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); +	trace_seq_printf(s, "dropped events: %ld\n", cnt); + +	cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); +	trace_seq_printf(s, "read events: %ld\n", cnt); +  	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);  	kfree(s); @@ -3900,9 +5650,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf,  }  static const struct file_operations tracing_stats_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.read		= tracing_stats_read,  	.llseek		= generic_file_llseek, +	.release	= tracing_release_generic_tr,  };  #ifdef CONFIG_DYNAMIC_FTRACE @@ -3941,63 +5692,177 @@ static const struct file_operations 
tracing_dyn_info_fops = {  	.read		= tracing_read_dyn_info,  	.llseek		= generic_file_llseek,  }; -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ -static struct dentry *d_tracer; +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ +	tracing_snapshot(); +} -struct dentry *tracing_init_dentry(void) +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)  { -	static int once; +	unsigned long *count = (long *)data; -	if (d_tracer) -		return d_tracer; +	if (!*count) +		return; + +	if (*count != -1) +		(*count)--; + +	tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, +		      struct ftrace_probe_ops *ops, void *data) +{ +	long count = (long)data; + +	seq_printf(m, "%ps:", (void *)ip); + +	seq_printf(m, "snapshot"); + +	if (count == -1) +		seq_printf(m, ":unlimited\n"); +	else +		seq_printf(m, ":count=%ld\n", count); + +	return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { +	.func			= ftrace_snapshot, +	.print			= ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { +	.func			= ftrace_count_snapshot, +	.print			= ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +			       char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; +	void *count = (void *)-1; +	char *number; +	int ret; + +	/* hash funcs only work with set_ftrace_filter */ +	if (!enable) +		return -EINVAL; + +	ops = param ? &snapshot_count_probe_ops :  &snapshot_probe_ops; + +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		return 0; +	} + +	if (!param) +		goto out_reg; + +	number = strsep(¶m, ":"); + +	if (!strlen(number)) +		goto out_reg; + +	/* +	 * We use the callback data field (which is a pointer) +	 * as our counter. +	 */ +	ret = kstrtoul(number, 0, (unsigned long *)&count); +	if (ret) +		return ret; + + out_reg: +	ret = register_ftrace_function_probe(glob, ops, count); + +	if (ret >= 0) +		alloc_snapshot(&global_trace); + +	return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { +	.name			= "snapshot", +	.func			= ftrace_trace_snapshot_callback, +}; + +static __init int register_snapshot_cmd(void) +{ +	return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline __init int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ + +struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +{ +	if (tr->dir) +		return tr->dir;  	if (!debugfs_initialized())  		return NULL; -	d_tracer = debugfs_create_dir("tracing", NULL); +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		tr->dir = debugfs_create_dir("tracing", NULL); -	if (!d_tracer && !once) { -		once = 1; -		pr_warning("Could not create debugfs directory 'tracing'\n"); -		return NULL; -	} +	if (!tr->dir) +		pr_warn_once("Could not create debugfs directory 'tracing'\n"); -	return d_tracer; +	return tr->dir;  } -static struct dentry *d_percpu; +struct dentry *tracing_init_dentry(void) +{ +	return tracing_init_dentry_tr(&global_trace); +} -struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)  { -	static int once;  	struct dentry *d_tracer; -	if (d_percpu) -		return d_percpu; - -	d_tracer = tracing_init_dentry(); +	if (tr->percpu_dir) +		return tr->percpu_dir; +	d_tracer = tracing_init_dentry_tr(tr);  	if (!d_tracer)  		return NULL; -	d_percpu = debugfs_create_dir("per_cpu", d_tracer); +	tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); -	if (!d_percpu && !once) { -		once = 1; -		pr_warning("Could not create debugfs directory 'per_cpu'\n"); -		return NULL; -	} +	WARN_ONCE(!tr->percpu_dir, +		  "Could not create debugfs directory 'per_cpu/%d'\n", cpu); -	return d_percpu; +	return tr->percpu_dir;  } -static void tracing_init_debugfs_percpu(long cpu) +static struct dentry * +trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, +		      void *data, long cpu, const struct file_operations *fops)  { -	struct dentry *d_percpu = tracing_dentry_percpu(); +	struct dentry *ret = trace_create_file(name, mode, parent, data, fops); + +	if (ret) /* See tracing_get_cpu() */ +		ret->d_inode->i_cdev = (void *)(cpu + 1); +	return ret; +} + +static void +tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) +{ +	struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);  	struct dentry *d_cpu;  	char cpu_dir[30]; /* 30 characters should be more than enough */ +	if (!d_percpu) +		return; +  	snprintf(cpu_dir, 30, "cpu%ld", cpu);  	d_cpu = debugfs_create_dir(cpu_dir, d_percpu);  	if (!d_cpu) { @@ -4006,18 +5871,29 @@ static void tracing_init_debugfs_percpu(long cpu)  	}  	/* per cpu trace_pipe */ -	trace_create_file("trace_pipe", 0444, d_cpu, -			(void *) cpu, &tracing_pipe_fops); +	trace_create_cpu_file("trace_pipe", 0444, d_cpu, +				tr, cpu, &tracing_pipe_fops);  	/* per cpu trace */ -	trace_create_file("trace", 0644, d_cpu, -			(void *) cpu, &tracing_fops); +	trace_create_cpu_file("trace", 0644, d_cpu, +				tr, cpu, &tracing_fops); + +	trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, +				tr, cpu, &tracing_buffers_fops); + +	trace_create_cpu_file("stats", 0444, d_cpu, +				tr, cpu, &tracing_stats_fops); -	trace_create_file("trace_pipe_raw", 0444, d_cpu, -			(void *) cpu, &tracing_buffers_fops); +	trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, +				tr, cpu, &tracing_entries_fops); -	trace_create_file("stats", 0444, d_cpu, -			(void *) cpu, &tracing_stats_fops); +#ifdef 
CONFIG_TRACER_SNAPSHOT +	trace_create_cpu_file("snapshot", 0644, d_cpu, +				tr, cpu, &snapshot_fops); + +	trace_create_cpu_file("snapshot_raw", 0444, d_cpu, +				tr, cpu, &snapshot_raw_fops); +#endif  }  #ifdef CONFIG_FTRACE_SELFTEST @@ -4028,6 +5904,7 @@ static void tracing_init_debugfs_percpu(long cpu)  struct trace_option_dentry {  	struct tracer_opt		*opt;  	struct tracer_flags		*flags; +	struct trace_array		*tr;  	struct dentry			*entry;  }; @@ -4052,19 +5929,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,  {  	struct trace_option_dentry *topt = filp->private_data;  	unsigned long val; -	char buf[64];  	int ret; -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; - -	ret = strict_strtoul(buf, 10, &val); -	if (ret < 0) +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret)  		return ret;  	if (val != 0 && val != 1) @@ -4072,7 +5940,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (!!(topt->flags->val & topt->opt->bit) != val) {  		mutex_lock(&trace_types_lock); -		ret = __set_tracer_option(current_trace, topt->flags, +		ret = __set_tracer_option(topt->tr, topt->flags,  					  topt->opt, !val);  		mutex_unlock(&trace_types_lock);  		if (ret) @@ -4111,27 +5979,25 @@ static ssize_t  trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,  			 loff_t *ppos)  { +	struct trace_array *tr = &global_trace;  	long index = (long)filp->private_data; -	char buf[64];  	unsigned long val;  	int ret; -	if (cnt >= sizeof(buf)) -		return -EINVAL; +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret) +		return ret; -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; +	if (val != 0 && val != 1) +		return -EINVAL; -	buf[cnt] = 0; +	mutex_lock(&trace_types_lock); +	ret = set_tracer_flag(tr, 1 << index, val); +	mutex_unlock(&trace_types_lock); -	ret = strict_strtoul(buf, 10, &val);  	if (ret < 0)  		return ret; -	if (val != 0 && val != 1) -		return -EINVAL; -	set_tracer_flags(1 << index, val); -  	*ppos += cnt;  	return cnt; @@ -4145,7 +6011,7 @@ static const struct file_operations trace_options_core_fops = {  };  struct dentry *trace_create_file(const char *name, -				 mode_t mode, +				 umode_t mode,  				 struct dentry *parent,  				 void *data,  				 const struct file_operations *fops) @@ -4160,40 +6026,41 @@ struct dentry *trace_create_file(const char *name,  } -static struct dentry *trace_options_init_dentry(void) +static struct dentry *trace_options_init_dentry(struct trace_array *tr)  {  	struct dentry *d_tracer; -	static struct dentry *t_options; -	if (t_options) -		return t_options; +	if (tr->options) +		return tr->options; -	d_tracer = tracing_init_dentry(); +	d_tracer = tracing_init_dentry_tr(tr);  	if (!d_tracer)  		return NULL; -	t_options = debugfs_create_dir("options", d_tracer); -	if (!t_options) { +	tr->options = debugfs_create_dir("options", d_tracer); +	if (!tr->options) {  		pr_warning("Could not create debugfs directory 'options'\n");  		return NULL;  	} -	return t_options; +	return tr->options;  }  static void -create_trace_option_file(struct trace_option_dentry *topt, +create_trace_option_file(struct trace_array *tr, +			 struct trace_option_dentry *topt,  			 struct tracer_flags *flags,  			 struct tracer_opt *opt)  {  	struct dentry *t_options; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return;  	topt->flags = flags;  	topt->opt = opt; 
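(Each of these option dentries shows up as a boolean file under tracing/options/. A user-space sketch of flipping one, using "overwrite" only as an example option name and the usual debugfs path:)

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Example option; any file under tracing/options/ behaves the same way. */
#define OPT_FILE "/sys/kernel/debug/tracing/options/overwrite"

static int set_option(const char *path, int on)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	/* The file accepts "0" or "1", matching the write handlers above. */
	if (write(fd, on ? "1" : "0", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	if (set_option(OPT_FILE, 0))
		perror("set_option");
	return 0;
}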
+	topt->tr = tr;  	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,  				    &trace_options_fops); @@ -4201,7 +6068,7 @@ create_trace_option_file(struct trace_option_dentry *topt,  }  static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer)  {  	struct trace_option_dentry *topts;  	struct tracer_flags *flags; @@ -4226,7 +6093,7 @@ create_trace_option_files(struct tracer *tracer)  		return NULL;  	for (cnt = 0; opts[cnt].name; cnt++) -		create_trace_option_file(&topts[cnt], flags, +		create_trace_option_file(tr, &topts[cnt], flags,  					 &opts[cnt]);  	return topts; @@ -4249,11 +6116,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)  }  static struct dentry * -create_trace_option_core_file(const char *option, long index) +create_trace_option_core_file(struct trace_array *tr, +			      const char *option, long index)  {  	struct dentry *t_options; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return NULL; @@ -4261,81 +6129,429 @@ create_trace_option_core_file(const char *option, long index)  				    &trace_options_core_fops);  } -static __init void create_trace_options_dir(void) +static __init void create_trace_options_dir(struct trace_array *tr)  {  	struct dentry *t_options;  	int i; -	t_options = trace_options_init_dentry(); +	t_options = trace_options_init_dentry(tr);  	if (!t_options)  		return;  	for (i = 0; trace_options[i]; i++) -		create_trace_option_core_file(trace_options[i], i); +		create_trace_option_core_file(tr, trace_options[i], i);  } -static __init int tracer_init_debugfs(void) +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, +	       size_t cnt, loff_t *ppos)  { -	struct dentry *d_tracer; -	int cpu; +	struct trace_array *tr = filp->private_data; +	char buf[64]; +	int r; -	trace_access_lock_init(); +	r = tracer_tracing_is_on(tr); +	r = sprintf(buf, "%d\n", r); -	d_tracer = tracing_init_dentry(); +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} -	trace_create_file("tracing_enabled", 0644, d_tracer, -			&global_trace, &tracing_ctrl_fops); +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, +		size_t cnt, loff_t *ppos) +{ +	struct trace_array *tr = filp->private_data; +	struct ring_buffer *buffer = tr->trace_buffer.buffer; +	unsigned long val; +	int ret; -	trace_create_file("trace_options", 0644, d_tracer, -			NULL, &tracing_iter_fops); +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret) +		return ret; -	trace_create_file("tracing_cpumask", 0644, d_tracer, -			NULL, &tracing_cpumask_fops); +	if (buffer) { +		mutex_lock(&trace_types_lock); +		if (val) { +			tracer_tracing_on(tr); +			if (tr->current_trace->start) +				tr->current_trace->start(tr); +		} else { +			tracer_tracing_off(tr); +			if (tr->current_trace->stop) +				tr->current_trace->stop(tr); +		} +		mutex_unlock(&trace_types_lock); +	} -	trace_create_file("trace", 0644, d_tracer, -			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops); +	(*ppos)++; + +	return cnt; +} + +static const struct file_operations rb_simple_fops = { +	.open		= tracing_open_generic_tr, +	.read		= rb_simple_read, +	.write		= rb_simple_write, +	.release	= tracing_release_generic_tr, +	.llseek		= default_llseek, +}; + +struct dentry *trace_instance_dir; + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); + +static int +allocate_trace_buffer(struct 
trace_array *tr, struct trace_buffer *buf, int size) +{ +	enum ring_buffer_flags rb_flags; + +	rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + +	buf->tr = tr; + +	buf->buffer = ring_buffer_alloc(size, rb_flags); +	if (!buf->buffer) +		return -ENOMEM; + +	buf->data = alloc_percpu(struct trace_array_cpu); +	if (!buf->data) { +		ring_buffer_free(buf->buffer); +		return -ENOMEM; +	} + +	/* Allocate the first page for all buffers */ +	set_buffer_entries(&tr->trace_buffer, +			   ring_buffer_size(tr->trace_buffer.buffer, 0)); + +	return 0; +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ +	int ret; + +	ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); +	if (ret) +		return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE +	ret = allocate_trace_buffer(tr, &tr->max_buffer, +				    allocate_snapshot ? size : 1); +	if (WARN_ON(ret)) { +		ring_buffer_free(tr->trace_buffer.buffer); +		free_percpu(tr->trace_buffer.data); +		return -ENOMEM; +	} +	tr->allocated_snapshot = allocate_snapshot; + +	/* +	 * Only the top level trace array gets its snapshot allocated +	 * from the kernel command line. +	 */ +	allocate_snapshot = false; +#endif +	return 0; +} + +static void free_trace_buffer(struct trace_buffer *buf) +{ +	if (buf->buffer) { +		ring_buffer_free(buf->buffer); +		buf->buffer = NULL; +		free_percpu(buf->data); +		buf->data = NULL; +	} +} + +static void free_trace_buffers(struct trace_array *tr) +{ +	if (!tr) +		return; + +	free_trace_buffer(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE +	free_trace_buffer(&tr->max_buffer); +#endif +} + +static int new_instance_create(const char *name) +{ +	struct trace_array *tr; +	int ret; + +	mutex_lock(&trace_types_lock); + +	ret = -EEXIST; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) +			goto out_unlock; +	} + +	ret = -ENOMEM; +	tr = kzalloc(sizeof(*tr), GFP_KERNEL); +	if (!tr) +		goto out_unlock; + +	tr->name = kstrdup(name, GFP_KERNEL); +	if (!tr->name) +		goto out_free_tr; + +	if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) +		goto out_free_tr; + +	cpumask_copy(tr->tracing_cpumask, cpu_all_mask); + +	raw_spin_lock_init(&tr->start_lock); + +	tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +	tr->current_trace = &nop_trace; + +	INIT_LIST_HEAD(&tr->systems); +	INIT_LIST_HEAD(&tr->events); + +	if (allocate_trace_buffers(tr, trace_buf_size) < 0) +		goto out_free_tr; + +	tr->dir = debugfs_create_dir(name, trace_instance_dir); +	if (!tr->dir) +		goto out_free_tr; + +	ret = event_trace_add_tracer(tr->dir, tr); +	if (ret) { +		debugfs_remove_recursive(tr->dir); +		goto out_free_tr; +	} + +	init_tracer_debugfs(tr, tr->dir); + +	list_add(&tr->list, &ftrace_trace_arrays); + +	mutex_unlock(&trace_types_lock); + +	return 0; + + out_free_tr: +	free_trace_buffers(tr); +	free_cpumask_var(tr->tracing_cpumask); +	kfree(tr->name); +	kfree(tr); + + out_unlock: +	mutex_unlock(&trace_types_lock); + +	return ret; + +} + +static int instance_delete(const char *name) +{ +	struct trace_array *tr; +	int found = 0; +	int ret; + +	mutex_lock(&trace_types_lock); + +	ret = -ENODEV; +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr->name && strcmp(tr->name, name) == 0) { +			found = 1; +			break; +		} +	} +	if (!found) +		goto out_unlock; + +	ret = -EBUSY; +	if (tr->ref) +		goto out_unlock; + +	list_del(&tr->list); + +	tracing_set_nop(tr); +	event_trace_del_tracer(tr); +	ftrace_destroy_function_files(tr); +	
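(These create/delete paths are reached purely through mkdir/rmdir on the instances directory. A sketch of the user-space side, with "foo" as an arbitrary instance name and the usual debugfs path assumed:)

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/* "foo" is an arbitrary example name under the instances directory. */
#define INSTANCE_DIR "/sys/kernel/debug/tracing/instances/foo"

int main(void)
{
	/* mkdir lands in instance_mkdir() -> new_instance_create(). */
	if (mkdir(INSTANCE_DIR, 0755) && errno != EEXIST) {
		perror("mkdir instance");
		return 1;
	}

	/*
	 * The new directory carries its own trace, trace_pipe, events/ and
	 * so on, backed by the freshly allocated trace_array.
	 */

	/* rmdir lands in instance_rmdir() -> instance_delete(). */
	if (rmdir(INSTANCE_DIR))
		perror("rmdir instance");
	return 0;
}

(rmdir fails with EBUSY while the instance still holds references, matching the tr->ref check above.)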
debugfs_remove_recursive(tr->dir); +	free_trace_buffers(tr); + +	kfree(tr->name); +	kfree(tr); + +	ret = 0; + + out_unlock: +	mutex_unlock(&trace_types_lock); + +	return ret; +} + +static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +{ +	struct dentry *parent; +	int ret; + +	/* Paranoid: Make sure the parent is the "instances" directory */ +	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); +	if (WARN_ON_ONCE(parent != trace_instance_dir)) +		return -ENOENT; + +	/* +	 * The inode mutex is locked, but debugfs_create_dir() will also +	 * take the mutex. As the instances directory can not be destroyed +	 * or changed in any other way, it is safe to unlock it, and +	 * let the dentry try. If two users try to make the same dir at +	 * the same time, then the new_instance_create() will determine the +	 * winner. +	 */ +	mutex_unlock(&inode->i_mutex); + +	ret = new_instance_create(dentry->d_iname); + +	mutex_lock(&inode->i_mutex); + +	return ret; +} + +static int instance_rmdir(struct inode *inode, struct dentry *dentry) +{ +	struct dentry *parent; +	int ret; + +	/* Paranoid: Make sure the parent is the "instances" directory */ +	parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); +	if (WARN_ON_ONCE(parent != trace_instance_dir)) +		return -ENOENT; + +	/* The caller did a dget() on dentry */ +	mutex_unlock(&dentry->d_inode->i_mutex); + +	/* +	 * The inode mutex is locked, but debugfs_create_dir() will also +	 * take the mutex. As the instances directory can not be destroyed +	 * or changed in any other way, it is safe to unlock it, and +	 * let the dentry try. If two users try to make the same dir at +	 * the same time, then the instance_delete() will determine the +	 * winner. +	 */ +	mutex_unlock(&inode->i_mutex); + +	ret = instance_delete(dentry->d_iname); + +	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); +	mutex_lock(&dentry->d_inode->i_mutex); + +	return ret; +} + +static const struct inode_operations instance_dir_inode_operations = { +	.lookup		= simple_lookup, +	.mkdir		= instance_mkdir, +	.rmdir		= instance_rmdir, +}; + +static __init void create_trace_instances(struct dentry *d_tracer) +{ +	trace_instance_dir = debugfs_create_dir("instances", d_tracer); +	if (WARN_ON(!trace_instance_dir)) +		return; + +	/* Hijack the dir inode operations, to allow mkdir */ +	trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; +} + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +{ +	int cpu;  	trace_create_file("available_tracers", 0444, d_tracer, -			&global_trace, &show_traces_fops); +			tr, &show_traces_fops);  	trace_create_file("current_tracer", 0644, d_tracer, -			&global_trace, &set_tracer_fops); +			tr, &set_tracer_fops); + +	trace_create_file("tracing_cpumask", 0644, d_tracer, +			  tr, &tracing_cpumask_fops); + +	trace_create_file("trace_options", 0644, d_tracer, +			  tr, &tracing_iter_fops); + +	trace_create_file("trace", 0644, d_tracer, +			  tr, &tracing_fops); + +	trace_create_file("trace_pipe", 0444, d_tracer, +			  tr, &tracing_pipe_fops); + +	trace_create_file("buffer_size_kb", 0644, d_tracer, +			  tr, &tracing_entries_fops); + +	trace_create_file("buffer_total_size_kb", 0444, d_tracer, +			  tr, &tracing_total_entries_fops); + +	trace_create_file("free_buffer", 0200, d_tracer, +			  tr, &tracing_free_buffer_fops); + +	trace_create_file("trace_marker", 0220, d_tracer, +			  tr, &tracing_mark_fops); + +	trace_create_file("trace_clock", 0644, d_tracer, tr, +			  
&trace_clock_fops); + +	trace_create_file("tracing_on", 0644, d_tracer, +			  tr, &rb_simple_fops);  #ifdef CONFIG_TRACER_MAX_TRACE  	trace_create_file("tracing_max_latency", 0644, d_tracer, -			&tracing_max_latency, &tracing_max_lat_fops); +			&tr->max_latency, &tracing_max_lat_fops); +#endif + +	if (ftrace_create_function_files(tr, d_tracer)) +		WARN(1, "Could not allocate function filter files"); + +#ifdef CONFIG_TRACER_SNAPSHOT +	trace_create_file("snapshot", 0644, d_tracer, +			  tr, &snapshot_fops);  #endif +	for_each_tracing_cpu(cpu) +		tracing_init_debugfs_percpu(tr, cpu); + +} + +static __init int tracer_init_debugfs(void) +{ +	struct dentry *d_tracer; + +	trace_access_lock_init(); + +	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0; + +	init_tracer_debugfs(&global_trace, d_tracer); +  	trace_create_file("tracing_thresh", 0644, d_tracer,  			&tracing_thresh, &tracing_max_lat_fops);  	trace_create_file("README", 0444, d_tracer,  			NULL, &tracing_readme_fops); -	trace_create_file("trace_pipe", 0444, d_tracer, -			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - -	trace_create_file("buffer_size_kb", 0644, d_tracer, -			&global_trace, &tracing_entries_fops); - -	trace_create_file("trace_marker", 0220, d_tracer, -			NULL, &tracing_mark_fops); -  	trace_create_file("saved_cmdlines", 0444, d_tracer,  			NULL, &tracing_saved_cmdlines_fops); -	trace_create_file("trace_clock", 0644, d_tracer, NULL, -			  &trace_clock_fops); +	trace_create_file("saved_cmdlines_size", 0644, d_tracer, +			  NULL, &tracing_saved_cmdlines_size_fops);  #ifdef CONFIG_DYNAMIC_FTRACE  	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,  			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);  #endif -	create_trace_options_dir(); +	create_trace_instances(d_tracer); -	for_each_tracing_cpu(cpu) -		tracing_init_debugfs_percpu(cpu); +	create_trace_options_dir(&global_trace);  	return 0;  } @@ -4391,8 +6607,8 @@ void  trace_printk_seq(struct trace_seq *s)  {  	/* Probably should print a warning here. */ -	if (s->len >= 1000) -		s->len = 1000; +	if (s->len >= TRACE_MAX_PRINT) +		s->len = TRACE_MAX_PRINT;  	/* should be zero ended, but we are paranoid. */  	s->buffer[s->len] = 0; @@ -4405,39 +6621,54 @@ trace_printk_seq(struct trace_seq *s)  void trace_init_global_iter(struct trace_iterator *iter)  {  	iter->tr = &global_trace; -	iter->trace = current_trace; -	iter->cpu_file = TRACE_PIPE_ALL_CPU; +	iter->trace = iter->tr->current_trace; +	iter->cpu_file = RING_BUFFER_ALL_CPUS; +	iter->trace_buffer = &global_trace.trace_buffer; + +	if (iter->trace && iter->trace->open) +		iter->trace->open(iter); + +	/* Annotate start of buffers if we had overruns */ +	if (ring_buffer_overruns(iter->trace_buffer->buffer)) +		iter->iter_flags |= TRACE_FILE_ANNOTATE; + +	/* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ +	if (trace_clocks[iter->tr->clock_id].in_ns) +		iter->iter_flags |= TRACE_FILE_TIME_IN_NS;  } -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)  { -	static arch_spinlock_t ftrace_dump_lock = -		(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  	/* use static because iter can be a bit big for the stack */  	static struct trace_iterator iter; +	static atomic_t dump_running;  	unsigned int old_userobj; -	static int dump_ran;  	unsigned long flags;  	int cnt = 0, cpu; -	/* only one dump */ -	local_irq_save(flags); -	arch_spin_lock(&ftrace_dump_lock); -	if (dump_ran) -		goto out; - -	dump_ran = 1; +	/* Only allow one dump user at a time. */ +	if (atomic_inc_return(&dump_running) != 1) { +		atomic_dec(&dump_running); +		return; +	} +	/* +	 * Always turn off tracing when we dump. +	 * We don't need to show trace output of what happens +	 * between multiple crashes. +	 * +	 * If the user does a sysrq-z, then they can re-enable +	 * tracing with echo 1 > tracing_on. +	 */  	tracing_off(); -	if (disable_tracing) -		ftrace_kill(); +	local_irq_save(flags); +	/* Simulate the iterator */  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { -		atomic_inc(&iter.tr->data[cpu]->disabled); +		atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);  	}  	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -4445,13 +6676,9 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  	/* don't look at user memory in panic mode */  	trace_flags &= ~TRACE_ITER_SYM_USEROBJ; -	/* Simulate the iterator */ -	iter.tr = &global_trace; -	iter.trace = current_trace; -  	switch (oops_dump_mode) {  	case DUMP_ALL: -		iter.cpu_file = TRACE_PIPE_ALL_CPU; +		iter.cpu_file = RING_BUFFER_ALL_CPUS;  		break;  	case DUMP_ORIG:  		iter.cpu_file = raw_smp_processor_id(); @@ -4460,11 +6687,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  		goto out_enable;  	default:  		printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); -		iter.cpu_file = TRACE_PIPE_ALL_CPU; +		iter.cpu_file = RING_BUFFER_ALL_CPUS;  	}  	printk(KERN_TRACE "Dumping ftrace buffer:\n"); +	/* Did function tracer already get disabled? */ +	if (ftrace_is_dead()) { +		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); +		printk("#          MAY BE MISSING FUNCTION EVENTS\n"); +	} +  	/*  	 * We need to stop all tracing on all CPUS to read the  	 * the next buffer. 
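(The sysrq-z case mentioned above can be exercised by hand; a hedged sketch, assuming /proc/sysrq-trigger is available and the usual debugfs path for tracing_on:)

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int write_str(const char *path, const char *s, size_t len)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, s, len) != (ssize_t)len) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* 'z' asks sysrq to dump the ftrace ring buffer to the console. */
	if (write_str("/proc/sysrq-trigger", "z", 1))
		perror("sysrq-trigger");

	/*
	 * ftrace_dump() leaves tracing off; echoing 1 into tracing_on
	 * re-enables it, as the comment in the dump path notes.
	 */
	if (write_str("/sys/kernel/debug/tracing/tracing_on", "1", 1))
		perror("tracing_on");
	return 0;
}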
This is a bit expensive, but is @@ -4493,6 +6726,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  			if (ret != TRACE_TYPE_NO_CONSUME)  				trace_consume(&iter);  		} +		touch_nmi_watchdog();  		trace_printk_seq(&iter.seq);  	} @@ -4503,39 +6737,33 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)  		printk(KERN_TRACE "---------------------------------\n");   out_enable: -	/* Re-enable tracing if requested */ -	if (!disable_tracing) { -		trace_flags |= old_userobj; +	trace_flags |= old_userobj; -		for_each_tracing_cpu(cpu) { -			atomic_dec(&iter.tr->data[cpu]->disabled); -		} -		tracing_on(); +	for_each_tracing_cpu(cpu) { +		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	} - - out: -	arch_spin_unlock(&ftrace_dump_lock); + 	atomic_dec(&dump_running);  	local_irq_restore(flags);  } - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ -	__ftrace_dump(true, oops_dump_mode); -} +EXPORT_SYMBOL_GPL(ftrace_dump);  __init static int tracer_alloc_buffers(void)  {  	int ring_buf_size; -	int i;  	int ret = -ENOMEM; +  	if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))  		goto out; -	if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) +	if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))  		goto out_free_buffer_mask; +	/* Only allocate trace_printk buffers if a trace_printk exists */ +	if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) +		/* Must be called before global_trace.buffer is allocated */ +		trace_printk_init_buffers(); +  	/* To save memory, keep the ring buffer size to its minimum */  	if (ring_buffer_expanded)  		ring_buf_size = trace_buf_size; @@ -4543,40 +6771,48 @@ __init static int tracer_alloc_buffers(void)  		ring_buf_size = 1;  	cpumask_copy(tracing_buffer_mask, cpu_possible_mask); -	cpumask_copy(tracing_cpumask, cpu_all_mask); +	cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); + +	raw_spin_lock_init(&global_trace.start_lock); + +	/* Used for event triggers */ +	temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); +	if (!temp_buffer) +		goto out_free_cpumask; + +	if (trace_create_savedcmd() < 0) +		goto out_free_temp_buffer;  	/* TODO: make the number of buffers hot pluggable with CPUS */ -	global_trace.buffer = ring_buffer_alloc(ring_buf_size, -						   TRACE_BUFFER_FLAGS); -	if (!global_trace.buffer) { +	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {  		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");  		WARN_ON(1); -		goto out_free_cpumask; +		goto out_free_savedcmd;  	} -	global_trace.entries = ring_buffer_size(global_trace.buffer); +	if (global_trace.buffer_disabled) +		tracing_off(); -#ifdef CONFIG_TRACER_MAX_TRACE -	max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); -	if (!max_tr.buffer) { -		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); -		WARN_ON(1); -		ring_buffer_free(global_trace.buffer); -		goto out_free_cpumask; +	if (trace_boot_clock) { +		ret = tracing_set_clock(&global_trace, trace_boot_clock); +		if (ret < 0) +			pr_warning("Trace clock %s not defined, going back to default\n", +				   trace_boot_clock);  	} -	max_tr.entries = 1; -#endif -	/* Allocate the first page for all buffers */ -	for_each_tracing_cpu(i) { -		global_trace.data[i] = &per_cpu(global_trace_cpu, i); -		max_tr.data[i] = &per_cpu(max_tr_data, i); -	} +	/* +	 * register_tracer() might reference current_trace, so it +	 * needs to be set before we register 
anything. This is +	 * just a bootstrap of current_trace anyway. +	 */ +	global_trace.current_trace = &nop_trace; + +	global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -	trace_init_cmdlines(); +	ftrace_init_global_array_ops(&global_trace);  	register_tracer(&nop_trace); -	current_trace = &nop_trace; +  	/* All seems OK, enable tracing */  	tracing_disabled = 0; @@ -4585,10 +6821,29 @@ __init static int tracer_alloc_buffers(void)  	register_die_notifier(&trace_die_notifier); +	global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + +	INIT_LIST_HEAD(&global_trace.systems); +	INIT_LIST_HEAD(&global_trace.events); +	list_add(&global_trace.list, &ftrace_trace_arrays); + +	while (trace_boot_options) { +		char *option; + +		option = strsep(&trace_boot_options, ","); +		trace_set_options(&global_trace, option); +	} + +	register_snapshot_cmd(); +  	return 0; +out_free_savedcmd: +	free_saved_cmdlines_buffer(savedcmd); +out_free_temp_buffer: +	ring_buffer_free(temp_buffer);  out_free_cpumask: -	free_cpumask_var(tracing_cpumask); +	free_cpumask_var(global_trace.tracing_cpumask);  out_free_buffer_mask:  	free_cpumask_var(tracing_buffer_mask);  out: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c..9258f5a815d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,8 +1,9 @@ +  #ifndef _LINUX_KERNEL_TRACE_H  #define _LINUX_KERNEL_TRACE_H  #include <linux/fs.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include <linux/sched.h>  #include <linux/clocksource.h>  #include <linux/ring_buffer.h> @@ -12,6 +13,12 @@  #include <linux/hw_breakpoint.h>  #include <linux/trace_seq.h>  #include <linux/ftrace_event.h> +#include <linux/compiler.h> + +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h>		/* For NR_SYSCALLS	     */ +#include <asm/syscall.h>	/* some archs define it here */ +#endif  enum trace_type {  	__TRACE_FIRST_TYPE = 0, @@ -29,6 +36,7 @@ enum trace_type {  	TRACE_GRAPH_ENT,  	TRACE_USER_STACK,  	TRACE_BLK, +	TRACE_BPUTS,  	__TRACE_LAST_TYPE,  }; @@ -56,17 +64,23 @@ enum trace_type {  #define F_STRUCT(args...)		args  #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\ -	struct struct_name {					\ -		struct trace_entry	ent;			\ -		tstruct						\ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\ +	struct struct_name {						\ +		struct trace_entry	ent;				\ +		tstruct							\  	}  #undef TP_ARGS  #define TP_ARGS(args...)	
args  #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print,	\ +			 filter, regfn) \ +	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter)  #include "trace_entries.h" @@ -112,10 +126,13 @@ enum trace_flag_type {  	TRACE_FLAG_NEED_RESCHED		= 0x04,  	TRACE_FLAG_HARDIRQ		= 0x08,  	TRACE_FLAG_SOFTIRQ		= 0x10, +	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,  };  #define TRACE_BUF_SIZE		1024 +struct trace_array; +  /*   * The CPU trace array - it consists of thousands of trace entries   * plus some other descriptor data: (for example which task started @@ -125,6 +142,7 @@ struct trace_array_cpu {  	atomic_t		disabled;  	void			*buffer_page;	/* ring buffer spare */ +	unsigned long		entries;  	unsigned long		saved_latency;  	unsigned long		critical_start;  	unsigned long		critical_end; @@ -135,24 +153,114 @@ struct trace_array_cpu {  	unsigned long		skipped_entries;  	cycle_t			preempt_timestamp;  	pid_t			pid; -	uid_t			uid; +	kuid_t			uid;  	char			comm[TASK_COMM_LEN];  }; +struct tracer; + +struct trace_buffer { +	struct trace_array		*tr; +	struct ring_buffer		*buffer; +	struct trace_array_cpu __percpu	*data; +	cycle_t				time_start; +	int				cpu; +}; +  /*   * The trace array - an array of per-CPU trace arrays. This is the   * highest level data structure that individual tracers deal with.   * They have on/off state as well:   */  struct trace_array { -	struct ring_buffer	*buffer; -	unsigned long		entries; -	int			cpu; -	cycle_t			time_start; -	struct task_struct	*waiter; -	struct trace_array_cpu	*data[NR_CPUS]; +	struct list_head	list; +	char			*name; +	struct trace_buffer	trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE +	/* +	 * The max_buffer is used to snapshot the trace when a maximum +	 * latency is reached, or when the user initiates a snapshot. +	 * Some tracers will use this to store a maximum trace while +	 * it continues examining live traces. +	 * +	 * The buffers for the max_buffer are set up the same as the trace_buffer +	 * When a snapshot is taken, the buffer of the max_buffer is swapped +	 * with the buffer of the trace_buffer and the buffers are reset for +	 * the trace_buffer so the tracing can continue. +	 */ +	struct trace_buffer	max_buffer; +	bool			allocated_snapshot; +	unsigned long		max_latency; +#endif +	/* +	 * max_lock is used to protect the swapping of buffers +	 * when taking a max snapshot. The buffers themselves are +	 * protected by per_cpu spinlocks. But the action of the swap +	 * needs its own lock. +	 * +	 * This is defined as a arch_spinlock_t in order to help +	 * with performance when lockdep debugging is enabled. +	 * +	 * It is also used in other places outside the update_max_tr +	 * so it needs to be defined outside of the +	 * CONFIG_TRACER_MAX_TRACE. 
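/*
 * A toy user-space model of the snapshot swap described here: the live buffer
 * pointer and the spare (max) buffer pointer are exchanged under a lock, and
 * the buffer that becomes live again starts out empty. struct toy_buffer,
 * take_snapshot() and swap_lock are illustrative only; the real code swaps
 * ring buffers under the arch_spinlock_t max_lock.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct toy_buffer { char data[64]; };

static struct toy_buffer live = { "live trace data" };
static struct toy_buffer spare;
static struct toy_buffer *trace_buf = &live;
static struct toy_buffer *max_buf = &spare;
static pthread_mutex_t swap_lock = PTHREAD_MUTEX_INITIALIZER;

static void take_snapshot(void)
{
	struct toy_buffer *tmp;

	pthread_mutex_lock(&swap_lock);
	tmp = trace_buf;
	trace_buf = max_buf;
	max_buf = tmp;
	/* The buffer that becomes "live" again is reset, as in the kernel. */
	memset(trace_buf->data, 0, sizeof(trace_buf->data));
	pthread_mutex_unlock(&swap_lock);
}

int main(void)
{
	take_snapshot();
	printf("snapshot holds: %s\n", max_buf->data);
	return 0;
}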
+	 */ +	arch_spinlock_t		max_lock; +	int			buffer_disabled; +#ifdef CONFIG_FTRACE_SYSCALLS +	int			sys_refcount_enter; +	int			sys_refcount_exit; +	struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; +	struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; +#endif +	int			stop_count; +	int			clock_id; +	struct tracer		*current_trace; +	unsigned int		flags; +	raw_spinlock_t		start_lock; +	struct dentry		*dir; +	struct dentry		*options; +	struct dentry		*percpu_dir; +	struct dentry		*event_dir; +	struct list_head	systems; +	struct list_head	events; +	cpumask_var_t		tracing_cpumask; /* only trace on set CPUs */ +	int			ref; +#ifdef CONFIG_FUNCTION_TRACER +	struct ftrace_ops	*ops; +	/* function tracing enabled */ +	int			function_enabled; +#endif  }; +enum { +	TRACE_ARRAY_FL_GLOBAL	= (1 << 0) +}; + +extern struct list_head ftrace_trace_arrays; + +extern struct mutex trace_types_lock; + +extern int trace_array_get(struct trace_array *tr); +extern void trace_array_put(struct trace_array *tr); + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. + */ +static inline struct trace_array *top_trace_array(void) +{ +	struct trace_array *tr; + +	if (list_empty(&ftrace_trace_arrays)) +		return NULL; + +	tr = list_entry(ftrace_trace_arrays.prev, +			typeof(*tr), list); +	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); +	return tr; +} +  #define FTRACE_CMP_TYPE(var, type) \  	__builtin_types_compatible_p(typeof(var), type *) @@ -188,6 +296,7 @@ extern void __ftrace_bad_type(void);  		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\  		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\  		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\ +		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\  			  TRACE_MMIO_RW);				\  		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\ @@ -232,7 +341,6 @@ struct tracer_flags {   * @stop: called when tracing is paused (echo 0 > tracing_enabled)   * @open: called when the trace file is opened   * @pipe_open: called when the trace_pipe file is opened - * @wait_pipe: override how the user waits for traces on trace_pipe   * @close: called when the trace file is released   * @pipe_close: called when the trace_pipe file is released   * @read: override the default read callback on trace_pipe @@ -251,7 +359,6 @@ struct tracer {  	void			(*stop)(struct trace_array *tr);  	void			(*open)(struct trace_iterator *iter);  	void			(*pipe_open)(struct trace_iterator *iter); -	void			(*wait_pipe)(struct trace_iterator *iter);  	void			(*close)(struct trace_iterator *iter);  	void			(*pipe_close)(struct trace_iterator *iter);  	ssize_t			(*read)(struct trace_iterator *iter, @@ -270,30 +377,168 @@ struct tracer {  	void			(*print_header)(struct seq_file *m);  	enum print_line_t	(*print_line)(struct trace_iterator *iter);  	/* If you handled the flag setting, return 0 */ -	int			(*set_flag)(u32 old_flags, u32 bit, int set); +	int			(*set_flag)(struct trace_array *tr, +					    u32 old_flags, u32 bit, int set); +	/* Return 0 if OK with change, else return non-zero */ +	int			(*flag_changed)(struct trace_array *tr, +						u32 mask, int set);  	struct tracer		*next; -	int			print_max;  	struct tracer_flags	*flags; -	int			use_max_tr; +	int			enabled; +	bool			print_max; +	bool			allow_instances; +#ifdef CONFIG_TRACER_MAX_TRACE +	bool			use_max_tr; +#endif +}; + + +/* Only current can touch trace_recursion */ + +/* + * For function 
tracing recursion: + *  The order of these bits is important. + * + *  When function tracing occurs, the following steps are made: + *   If arch does not support a ftrace feature: + *    call internal function (uses INTERNAL bits) which calls... + *   If callback is registered to the "global" list, the list + *    function is called and recursion checks the GLOBAL bits. + *    then this function calls... + *   The function callback, which can use the FTRACE bits to + *    check for recursion. + * + * Now if the arch does not support a feature, and it calls + * the global list function which calls the ftrace callback, + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { +	TRACE_BUFFER_BIT, +	TRACE_BUFFER_NMI_BIT, +	TRACE_BUFFER_IRQ_BIT, +	TRACE_BUFFER_SIRQ_BIT, + +	/* Start of function recursion bits */ +	TRACE_FTRACE_BIT, +	TRACE_FTRACE_NMI_BIT, +	TRACE_FTRACE_IRQ_BIT, +	TRACE_FTRACE_SIRQ_BIT, + +	/* INTERNAL_BITs must be greater than FTRACE_BITs */ +	TRACE_INTERNAL_BIT, +	TRACE_INTERNAL_NMI_BIT, +	TRACE_INTERNAL_IRQ_BIT, +	TRACE_INTERNAL_SIRQ_BIT, + +	TRACE_CONTROL_BIT, + +/* + * Abuse of the trace_recursion. + * We need a way to maintain state when tracing the function + * graph in irq context, because we want to trace a particular function that + * was called in irq context even though irq tracing is off. Since this + * can only be modified by current, we can reuse trace_recursion.
+ */ +	TRACE_IRQ_BIT,  }; +#define trace_recursion_set(bit)	do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit)	do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit)	((current)->trace_recursion & (1<<(bit))) + +#define TRACE_CONTEXT_BITS	4 + +#define TRACE_FTRACE_START	TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX	((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START	TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX		((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK	TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ +	int bit; + +	if (in_interrupt()) { +		if (in_nmi()) +			bit = 0; + +		else if (in_irq()) +			bit = 1; +		else +			bit = 2; +	} else +		bit = 3; + +	return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ +	unsigned int val = current->trace_recursion; +	int bit; + +	/* A previous recursion check was made */ +	if ((val & TRACE_CONTEXT_MASK) > max) +		return 0; + +	bit = trace_get_context_bit() + start; +	if (unlikely(val & (1 << bit))) +		return -1; + +	val |= 1 << bit; +	current->trace_recursion = val; +	barrier(); + +	return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ +	unsigned int val = current->trace_recursion; + +	if (!bit) +		return; + +	bit = 1 << bit; +	val &= ~bit; -#define TRACE_PIPE_ALL_CPU	-1 +	barrier(); +	current->trace_recursion = val; +} + +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ +	if (iter->buffer_iter && iter->buffer_iter[cpu]) +		return iter->buffer_iter[cpu]; +	return NULL; +}  int tracer_init(struct tracer *t, struct trace_array *tr);  int tracing_is_enabled(void); -void trace_wake_up(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf);  void tracing_reset_current(int cpu); -void tracing_reset_current_online_cpus(void); +void tracing_reset_all_online_cpus(void);  int tracing_open_generic(struct inode *inode, struct file *filp); +bool tracing_is_disabled(void);  struct dentry *trace_create_file(const char *name, -				 mode_t mode, +				 umode_t mode,  				 struct dentry *parent,  				 void *data,  				 const struct file_operations *fops); +struct dentry *tracing_init_dentry_tr(struct trace_array *tr);  struct dentry *tracing_init_dentry(void);  struct ring_buffer_event; @@ -304,9 +549,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,  			  unsigned long len,  			  unsigned long flags,  			  int pc); -void trace_buffer_unlock_commit(struct ring_buffer *buffer, -				struct ring_buffer_event *event, -				unsigned long flags, int pc);  struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,  						struct trace_array_cpu *data); @@ -314,6 +556,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,  struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,  					  int *ent_cpu, u64 *ent_ts); +void __buffer_unlock_commit(struct ring_buffer *buffer, +			    struct ring_buffer_event *event); +  int trace_empty(struct trace_iterator *iter);  void *trace_find_next_entry_inc(struct trace_iterator *iter); @@ -322,14 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter);  void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void 
default_wait_pipe(struct trace_iterator *iter); -void poll_wait_pipe(struct trace_iterator *iter); - -void ftrace(struct trace_array *tr, -			    struct trace_array_cpu *data, -			    unsigned long ip, -			    unsigned long parent_ip, -			    unsigned long flags, int pc);  void tracing_sched_switch_trace(struct trace_array *tr,  				struct task_struct *prev,  				struct task_struct *next, @@ -347,6 +584,7 @@ void trace_graph_function(struct trace_array *tr,  		    unsigned long ip,  		    unsigned long parent_ip,  		    unsigned long flags, int pc); +void trace_latency_header(struct seq_file *m);  void trace_default_header(struct seq_file *m);  void print_trace_header(struct seq_file *m, struct trace_iterator *iter);  int trace_empty(struct trace_iterator *iter); @@ -361,12 +599,9 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);  void tracing_stop_sched_switch_record(void);  void tracing_start_sched_switch_record(void);  int register_tracer(struct tracer *type); -void unregister_tracer(struct tracer *type);  int is_tracing_stopped(void); -enum trace_file_type { -	TRACE_FILE_LAT_FMT	= 1, -	TRACE_FILE_ANNOTATE	= 2, -}; + +loff_t tracing_lseek(struct file *file, loff_t offset, int whence);  extern cpumask_var_t __read_mostly tracing_buffer_mask; @@ -378,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);  extern unsigned long tracing_thresh;  #ifdef CONFIG_TRACER_MAX_TRACE -extern unsigned long tracing_max_latency; -  void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);  void update_max_tr_single(struct trace_array *tr,  			  struct task_struct *tsk, int cpu); @@ -389,6 +622,9 @@ void update_max_tr_single(struct trace_array *tr,  void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,  			int skip, int pc); +void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, +			     int skip, int pc, struct pt_regs *regs); +  void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,  			    int pc); @@ -400,6 +636,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,  {  } +static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, +					   unsigned long flags, int skip, +					   int pc, struct pt_regs *regs) +{ +} +  static inline void ftrace_trace_userstack(struct ring_buffer *buffer,  					  unsigned long flags, int pc)  { @@ -417,11 +659,13 @@ extern void trace_find_cmdline(int pid, char comm[]);  #ifdef CONFIG_DYNAMIC_FTRACE  extern unsigned long ftrace_update_tot_cnt; +#endif  #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func  extern int DYN_FTRACE_TEST_NAME(void); -#endif +#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 +extern int DYN_FTRACE_TEST_NAME2(void); -extern int ring_buffer_expanded; +extern bool ring_buffer_expanded;  extern bool tracing_selftest_disabled;  DECLARE_PER_CPU(int, ftrace_cpu_disabled); @@ -444,6 +688,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,  					       struct trace_array *tr);  extern int trace_selftest_startup_branch(struct tracer *trace,  					 struct trace_array *tr); +/* + * Tracer data references selftest functions that only occur + * on boot up. These can be __init functions. Thus, when selftests + * are enabled, then the tracers need to reference __init functions. + */ +#define __tracer_data		__refdata +#else +/* Tracers are seldom changed. Optimize when selftests are disabled. 
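/*
 * A simplified, single-threaded user-space model of the recursion guard built
 * on trace_test_and_set_recursion()/trace_clear_recursion() above: a per-task
 * bitmask where each tracing layer claims one bit per context, so a callback
 * that re-enters itself bails out instead of recursing forever. The names
 * recursion, test_and_set_recursion() and traced_callback() are illustrative,
 * not kernel APIs.
 */
#include <stdio.h>

static unsigned int recursion;	/* stands in for current->trace_recursion */

enum { CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };

static int test_and_set_recursion(int start)
{
	int bit = start + CTX_NORMAL;	/* pretend we run in normal context */

	if (recursion & (1u << bit))
		return -1;		/* already inside this layer: bail */
	recursion |= 1u << bit;
	return bit;
}

static void clear_recursion(int bit)
{
	recursion &= ~(1u << bit);
}

static void traced_callback(void)
{
	int bit = test_and_set_recursion(0);

	if (bit < 0) {
		puts("recursion detected, skipping");
		return;
	}
	/* Anything called here that re-enters traced_callback() is caught. */
	traced_callback();
	clear_recursion(bit);
}

int main(void)
{
	traced_callback();
	return 0;
}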
*/ +#define __tracer_data		__read_mostly  #endif /* CONFIG_FTRACE_STARTUP_TEST */  extern void *head_page(struct trace_array_cpu *data); @@ -457,13 +710,13 @@ trace_array_vprintk(struct trace_array *tr,  		    unsigned long ip, const char *fmt, va_list args);  int trace_array_printk(struct trace_array *tr,  		       unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, +			   unsigned long ip, const char *fmt, ...);  void trace_printk_seq(struct trace_seq *s);  enum print_line_t print_trace_line(struct trace_iterator *iter);  extern unsigned long trace_flags; -extern int trace_clock_id; -  /* Standard output formatting function used for function return traces */  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -474,6 +727,10 @@ extern int trace_clock_id;  #define TRACE_GRAPH_PRINT_PROC          0x8  #define TRACE_GRAPH_PRINT_DURATION      0x10  #define TRACE_GRAPH_PRINT_ABS_TIME      0x20 +#define TRACE_GRAPH_PRINT_IRQS          0x40 +#define TRACE_GRAPH_PRINT_TAIL          0x80 +#define TRACE_GRAPH_PRINT_FILL_SHIFT	28 +#define TRACE_GRAPH_PRINT_FILL_MASK	(0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)  extern enum print_line_t  print_graph_function_flags(struct trace_iterator *iter, u32 flags); @@ -493,19 +750,45 @@ extern void __trace_graph_return(struct trace_array *tr,  #ifdef CONFIG_DYNAMIC_FTRACE  /* TODO: make this variable */  #define FTRACE_GRAPH_MAX_FUNCS		32 -extern int ftrace_graph_filter_enabled;  extern int ftrace_graph_count;  extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern int ftrace_graph_notrace_count; +extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];  static inline int ftrace_graph_addr(unsigned long addr)  {  	int i; -	if (!ftrace_graph_filter_enabled) +	if (!ftrace_graph_count)  		return 1;  	for (i = 0; i < ftrace_graph_count; i++) { -		if (addr == ftrace_graph_funcs[i]) +		if (addr == ftrace_graph_funcs[i]) { +			/* +			 * If no irqs are to be traced, but a set_graph_function +			 * is set, and called by an interrupt handler, we still +			 * want to trace it. 
+			 */ +			if (in_irq()) +				trace_recursion_set(TRACE_IRQ_BIT); +			else +				trace_recursion_clear(TRACE_IRQ_BIT); +			return 1; +		} +	} + +	return 0; +} + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ +	int i; + +	if (!ftrace_graph_notrace_count) +		return 0; + +	for (i = 0; i < ftrace_graph_notrace_count; i++) { +		if (addr == ftrace_graph_notrace_funcs[i])  			return 1;  	} @@ -516,6 +799,11 @@ static inline int ftrace_graph_addr(unsigned long addr)  {  	return 1;  } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ +	return 0; +}  #endif /* CONFIG_DYNAMIC_FTRACE */  #else /* CONFIG_FUNCTION_GRAPH_TRACER */  static inline enum print_line_t @@ -528,6 +816,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)  extern struct list_head ftrace_pids;  #ifdef CONFIG_FUNCTION_TRACER +extern bool ftrace_filter_param __initdata;  static inline int ftrace_trace_task(struct task_struct *task)  {  	if (list_empty(&ftrace_pids)) @@ -535,12 +824,48 @@ static inline int ftrace_trace_task(struct task_struct *task)  	return test_tsk_trace_trace(task);  } +extern int ftrace_is_dead(void); +int ftrace_create_function_files(struct trace_array *tr, +				 struct dentry *parent); +void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); +int using_ftrace_ops_list_func(void);  #else  static inline int ftrace_trace_task(struct task_struct *task)  {  	return 1;  } -#endif +static inline int ftrace_is_dead(void) { return 0; } +static inline int +ftrace_create_function_files(struct trace_array *tr, +			     struct dentry *parent) +{ +	return 0; +} +static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER */ + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +void ftrace_create_filter_files(struct ftrace_ops *ops, +				struct dentry *parent); +void ftrace_destroy_filter_files(struct ftrace_ops *ops); +#else +/* + * The ops parameter passed in is usually undefined. + * This must be a macro. 
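/*
 * A user-space sketch of the set_graph_function/set_graph_notrace lookups
 * shown above: an empty filter list matches every address, while the notrace
 * list only rejects addresses explicitly added to it. The arrays and helper
 * names here are illustrative, not the kernel's.
 */
#include <stdio.h>

#define MAX_FUNCS 32

static unsigned long graph_funcs[MAX_FUNCS] = { 0x1000, 0x2000 };
static int graph_count = 2;		/* 0 would mean "trace everything" */

static unsigned long notrace_funcs[MAX_FUNCS] = { 0x3000 };
static int notrace_count = 1;

static int graph_addr_matches(unsigned long addr)
{
	int i;

	if (!graph_count)
		return 1;
	for (i = 0; i < graph_count; i++)
		if (addr == graph_funcs[i])
			return 1;
	return 0;
}

static int graph_addr_blocked(unsigned long addr)
{
	int i;

	for (i = 0; i < notrace_count; i++)
		if (addr == notrace_funcs[i])
			return 1;
	return 0;
}

int main(void)
{
	unsigned long addr = 0x2000;

	printf("trace 0x%lx? %d\n", addr,
	       graph_addr_matches(addr) && !graph_addr_blocked(addr));
	return 0;
}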
+ */ +#define ftrace_create_filter_files(ops, parent) do { } while (0) +#define ftrace_destroy_filter_files(ops) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ + +int ftrace_event_is_function(struct ftrace_event_call *call);  /*   * struct trace_parser - servers for reading the user input separated by spaces @@ -606,6 +931,11 @@ enum trace_iterator_flags {  	TRACE_ITER_SLEEP_TIME		= 0x40000,  	TRACE_ITER_GRAPH_TIME		= 0x80000,  	TRACE_ITER_RECORD_CMD		= 0x100000, +	TRACE_ITER_OVERWRITE		= 0x200000, +	TRACE_ITER_STOP_ON_FREE		= 0x400000, +	TRACE_ITER_IRQ_INFO		= 0x800000, +	TRACE_ITER_MARKERS		= 0x1000000, +	TRACE_ITER_FUNCTION		= 0x2000000,  };  /* @@ -644,16 +974,10 @@ static inline void trace_branch_disable(void)  /* set ring buffers to default size if not already done so */  int tracing_update_buffers(void); -/* trace event type bit fields, not numeric */ -enum { -	TRACE_EVENT_TYPE_PRINTF		= 1, -	TRACE_EVENT_TYPE_RAW		= 2, -}; -  struct ftrace_event_field {  	struct list_head	link; -	char			*name; -	char			*type; +	const char		*name; +	const char		*type;  	int			filter_type;  	int			offset;  	int			size; @@ -661,24 +985,46 @@ struct ftrace_event_field {  };  struct event_filter { -	int			n_preds; -	struct filter_pred	**preds; +	int			n_preds;	/* Number assigned */ +	int			a_preds;	/* allocated */ +	struct filter_pred	*preds; +	struct filter_pred	*root;  	char			*filter_string;  };  struct event_subsystem {  	struct list_head	list;  	const char		*name; -	struct dentry		*entry;  	struct event_filter	*filter; -	int			nr_events; +	int			ref_count; +}; + +struct ftrace_subsystem_dir { +	struct list_head		list; +	struct event_subsystem		*subsystem; +	struct trace_array		*tr; +	struct dentry			*entry; +	int				ref_count; +	int				nr_events;  }; +#define FILTER_PRED_INVALID	((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT	(1 << 15) +#define FILTER_PRED_FOLD	(1 << 15) + +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. 
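/*
 * A small stand-alone illustration of the index layout described above:
 * predicate references fit in an unsigned short, the two most significant bit
 * positions are reserved for flags, and the remaining 14 bits index up to
 * 16384 predicates. The pred variable and macro names below are illustrative
 * only.
 */
#include <assert.h>
#include <stdio.h>

#define PRED_IS_RIGHT	(1 << 15)	/* MSB flag, shared with FOLD */
#define MAX_PRED	16384		/* 2^14 usable index values */

int main(void)
{
	unsigned short pred = 5 | PRED_IS_RIGHT;

	static_assert(MAX_PRED == (1 << 14), "index space is 2^14");

	printf("index=%u is_right=%d\n",
	       (unsigned)(pred & (MAX_PRED - 1)), !!(pred & PRED_IS_RIGHT));
	return 0;
}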
+ */ +#define MAX_FILTER_PRED		16384 +  struct filter_pred;  struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, -				 int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);  typedef int (*regex_match_func)(char *str, struct regex *r, int len); @@ -700,59 +1046,266 @@ struct filter_pred {  	filter_pred_fn_t 	fn;  	u64 			val;  	struct regex		regex; -	char 			*field_name; +	unsigned short		*ops; +	struct ftrace_event_field *field;  	int 			offset;  	int 			not;  	int 			op; -	int 			pop_n; +	unsigned short		index; +	unsigned short		parent; +	unsigned short		left; +	unsigned short		right;  }; -extern struct list_head ftrace_common_fields; -  extern enum regex_type  filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_file *file,  			       struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_call *call, +extern int apply_event_filter(struct ftrace_event_file *file,  			      char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  					char *filter_string);  extern void print_subsystem_event_filter(struct event_subsystem *system,  					 struct trace_seq *s);  extern int filter_assign_type(const char *type); +extern int create_event_filter(struct ftrace_event_call *call, +			       char *filter_str, bool set_str, +			       struct event_filter **filterp); +extern void free_event_filter(struct event_filter *filter); -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call); +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name); -static inline int -filter_check_discard(struct ftrace_event_call *call, void *rec, -		     struct ring_buffer *buffer, -		     struct ring_buffer_event *event) -{ -	if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && -	    !filter_match_preds(call->filter, rec)) { -		ring_buffer_discard_commit(buffer, event); -		return 1; -	} +extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr); -	return 0; -} +extern struct ftrace_event_file *find_event_file(struct trace_array *tr, +						 const char *system, +						 const char *event); -extern void trace_event_enable_cmd_record(bool enable); +static inline void *event_file_data(struct file *filp) +{ +	return ACCESS_ONCE(file_inode(filp)->i_private); +}  extern struct mutex event_mutex;  extern struct list_head ftrace_events; +extern const struct file_operations event_trigger_fops; + +extern int register_trigger_cmds(void); +extern void clear_event_triggers(struct trace_array *tr); + +struct event_trigger_data { +	unsigned long			count; +	int				ref; +	struct event_trigger_ops	*ops; +	struct event_command		*cmd_ops; +	struct event_filter __rcu	*filter; +	char				*filter_str; +	void				*private_data; +	struct list_head		list; +}; + +/** + * struct event_trigger_ops - callbacks for trace event triggers + * + * The methods in this structure provide per-event trigger hooks for + * various trigger operations. + * + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @func: The trigger 'probe' function called when the triggering + *	event occurs.  
The data passed into this callback is the data + *	that was supplied to the event_command @reg() function that + *	registered the trigger (see struct event_command). + * + * @init: An optional initialization function called for the trigger + *	when the trigger is registered (via the event_command reg() + *	function).  This can be used to perform per-trigger + *	initialization such as incrementing a per-trigger reference + *	count, for instance.  This is usually implemented by the + *	generic utility function @event_trigger_init() (see + *	trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + *	trigger when the trigger is unregistered (via the + *	event_command @reg() function).  This can be used to perform + *	per-trigger de-initialization such as decrementing a + *	per-trigger reference count and freeing corresponding trigger + *	data, for instance.  This is usually implemented by the + *	generic utility function @event_trigger_free() (see + *	trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + *	itself.  This is usually implemented by a wrapper function + *	that calls the generic utility function @event_trigger_print() + *	(see trace_event_triggers.c). + */ +struct event_trigger_ops { +	void			(*func)(struct event_trigger_data *data); +	int			(*init)(struct event_trigger_ops *ops, +					struct event_trigger_data *data); +	void			(*free)(struct event_trigger_ops *ops, +					struct event_trigger_data *data); +	int			(*print)(struct seq_file *m, +					 struct event_trigger_ops *ops, +					 struct event_trigger_data *data); +}; + +/** + * struct event_command - callbacks and data members for event commands + * + * Event commands are invoked by users by writing the command name + * into the 'trigger' file associated with a trace event.  The + * parameters associated with a specific invocation of an event + * command are used to create an event trigger instance, which is + * added to the list of trigger instances associated with that trace + * event.  When the event is hit, the set of triggers associated with + * that event is invoked. + * + * The data members in this structure provide per-event command data + * for various event commands. + * + * All the data members below, except for @post_trigger, must be set + * for each event command. + * + * @name: The unique name that identifies the event command.  This is + *	the name used when setting triggers via trigger files. + * + * @trigger_type: A unique id that identifies the event command + *	'type'.  This value has two purposes, the first to ensure that + *	only one trigger of the same type can be set at a given time + *	for a particular event e.g. it doesn't make sense to have both + *	a traceon and traceoff trigger attached to a single event at + *	the same time, so traceon and traceoff have the same type + *	though they have different names.  The @trigger_type value is + *	also used as a bit value for deferring the actual trigger + *	action until after the current event is finished.  Some + *	commands need to do this if they themselves log to the trace + *	buffer (see the @post_trigger() member below).  @trigger_type + *	values are defined by adding new values to the trigger_type + *	enum in include/linux/ftrace_event.h. + * + * @post_trigger: A flag that says whether or not this command needs + *	to have its action delayed until after the current event has + *	been closed.  
Some triggers need to avoid being invoked while + *	an event is currently in the process of being logged, since + *	the trigger may itself log data into the trace buffer.  Thus + *	we make sure the current event is committed before invoking + *	those triggers.  To do that, the trigger invocation is split + *	in two - the first part checks the filter using the current + *	trace record; if a command has the @post_trigger flag set, it + *	sets a bit for itself in the return value, otherwise it + *	directly invokes the trigger.  Once all commands have been + *	either invoked or set their return flag, the current record is + *	either committed or discarded.  At that point, if any commands + *	have deferred their triggers, those commands are finally + *	invoked following the close of the current event.  In other + *	words, if the event_trigger_ops @func() probe implementation + *	itself logs to the trace buffer, this flag should be set, + *	otherwise it can be left unspecified. + * + * All the methods below, except for @set_filter(), must be + * implemented. + * + * @func: The callback function responsible for parsing and + *	registering the trigger written to the 'trigger' file by the + *	user.  It allocates the trigger instance and registers it with + *	the appropriate trace event.  It makes use of the other + *	event_command callback functions to orchestrate this, and is + *	usually implemented by the generic utility function + *	@event_trigger_callback() (see trace_event_triggers.c). + * + * @reg: Adds the trigger to the list of triggers associated with the + *	event, and enables the event trigger itself, after + *	initializing it (via the event_trigger_ops @init() function). + *	This is also where commands can use the @trigger_type value to + *	make the decision as to whether or not multiple instances of + *	the trigger should be allowed.  This is usually implemented by + *	the generic utility function @register_trigger() (see + *	trace_event_triggers.c). + * + * @unreg: Removes the trigger from the list of triggers associated + *	with the event, and disables the event trigger itself, after + *	initializing it (via the event_trigger_ops @free() function). + *	This is usually implemented by the generic utility function + *	@unregister_trigger() (see trace_event_triggers.c). + * + * @set_filter: An optional function called to parse and set a filter + *	for the trigger.  If no @set_filter() method is set for the + *	event command, filters set by the user for the command will be + *	ignored.  This is usually implemented by the generic utility + *	function @set_trigger_filter() (see trace_event_triggers.c). + * + * @get_trigger_ops: The callback function invoked to retrieve the + *	event_trigger_ops implementation associated with the command. 
+ */ +struct event_command { +	struct list_head	list; +	char			*name; +	enum event_trigger_type	trigger_type; +	bool			post_trigger; +	int			(*func)(struct event_command *cmd_ops, +					struct ftrace_event_file *file, +					char *glob, char *cmd, char *params); +	int			(*reg)(char *glob, +				       struct event_trigger_ops *ops, +				       struct event_trigger_data *data, +				       struct ftrace_event_file *file); +	void			(*unreg)(char *glob, +					 struct event_trigger_ops *ops, +					 struct event_trigger_data *data, +					 struct ftrace_event_file *file); +	int			(*set_filter)(char *filter_str, +					      struct event_trigger_data *data, +					      struct ftrace_event_file *file); +	struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); +}; + +extern int trace_event_enable_disable(struct ftrace_event_file *file, +				      int enable, int soft_disable); +extern int tracing_alloc_snapshot(void); +  extern const char *__start___trace_bprintk_fmt[];  extern const char *__stop___trace_bprintk_fmt[]; +extern const char *__start___tracepoint_str[]; +extern const char *__stop___tracepoint_str[]; + +void trace_printk_init_buffers(void); +void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. + */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) +  #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print)		\ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\  	extern struct ftrace_event_call					\ -	__attribute__((__aligned__(4))) event_##call; +	__aligned(4) event_##call;  #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print)		\ -	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter)	\ +	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter)  #include "trace_entries.h" +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) +int perf_ftrace_event_register(struct ftrace_event_call *call, +			       enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif +  #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 00000000000..40a14cbcf8e --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,198 @@ +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace_clock.h> + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. 
What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is writen within the statics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ +	u64 start; +	u64 stop; +	u64 delta; +	u64 stddev; +	u64 seed; +	u64 last_seed; +	unsigned int avg; +	unsigned int std = 0; + +	/* Only run if the tracepoint is actually active */ +	if (!trace_benchmark_event_enabled()) +		return; + +	local_irq_disable(); +	start = trace_clock_local(); +	trace_benchmark_event(bm_str); +	stop = trace_clock_local(); +	local_irq_enable(); + +	bm_cnt++; + +	delta = stop - start; + +	/* +	 * The first read is cold cached, keep it separate from the +	 * other calculations. +	 */ +	if (bm_cnt == 1) { +		bm_first = delta; +		scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, +			  "first=%llu [COLD CACHED]", bm_first); +		return; +	} + +	bm_last = delta; + +	if (delta > bm_max) +		bm_max = delta; +	if (!bm_min || delta < bm_min) +		bm_min = delta; + +	/* +	 * When bm_cnt is greater than UINT_MAX, it breaks the statistics +	 * accounting. Freeze the statistics when that happens. +	 * We should have enough data for the avg and stddev anyway. +	 */ +	if (bm_cnt > UINT_MAX) { +		scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, +		    "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", +			  bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); +		return; +	} + +	bm_total += delta; +	bm_totalsq += delta * delta; + + +	if (bm_cnt > 1) { +		/* +		 * Apply Welford's method to calculate standard deviation: +		 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) +		 */ +		stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; +		do_div(stddev, (u32)bm_cnt); +		do_div(stddev, (u32)bm_cnt - 1); +	} else +		stddev = 0; + +	delta = bm_total; +	do_div(delta, bm_cnt); +	avg = delta; + +	if (stddev > 0) { +		int i = 0; +		/* +		 * stddev is the square of standard deviation but +		 * we want the actualy number. Use the average +		 * as our seed to find the std. +		 * +		 * The next try is: +		 *  x = (x + N/x) / 2 +		 * +		 * Where N is the squared number to find the square +		 * root of. +		 */ +		seed = avg; +		do { +			last_seed = seed; +			seed = stddev; +			if (!last_seed) +				break; +			do_div(seed, last_seed); +			seed += last_seed; +			do_div(seed, 2); +		} while (i++ < 10 && last_seed != seed); + +		std = seed; +	} + +	scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, +		  "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", +		  bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + +	bm_std = std; +	bm_avg = avg; +	bm_stddev = stddev; +} + +static int benchmark_event_kthread(void *arg) +{ +	/* sleep a bit to make sure the tracepoint gets activated */ +	msleep(100); + +	while (!kthread_should_stop()) { + +		trace_do_benchmark(); + +		/* +		 * We don't go to sleep, but let others +		 * run as well. +		 */ +		cond_resched(); +	} + +	return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. 
+ */ +void trace_benchmark_reg(void) +{ +	bm_event_thread = kthread_run(benchmark_event_kthread, +				      NULL, "event_benchmark"); +	WARN_ON(!bm_event_thread); +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. + */ +void trace_benchmark_unreg(void) +{ +	if (!bm_event_thread) +		return; + +	kthread_stop(bm_event_thread); + +	strcpy(bm_str, "START"); +	bm_total = 0; +	bm_totalsq = 0; +	bm_last = 0; +	bm_max = 0; +	bm_min = 0; +	bm_cnt = 0; +	/* These don't need to be reset but reset them anyway */ +	bm_first = 0; +	bm_std = 0; +	bm_avg = 0; +	bm_stddev = 0; +} diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 00000000000..3c1df1df4e2 --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include <linux/tracepoint.h> + +extern void trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN		128 + +TRACE_EVENT_FN(benchmark_event, + +	TP_PROTO(const char *str), + +	TP_ARGS(str), + +	TP_STRUCT__entry( +		__array(	char,	str,	BENCHMARK_EVENT_STRLEN	) +	), + +	TP_fast_assign( +		memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); +	), + +	TP_printk("%s", __entry->str), + +	trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8d3538b4ea5..697fb9bac8f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  {  	struct ftrace_event_call *call = &event_branch;  	struct trace_array *tr = branch_tracer; +	struct trace_array_cpu *data;  	struct ring_buffer_event *event;  	struct trace_branch *entry;  	struct ring_buffer *buffer; @@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) +	data = per_cpu_ptr(tr->trace_buffer.data, cpu); +	if (atomic_inc_return(&data->disabled) != 1)  		goto out;  	pc = preempt_count(); -	buffer = tr->buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,  					  sizeof(*entry), flags, pc);  	if (!event) @@ -76,11 +78,11 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  	entry->line = f->line;  	entry->correct = val == expect; -	if (!filter_check_discard(call, entry, buffer, event)) -		ring_buffer_unlock_commit(buffer, event); +	if (!call_filter_check_discard(call, entry, buffer, event)) +		__buffer_unlock_commit(buffer, event);   out: -	atomic_dec(&tr->data[cpu]->disabled); +	atomic_dec(&data->disabled);  	local_irq_restore(flags);  } @@ -199,7 +201,7 @@ __init static int init_branch_tracer(void)  	}  	return register_tracer(&branch_trace);  } -device_initcall(init_branch_tracer); +core_initcall(init_branch_tracer);  #else  static inline diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d55db..57b67b1f24d 100644 --- 
a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -21,8 +21,6 @@  #include <linux/ktime.h>  #include <linux/trace_clock.h> -#include "trace.h" -  /*   * trace_clock_local(): the simplest and least coherent tracing clock.   * @@ -44,9 +42,10 @@ u64 notrace trace_clock_local(void)  	return clock;  } +EXPORT_SYMBOL_GPL(trace_clock_local);  /* - * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * trace_clock(): 'between' trace clock. Not completely serialized,   * but not completely incorrect when crossing CPUs either.   *   * This is based on cpu_clock(), which will allow at most ~1 jiffy of @@ -58,6 +57,17 @@ u64 notrace trace_clock(void)  	return local_clock();  } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + * Note that this use of jiffies_64 is not completely safe on + * 32-bit systems. But the window is tiny, and the effect if + * we are affected is that we will have an obviously bogus + * timestamp on a trace event - i.e. not life threatening. + */ +u64 notrace trace_clock_jiffies(void) +{ +	return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); +}  /*   * trace_clock_global(): special globally coherent trace clock @@ -86,7 +96,7 @@ u64 notrace trace_clock_global(void)  	local_irq_save(flags);  	this_cpu = raw_smp_processor_id(); -	now = cpu_clock(this_cpu); +	now = sched_clock_cpu(this_cpu);  	/*  	 * If in an NMI context then dont risk lockups and return the  	 * cpu_clock() time: @@ -113,3 +123,15 @@ u64 notrace trace_clock_global(void)  	return now;  } + +static atomic64_t trace_counter; + +/* + * trace_clock_counter(): simply an atomic counter. + * Use the trace_counter "counter" for cases where you do not care + * about timings, but are interested in strict ordering. + */ +u64 notrace trace_clock_counter(void) +{ +	return atomic64_add_return(1, &trace_counter); +} diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfecaf13e..e2d027ac66a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -27,7 +27,7 @@   *	  in the structure.   *   *   * for structures within structures, the format of the internal - *	structure is layed out. This allows the internal structure + *	structure is laid out. This allows the internal structure   *	to be deciphered for the format file. Although these macros   *	may become out of sync with the internal structure, they   *	will create a compile error if it happens. 
Since the @@ -53,9 +53,9 @@   */  /* - * Function trace entry - function address and parent function addres: + * Function trace entry - function address and parent function address:   */ -FTRACE_ENTRY(function, ftrace_entry, +FTRACE_ENTRY_REG(function, ftrace_entry,  	TRACE_FN, @@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry,  		__field(	unsigned long,	parent_ip	)  	), -	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + +	FILTER_TRACE_FN, + +	perf_ftrace_event_register  );  /* Function call entry */ @@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,  		__field_desc(	int,		graph_ent,	depth		)  	), -	F_printk("--> %lx (%d)", __entry->func, __entry->depth) +	F_printk("--> %lx (%d)", __entry->func, __entry->depth), + +	FILTER_OTHER  );  /* Function return entry */ @@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,  	F_printk("<-- %lx (%d) (start: %llx  end: %llx) over: %d",  		 __entry->func, __entry->depth,  		 __entry->calltime, __entry->rettime, -		 __entry->depth) +		 __entry->depth), + +	FILTER_OTHER  );  /* @@ -109,12 +117,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,   */  #define FTRACE_CTX_FIELDS					\  	__field(	unsigned int,	prev_pid	)	\ +	__field(	unsigned int,	next_pid	)	\ +	__field(	unsigned int,	next_cpu	)       \  	__field(	unsigned char,	prev_prio	)	\  	__field(	unsigned char,	prev_state	)	\ -	__field(	unsigned int,	next_pid	)	\  	__field(	unsigned char,	next_prio	)	\ -	__field(	unsigned char,	next_state	)	\ -	__field(	unsigned int,	next_cpu	) +	__field(	unsigned char,	next_state	)  FTRACE_ENTRY(context_switch, ctx_switch_entry, @@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,  	F_printk("%u:%u:%u  ==> %u:%u:%u [%03u]",  		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,  		 __entry->next_pid, __entry->next_prio, __entry->next_state, -		 __entry->next_cpu -		) +		 __entry->next_cpu), + +	FILTER_OTHER  );  /* @@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,  	F_printk("%u:%u:%u  ==+ %u:%u:%u [%03u]",  		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,  		 __entry->next_pid, __entry->next_prio, __entry->next_state, -		 __entry->next_cpu -		) +		 __entry->next_cpu), + +	FILTER_OTHER  );  /* @@ -156,19 +166,29 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,  #define FTRACE_STACK_ENTRIES	8 +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif +  FTRACE_ENTRY(kernel_stack, stack_entry,  	TRACE_STACK,  	F_STRUCT( -		__array(	unsigned long,	caller, FTRACE_STACK_ENTRIES	) +		__field(	int,		size	) +		__dynamic_array(unsigned long,	caller	)  	), -	F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" -		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", +	F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" +		 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" +		 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",  		 __entry->caller[0], __entry->caller[1], __entry->caller[2],  		 __entry->caller[3], __entry->caller[4], __entry->caller[5], -		 __entry->caller[6], __entry->caller[7]) +		 __entry->caller[6], __entry->caller[7]), + +	FILTER_OTHER  );  FTRACE_ENTRY(user_stack, userstack_entry, @@ -180,11 +200,14 @@ FTRACE_ENTRY(user_stack, userstack_entry,  		__array(	unsigned long,	caller, FTRACE_STACK_ENTRIES	)  	), -	F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" -		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> 
(%08lx)\n\t=> (%08lx)\n", +	F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" +		 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" +		 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",  		 __entry->caller[0], __entry->caller[1], __entry->caller[2],  		 __entry->caller[3], __entry->caller[4], __entry->caller[5], -		 __entry->caller[6], __entry->caller[7]) +		 __entry->caller[6], __entry->caller[7]), + +	FILTER_OTHER  );  /* @@ -200,8 +223,10 @@ FTRACE_ENTRY(bprint, bprint_entry,  		__dynamic_array(	u32,	buf	)  	), -	F_printk("%08lx fmt:%p", -		 __entry->ip, __entry->fmt) +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->fmt), + +	FILTER_OTHER  );  FTRACE_ENTRY(print, print_entry, @@ -213,8 +238,25 @@ FTRACE_ENTRY(print, print_entry,  		__dynamic_array(	char,	buf	)  	), -	F_printk("%08lx %s", -		 __entry->ip, __entry->buf) +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->buf), + +	FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + +	TRACE_BPUTS, + +	F_STRUCT( +		__field(	unsigned long,	ip	) +		__field(	const char *,	str	) +	), + +	F_printk("%pf: %s", +		 (void *)__entry->ip, __entry->str), + +	FILTER_OTHER  );  FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -233,7 +275,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,  	F_printk("%lx %lx %lx %d %x %x",  		 (unsigned long)__entry->phys, __entry->value, __entry->pc, -		 __entry->map_id, __entry->opcode, __entry->width) +		 __entry->map_id, __entry->opcode, __entry->width), + +	FILTER_OTHER  );  FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -251,7 +295,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,  	F_printk("%lx %lx %lx %d %x",  		 (unsigned long)__entry->phys, __entry->virt, __entry->len, -		 __entry->map_id, __entry->opcode) +		 __entry->map_id, __entry->opcode), + +	FILTER_OTHER  ); @@ -271,6 +317,8 @@ FTRACE_ENTRY(branch, trace_branch,  	F_printk("%u:%s:%s (%u)",  		 __entry->line, -		 __entry->func, __entry->file, __entry->correct) +		 __entry->func, __entry->file, __entry->correct), + +	FILTER_OTHER  ); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 39c059ca670..5d12bb407b4 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -21,9 +21,59 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])  /* Count the events in use (per event id, not per instance) */  static int	total_ref_count; -static int perf_trace_event_init(struct ftrace_event_call *tp_event, +static int perf_trace_event_perm(struct ftrace_event_call *tp_event,  				 struct perf_event *p_event)  { +	if (tp_event->perf_perm) { +		int ret = tp_event->perf_perm(tp_event, p_event); +		if (ret) +			return ret; +	} + +	/* The ftrace function trace is allowed only for root. */ +	if (ftrace_event_is_function(tp_event)) { +		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) +			return -EPERM; + +		/* +		 * We don't allow user space callchains for  function trace +		 * event, due to issues with page faults while tracing page +		 * fault handler and its overall trickiness nature. +		 */ +		if (!p_event->attr.exclude_callchain_user) +			return -EINVAL; + +		/* +		 * Same reason to disable user stack dump as for user space +		 * callchains above. +		 */ +		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) +			return -EINVAL; +	} + +	/* No tracing, just counting, so no obvious leak */ +	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) +		return 0; + +	/* Some events are ok to be traced by non-root users... 
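/*
 * A boiled-down user-space model of the permission policy implemented in
 * perf_trace_event_perm() here: pure counting is always fine, raw sample
 * payloads are restricted to privileged users, and an event explicitly marked
 * CAP_ANY may be traced by the task that owns it. struct perf_request and
 * check_perm() are illustrative names only.
 */
#include <stdbool.h>
#include <stdio.h>

struct perf_request {
	bool wants_raw_samples;		/* PERF_SAMPLE_RAW requested */
	bool attached_to_task;		/* PERF_ATTACH_TASK */
	bool event_cap_any;		/* TRACE_EVENT_FL_CAP_ANY */
	bool is_admin;			/* CAP_SYS_ADMIN */
};

static int check_perm(const struct perf_request *req)
{
	if (!req->wants_raw_samples)
		return 0;			/* counting only: no data leak */
	if (req->attached_to_task && req->event_cap_any)
		return 0;			/* event opted in for non-root */
	return req->is_admin ? 0 : -1;		/* raw data stays privileged */
}

int main(void)
{
	struct perf_request req = { .wants_raw_samples = true };

	printf("allowed: %s\n", check_perm(&req) ? "no" : "yes");
	return 0;
}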
*/ +	if (p_event->attach_state == PERF_ATTACH_TASK) { +		if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) +			return 0; +	} + +	/* +	 * ...otherwise raw tracepoint data can be a severe data leak, +	 * only allow root to have these. +	 */ +	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	return 0; +} + +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, +				struct perf_event *p_event) +{  	struct hlist_head __percpu *list;  	int ret = -ENOMEM;  	int cpu; @@ -54,7 +104,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,  		}  	} -	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); +	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);  	if (ret)  		goto fail; @@ -79,10 +129,73 @@ fail:  	return ret;  } +static void perf_trace_event_unreg(struct perf_event *p_event) +{ +	struct ftrace_event_call *tp_event = p_event->tp_event; +	int i; + +	if (--tp_event->perf_refcount > 0) +		goto out; + +	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + +	/* +	 * Ensure our callback won't be called anymore. The buffers +	 * will be freed after that. +	 */ +	tracepoint_synchronize_unregister(); + +	free_percpu(tp_event->perf_events); +	tp_event->perf_events = NULL; + +	if (!--total_ref_count) { +		for (i = 0; i < PERF_NR_CONTEXTS; i++) { +			free_percpu(perf_trace_buf[i]); +			perf_trace_buf[i] = NULL; +		} +	} +out: +	module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ +	struct ftrace_event_call *tp_event = p_event->tp_event; +	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ +	struct ftrace_event_call *tp_event = p_event->tp_event; +	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, +				 struct perf_event *p_event) +{ +	int ret; + +	ret = perf_trace_event_perm(tp_event, p_event); +	if (ret) +		return ret; + +	ret = perf_trace_event_reg(tp_event, p_event); +	if (ret) +		return ret; + +	ret = perf_trace_event_open(p_event); +	if (ret) { +		perf_trace_event_unreg(p_event); +		return ret; +	} + +	return 0; +} +  int perf_trace_init(struct perf_event *p_event)  {  	struct ftrace_event_call *tp_event; -	int event_id = p_event->attr.config; +	u64 event_id = p_event->attr.config;  	int ret = -EINVAL;  	mutex_lock(&event_mutex); @@ -101,6 +214,14 @@ int perf_trace_init(struct perf_event *p_event)  	return ret;  } +void perf_trace_destroy(struct perf_event *p_event) +{ +	mutex_lock(&event_mutex); +	perf_trace_event_close(p_event); +	perf_trace_event_unreg(p_event); +	mutex_unlock(&event_mutex); +} +  int perf_trace_add(struct perf_event *p_event, int flags)  {  	struct ftrace_event_call *tp_event = p_event->tp_event; @@ -117,47 +238,18 @@ int perf_trace_add(struct perf_event *p_event, int flags)  	list = this_cpu_ptr(pcpu_list);  	hlist_add_head_rcu(&p_event->hlist_entry, list); -	return 0; +	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);  }  void perf_trace_del(struct perf_event *p_event, int flags)  { -	hlist_del_rcu(&p_event->hlist_entry); -} - -void perf_trace_destroy(struct perf_event *p_event) -{  	struct ftrace_event_call *tp_event = p_event->tp_event; -	int i; - -	mutex_lock(&event_mutex); -	if (--tp_event->perf_refcount > 0) -		goto out; - -	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - -	/* -	 * Ensure our callback won't be called 
anymore. The buffers -	 * will be freed after that. -	 */ -	tracepoint_synchronize_unregister(); - -	free_percpu(tp_event->perf_events); -	tp_event->perf_events = NULL; - -	if (!--total_ref_count) { -		for (i = 0; i < PERF_NR_CONTEXTS; i++) { -			free_percpu(perf_trace_buf[i]); -			perf_trace_buf[i] = NULL; -		} -	} -out: -	module_put(tp_event->mod); -	mutex_unlock(&event_mutex); +	hlist_del_rcu(&p_event->hlist_entry); +	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);  } -__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, -				       struct pt_regs *regs, int *rctxp) +void *perf_trace_buf_prepare(int size, unsigned short type, +			     struct pt_regs *regs, int *rctxp)  {  	struct trace_entry *entry;  	unsigned long flags; @@ -166,6 +258,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,  	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); +	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, +			"perf buffer not large enough")) +		return NULL; +  	pc = preempt_count();  	*rctxp = perf_swevent_get_recursion_context(); @@ -185,3 +281,90 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,  	return raw_data;  }  EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, +			  struct ftrace_ops *ops, struct pt_regs *pt_regs) +{ +	struct ftrace_entry *entry; +	struct hlist_head *head; +	struct pt_regs regs; +	int rctx; + +	head = this_cpu_ptr(event_function.perf_events); +	if (hlist_empty(head)) +		return; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ +		    sizeof(u64)) - sizeof(u32)) + +	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + +	perf_fetch_caller_regs(®s); + +	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); +	if (!entry) +		return; + +	entry->ip = ip; +	entry->parent_ip = parent_ip; +	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, +			      1, ®s, head, NULL); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ +	struct ftrace_ops *ops = &event->ftrace_ops; + +	ops->flags |= FTRACE_OPS_FL_CONTROL; +	ops->func = perf_ftrace_function_call; +	return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ +	struct ftrace_ops *ops = &event->ftrace_ops; +	int ret = unregister_ftrace_function(ops); +	ftrace_free_filter(ops); +	return ret; +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ +	ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ +	ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, +			       enum trace_reg type, void *data) +{ +	switch (type) { +	case TRACE_REG_REGISTER: +	case TRACE_REG_UNREGISTER: +		break; +	case TRACE_REG_PERF_REGISTER: +	case TRACE_REG_PERF_UNREGISTER: +		return 0; +	case TRACE_REG_PERF_OPEN: +		return perf_ftrace_function_register(data); +	case TRACE_REG_PERF_CLOSE: +		return perf_ftrace_function_unregister(data); +	case TRACE_REG_PERF_ADD: +		perf_ftrace_function_enable(data); +		return 0; +	case TRACE_REG_PERF_DEL: +		perf_ftrace_function_disable(data); +		return 0; +	} + +	return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0725eeab193..2de53628689 
100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -28,9 +28,44 @@  DEFINE_MUTEX(event_mutex);  LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); +static LIST_HEAD(ftrace_common_fields); -struct list_head * +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +#define SYSTEM_FL_FREE_NAME		(1 << 31) + +static inline int system_refcount(struct event_subsystem *system) +{ +	return system->ref_count & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_inc(struct event_subsystem *system) +{ +	return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_dec(struct event_subsystem *system) +{ +	return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; +} + +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file)			\ +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ +		list_for_each_entry(file, &tr->events, list) + +#define do_for_each_event_file_safe(tr, file)			\ +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ +		struct ftrace_event_file *___n;				\ +		list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file()		\ +	} + +static struct list_head *  trace_get_fields(struct ftrace_event_call *event_call)  {  	if (!event_call->class->get_fields) @@ -38,23 +73,45 @@ trace_get_fields(struct ftrace_event_call *event_call)  	return event_call->class->get_fields(event_call);  } +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ +	struct ftrace_event_field *field; + +	list_for_each_entry(field, head, link) { +		if (!strcmp(field->name, name)) +			return field; +	} + +	return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ +	struct ftrace_event_field *field; +	struct list_head *head; + +	field = __find_event_field(&ftrace_common_fields, name); +	if (field) +		return field; + +	head = trace_get_fields(call); +	return __find_event_field(head, name); +} +  static int __trace_define_field(struct list_head *head, const char *type,  				const char *name, int offset, int size,  				int is_signed, int filter_type)  {  	struct ftrace_event_field *field; -	field = kzalloc(sizeof(*field), GFP_KERNEL); +	field = kmem_cache_alloc(field_cachep, GFP_TRACE);  	if (!field) -		goto err; - -	field->name = kstrdup(name, GFP_KERNEL); -	if (!field->name) -		goto err; +		return -ENOMEM; -	field->type = kstrdup(type, GFP_KERNEL); -	if (!field->type) -		goto err; +	field->name = name; +	field->type = type;  	if (filter_type == FILTER_OTHER)  		field->filter_type = filter_assign_type(type); @@ -68,13 +125,6 @@ static int __trace_define_field(struct list_head *head, const char *type,  	list_add(&field->link, head);  	return 0; - -err: -	if (field) -		kfree(field->name); -	kfree(field); - -	return -ENOMEM;  }  int trace_define_field(struct ftrace_event_call *call, const char *type, @@ -110,12 +160,11 @@ static int trace_define_common_fields(void)  	__common_field(unsigned char, flags);  	__common_field(unsigned char, preempt_count);  	__common_field(int, pid); -	__common_field(int, lock_depth);  	return ret;  } -void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct ftrace_event_call *call)  {  	struct ftrace_event_field *field, *next;  	struct list_head *head; @@ -123,9 +172,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)  	head = 
trace_get_fields(call);  	list_for_each_entry_safe(field, next, head, link) {  		list_del(&field->link); -		kfree(field->type); -		kfree(field->name); -		kfree(field); +		kmem_cache_free(field_cachep, field);  	}  } @@ -141,29 +188,68 @@ int trace_event_raw_init(struct ftrace_event_call *call)  }  EXPORT_SYMBOL_GPL(trace_event_raw_init); -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, +				  struct ftrace_event_file *ftrace_file, +				  unsigned long len) +{ +	struct ftrace_event_call *event_call = ftrace_file->event_call; + +	local_save_flags(fbuffer->flags); +	fbuffer->pc = preempt_count(); +	fbuffer->ftrace_file = ftrace_file; + +	fbuffer->event = +		trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, +						event_call->event.type, len, +						fbuffer->flags, fbuffer->pc); +	if (!fbuffer->event) +		return NULL; + +	fbuffer->entry = ring_buffer_event_data(fbuffer->event); +	return fbuffer->entry; +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +{ +	event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, +				    fbuffer->event, fbuffer->entry, +				    fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); + +int ftrace_event_reg(struct ftrace_event_call *call, +		     enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; + +	WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));  	switch (type) {  	case TRACE_REG_REGISTER: -		return tracepoint_probe_register(call->name, +		return tracepoint_probe_register(call->tp,  						 call->class->probe, -						 call); +						 file);  	case TRACE_REG_UNREGISTER: -		tracepoint_probe_unregister(call->name, +		tracepoint_probe_unregister(call->tp,  					    call->class->probe, -					    call); +					    file);  		return 0;  #ifdef CONFIG_PERF_EVENTS  	case TRACE_REG_PERF_REGISTER: -		return tracepoint_probe_register(call->name, +		return tracepoint_probe_register(call->tp,  						 call->class->perf_probe,  						 call);  	case TRACE_REG_PERF_UNREGISTER: -		tracepoint_probe_unregister(call->name, +		tracepoint_probe_unregister(call->tp,  					    call->class->perf_probe,  					    call);  		return 0; +	case TRACE_REG_PERF_OPEN: +	case TRACE_REG_PERF_CLOSE: +	case TRACE_REG_PERF_ADD: +	case TRACE_REG_PERF_DEL: +		return 0;  #endif  	}  	return 0; @@ -172,54 +258,108 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);  void trace_event_enable_cmd_record(bool enable)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) +	do_for_each_event_file(tr, file) { + +		if (!(file->flags & FTRACE_EVENT_FL_ENABLED))  			continue;  		if (enable) {  			tracing_start_cmdline_record(); -			call->flags |= TRACE_EVENT_FL_RECORDED_CMD; +			set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  		} else {  			tracing_stop_cmdline_record(); -			call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; +			clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  		} -	} +	} while_for_each_event_file();  	mutex_unlock(&event_mutex);  } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, -					int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, +					 int enable, int soft_disable)  { +	struct ftrace_event_call *call = file->event_call; 
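
The __ftrace_event_enable_disable() hunk here introduces a "soft" enable path: an event's tracepoint can stay registered (FTRACE_EVENT_FL_SOFT_MODE, counted by sm_ref) while its output is suppressed (FTRACE_EVENT_FL_SOFT_DISABLED), so the enable_event/disable_event function probes added later in this patch (event_enable_probe()) only flip a bit instead of registering or unregistering tracepoints. The following is a minimal user-space sketch of that state machine, for illustration only and not part of the patch; the names ev, ev_soft_get/ev_soft_put and ev_trigger are made up stand-ins for the ftrace_event_file flag handling.

/*
 * Toy model of the soft-enable/soft-disable flags -- a sketch only,
 * not kernel code.
 */
#include <stdio.h>

#define FL_ENABLED       (1 << 0)  /* "tracepoint registered" in the model */
#define FL_SOFT_MODE     (1 << 1)  /* at least one soft user (trigger)     */
#define FL_SOFT_DISABLED (1 << 2)  /* registered, but output suppressed    */

struct ev {
	unsigned int flags;
	int sm_ref;                /* number of soft users */
};

/* A trigger takes a soft reference: register once, keep it "disabled". */
static void ev_soft_get(struct ev *e)
{
	if (e->sm_ref++ == 0) {
		e->flags |= FL_SOFT_MODE;
		if (!(e->flags & FL_ENABLED))
			e->flags |= FL_SOFT_DISABLED;
		e->flags |= FL_ENABLED;
	}
}

/*
 * Dropping the last soft reference clears SOFT_MODE; if the event was
 * only ever soft-enabled, it goes back to fully disabled.
 */
static void ev_soft_put(struct ev *e)
{
	if (--e->sm_ref > 0)
		return;
	e->flags &= ~FL_SOFT_MODE;
	if (e->flags & FL_SOFT_DISABLED)
		e->flags &= ~(FL_ENABLED | FL_SOFT_DISABLED);
}

/* The trigger itself only flips SOFT_DISABLED, it never (un)registers. */
static void ev_trigger(struct ev *e, int on)
{
	if (on)
		e->flags &= ~FL_SOFT_DISABLED;
	else
		e->flags |= FL_SOFT_DISABLED;
}

static void show(const struct ev *e, const char *what)
{
	printf("%-12s flags=%#x sm_ref=%d\n", what, e->flags, e->sm_ref);
}

int main(void)
{
	struct ev e = { 0, 0 };

	ev_soft_get(&e);    show(&e, "soft get");
	ev_trigger(&e, 1);  show(&e, "trigger on");
	ev_trigger(&e, 0);  show(&e, "trigger off");
	ev_soft_put(&e);    show(&e, "soft put");
	return 0;
}

In this model, as in the patch's comments below, flipping the trigger on and off never touches registration; only the last ev_soft_put() tears the event back down.
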
 	int ret = 0; +	int disable;  	switch (enable) {  	case 0: -		if (call->flags & TRACE_EVENT_FL_ENABLED) { -			call->flags &= ~TRACE_EVENT_FL_ENABLED; -			if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { +		/* +		 * When soft_disable is set and enable is cleared, the sm_ref +		 * reference counter is decremented. If it reaches 0, we want +		 * to clear the SOFT_DISABLED flag but leave the event in the +		 * state that it was. That is, if the event was enabled and +		 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED +		 * is set we do not want the event to be enabled before we +		 * clear the bit. +		 * +		 * When soft_disable is not set but the SOFT_MODE flag is, +		 * we do nothing. Do not disable the tracepoint, otherwise +		 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. +		 */ +		if (soft_disable) { +			if (atomic_dec_return(&file->sm_ref) > 0) +				break; +			disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; +			clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); +		} else +			disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + +		if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { +			clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); +			if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {  				tracing_stop_cmdline_record(); -				call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; +				clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  			} -			call->class->reg(call, TRACE_REG_UNREGISTER); +			call->class->reg(call, TRACE_REG_UNREGISTER, file);  		} +		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ +		if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) +			set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +		else +			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);  		break;  	case 1: -		if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { +		/* +		 * When soft_disable is set and enable is set, we want to +		 * register the tracepoint for the event, but leave the event +		 * as is. That means, if the event was already enabled, we do +		 * nothing (but set SOFT_MODE). If the event is disabled, we +		 * set SOFT_DISABLED before enabling the event tracepoint, so +		 * it still seems to be disabled. +		 */ +		if (!soft_disable) +			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +		else { +			if (atomic_inc_return(&file->sm_ref) > 1) +				break; +			set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); +		} + +		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + +			/* Keep the event disabled, when going to SOFT_MODE. */ +			if (soft_disable) +				set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +  			if (trace_flags & TRACE_ITER_RECORD_CMD) {  				tracing_start_cmdline_record(); -				call->flags |= TRACE_EVENT_FL_RECORDED_CMD; +				set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);  			} -			ret = call->class->reg(call, TRACE_REG_REGISTER); +			ret = call->class->reg(call, TRACE_REG_REGISTER, file);  			if (ret) {  				tracing_stop_cmdline_record();  				pr_info("event trace: Could not enable event " -					"%s\n", call->name); +					"%s\n", ftrace_event_name(call));  				break;  			} -			call->flags |= TRACE_EVENT_FL_ENABLED; +			set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + +			/* WAS_ENABLED gets set but never cleared. 
*/ +			call->flags |= TRACE_EVENT_FL_WAS_ENABLED;  		}  		break;  	} @@ -227,53 +367,168 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,  	return ret;  } -static void ftrace_clear_events(void) +int trace_event_enable_disable(struct ftrace_event_file *file, +			       int enable, int soft_disable)  { -	struct ftrace_event_call *call; +	return __ftrace_event_enable_disable(file, enable, soft_disable); +} + +static int ftrace_event_enable_disable(struct ftrace_event_file *file, +				       int enable) +{ +	return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ +	struct ftrace_event_file *file;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		ftrace_event_enable_disable(call, 0); +	list_for_each_entry(file, &tr->events, list) { +		ftrace_event_enable_disable(file, 0);  	}  	mutex_unlock(&event_mutex);  } +static void __put_system(struct event_subsystem *system) +{ +	struct event_filter *filter = system->filter; + +	WARN_ON_ONCE(system_refcount(system) == 0); +	if (system_refcount_dec(system)) +		return; + +	list_del(&system->list); + +	if (filter) { +		kfree(filter->filter_string); +		kfree(filter); +	} +	if (system->ref_count & SYSTEM_FL_FREE_NAME) +		kfree(system->name); +	kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ +	WARN_ON_ONCE(system_refcount(system) == 0); +	system_refcount_inc(system); +} + +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ +	WARN_ON_ONCE(dir->ref_count == 0); +	dir->ref_count++; +	__get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ +	WARN_ON_ONCE(dir->ref_count == 0); +	/* If the subsystem is about to be freed, the dir must be too */ +	WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); + +	__put_system(dir->subsystem); +	if (!--dir->ref_count) +		kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir) +{ +	mutex_lock(&event_mutex); +	__put_system_dir(dir); +	mutex_unlock(&event_mutex); +} + +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ +	if (!dir) +		return; + +	if (!--dir->nr_events) { +		debugfs_remove_recursive(dir->entry); +		list_del(&dir->list); +		__put_system_dir(dir); +	} +} + +static void remove_event_file_dir(struct ftrace_event_file *file) +{ +	struct dentry *dir = file->dir; +	struct dentry *child; + +	if (dir) { +		spin_lock(&dir->d_lock);	/* probably unneeded */ +		list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { +			if (child->d_inode)	/* probably unneeded */ +				child->d_inode->i_private = NULL; +		} +		spin_unlock(&dir->d_lock); + +		debugfs_remove_recursive(dir); +	} + +	list_del(&file->list); +	remove_subsystem(file->system); +	free_event_filter(file->filter); +	kmem_cache_free(file_cachep, file); +} +  /*   * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.   
*/ -static int __ftrace_set_clr_event(const char *match, const char *sub, -				  const char *event, int set) +static int +__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, +			      const char *sub, const char *event, int set)  { +	struct ftrace_event_file *file;  	struct ftrace_event_call *call; +	const char *name;  	int ret = -EINVAL; -	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call; +		name = ftrace_event_name(call); -		if (!call->name || !call->class || !call->class->reg) +		if (!name || !call->class || !call->class->reg) +			continue; + +		if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)  			continue;  		if (match && -		    strcmp(match, call->name) != 0 && +		    strcmp(match, name) != 0 &&  		    strcmp(match, call->class->system) != 0)  			continue;  		if (sub && strcmp(sub, call->class->system) != 0)  			continue; -		if (event && strcmp(event, call->name) != 0) +		if (event && strcmp(event, name) != 0)  			continue; -		ftrace_event_enable_disable(call, set); +		ftrace_event_enable_disable(file, set);  		ret = 0;  	} + +	return ret; +} + +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, +				  const char *sub, const char *event, int set) +{ +	int ret; + +	mutex_lock(&event_mutex); +	ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);  	mutex_unlock(&event_mutex);  	return ret;  } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)  {  	char *event = NULL, *sub = NULL, *match; @@ -301,7 +556,7 @@ static int ftrace_set_clr_event(char *buf, int set)  			event = NULL;  	} -	return __ftrace_set_clr_event(match, sub, event, set); +	return __ftrace_set_clr_event(tr, match, sub, event, set);  }  /** @@ -318,8 +573,14 @@ static int ftrace_set_clr_event(char *buf, int set)   */  int trace_set_clr_event(const char *system, const char *event, int set)  { -	return __ftrace_set_clr_event(NULL, system, event, set); +	struct trace_array *tr = top_trace_array(); + +	if (!tr) +		return -ENODEV; + +	return __ftrace_set_clr_event(tr, NULL, system, event, set);  } +EXPORT_SYMBOL_GPL(trace_set_clr_event);  /* 128 should be much more than enough */  #define EVENT_BUF_SIZE		127 @@ -329,6 +590,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  {  	struct trace_parser parser; +	struct seq_file *m = file->private_data; +	struct trace_array *tr = m->private;  	ssize_t read, ret;  	if (!cnt) @@ -351,7 +614,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  		parser.buffer[parser.idx] = 0; -		ret = ftrace_set_clr_event(parser.buffer + !set, set); +		ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);  		if (ret)  			goto out_put;  	} @@ -367,17 +630,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  static void *  t_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct ftrace_event_call *call; +	struct trace_array *tr = m->private;  	(*pos)++; -	list_for_each_entry_continue(call, &ftrace_events, list) { +	list_for_each_entry_continue(file, &tr->events, list) { +		call = file->event_call;  		/*  		 * The ftrace subsystem is for showing formats only.  		 * They can not be enabled or disabled via the event files.  		 
*/  		if (call->class && call->class->reg) -			return call; +			return file;  	}  	return NULL; @@ -385,30 +651,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)  static void *t_start(struct seq_file *m, loff_t *pos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = m->private;  	loff_t l;  	mutex_lock(&event_mutex); -	call = list_entry(&ftrace_events, struct ftrace_event_call, list); +	file = list_entry(&tr->events, struct ftrace_event_file, list);  	for (l = 0; l <= *pos; ) { -		call = t_next(m, call, &l); -		if (!call) +		file = t_next(m, file, &l); +		if (!file)  			break;  	} -	return call; +	return file;  }  static void *  s_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct trace_array *tr = m->private;  	(*pos)++; -	list_for_each_entry_continue(call, &ftrace_events, list) { -		if (call->flags & TRACE_EVENT_FL_ENABLED) -			return call; +	list_for_each_entry_continue(file, &tr->events, list) { +		if (file->flags & FTRACE_EVENT_FL_ENABLED) +			return file;  	}  	return NULL; @@ -416,27 +684,29 @@ s_next(struct seq_file *m, void *v, loff_t *pos)  static void *s_start(struct seq_file *m, loff_t *pos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = m->private;  	loff_t l;  	mutex_lock(&event_mutex); -	call = list_entry(&ftrace_events, struct ftrace_event_call, list); +	file = list_entry(&tr->events, struct ftrace_event_file, list);  	for (l = 0; l <= *pos; ) { -		call = s_next(m, call, &l); -		if (!call) +		file = s_next(m, file, &l); +		if (!file)  			break;  	} -	return call; +	return file;  }  static int t_show(struct seq_file *m, void *v)  { -	struct ftrace_event_call *call = v; +	struct ftrace_event_file *file = v; +	struct ftrace_event_call *call = file->event_call;  	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)  		seq_printf(m, "%s:", call->class->system); -	seq_printf(m, "%s\n", call->name); +	seq_printf(m, "%s\n", ftrace_event_name(call));  	return 0;  } @@ -446,53 +716,46 @@ static void t_stop(struct seq_file *m, void *p)  	mutex_unlock(&event_mutex);  } -static int -ftrace_event_seq_open(struct inode *inode, struct file *file) -{ -	const struct seq_operations *seq_ops; - -	if ((file->f_mode & FMODE_WRITE) && -	    (file->f_flags & O_TRUNC)) -		ftrace_clear_events(); - -	seq_ops = inode->i_private; -	return seq_open(file, seq_ops); -} -  static ssize_t  event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; -	char *buf; +	struct ftrace_event_file *file; +	unsigned long flags; +	char buf[4] = "0"; -	if (call->flags & TRACE_EVENT_FL_ENABLED) -		buf = "1\n"; -	else -		buf = "0\n"; +	mutex_lock(&event_mutex); +	file = event_file_data(filp); +	if (likely(file)) +		flags = file->flags; +	mutex_unlock(&event_mutex); + +	if (!file) +		return -ENODEV; + +	if (flags & FTRACE_EVENT_FL_ENABLED && +	    !(flags & FTRACE_EVENT_FL_SOFT_DISABLED)) +		strcpy(buf, "1"); + +	if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || +	    flags & FTRACE_EVENT_FL_SOFT_MODE) +		strcat(buf, "*"); + +	strcat(buf, "\n"); -	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));  }  static ssize_t  event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; -	char buf[64]; +	
struct ftrace_event_file *file;  	unsigned long val;  	int ret; -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; - -	ret = strict_strtoul(buf, 10, &val); -	if (ret < 0) +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret)  		return ret;  	ret = tracing_update_buffers(); @@ -502,8 +765,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	switch (val) {  	case 0:  	case 1: +		ret = -ENODEV;  		mutex_lock(&event_mutex); -		ret = ftrace_event_enable_disable(call, val); +		file = event_file_data(filp); +		if (likely(file)) +			ret = ftrace_event_enable_disable(file, val);  		mutex_unlock(&event_mutex);  		break; @@ -521,18 +787,22 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		   loff_t *ppos)  {  	const char set_to_char[4] = { '?', '0', '1', 'X' }; -	const char *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	struct ftrace_event_call *call; +	struct ftrace_event_file *file; +	struct trace_array *tr = dir->tr;  	char buf[2];  	int set = 0;  	int ret;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		if (!call->name || !call->class || !call->class->reg) +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call; +		if (!ftrace_event_name(call) || !call->class || !call->class->reg)  			continue; -		if (system && strcmp(call->class->system, system) != 0) +		if (system && strcmp(call->class->system, system->name) != 0)  			continue;  		/* @@ -540,7 +810,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		 * or if all events or cleared, or if we have  		 * a mixture.  		 */ -		set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); +		set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));  		/*  		 * If we have a mixture, no need to look further. @@ -562,21 +832,14 @@ static ssize_t  system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		    loff_t *ppos)  { -	const char *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem; +	const char *name = NULL;  	unsigned long val; -	char buf[64];  	ssize_t ret; -	if (cnt >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, cnt)) -		return -EFAULT; - -	buf[cnt] = 0; - -	ret = strict_strtoul(buf, 10, &val); -	if (ret < 0) +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret)  		return ret;  	ret = tracing_update_buffers(); @@ -586,7 +849,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	if (val != 0 && val != 1)  		return -EINVAL; -	ret = __ftrace_set_clr_event(NULL, system, NULL, val); +	/* +	 * Opening of "enable" adds a ref count to system, +	 * so the name is safe to use. 
+	 */ +	if (system) +		name = system->name; + +	ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);  	if (ret)  		goto out; @@ -606,71 +876,45 @@ enum {  static void *f_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = m->private; -	struct ftrace_event_field *field; +	struct ftrace_event_call *call = event_file_data(m->private);  	struct list_head *common_head = &ftrace_common_fields;  	struct list_head *head = trace_get_fields(call); +	struct list_head *node = v;  	(*pos)++;  	switch ((unsigned long)v) {  	case FORMAT_HEADER: -		if (unlikely(list_empty(common_head))) -			return NULL; - -		field = list_entry(common_head->prev, -				   struct ftrace_event_field, link); -		return field; +		node = common_head; +		break;  	case FORMAT_FIELD_SEPERATOR: -		if (unlikely(list_empty(head))) -			return NULL; - -		field = list_entry(head->prev, struct ftrace_event_field, link); -		return field; +		node = head; +		break;  	case FORMAT_PRINTFMT:  		/* all done */  		return NULL;  	} -	field = v; -	if (field->link.prev == common_head) +	node = node->prev; +	if (node == common_head)  		return (void *)FORMAT_FIELD_SEPERATOR; -	else if (field->link.prev == head) +	else if (node == head)  		return (void *)FORMAT_PRINTFMT; - -	field = list_entry(field->link.prev, struct ftrace_event_field, link); - -	return field; -} - -static void *f_start(struct seq_file *m, loff_t *pos) -{ -	loff_t l = 0; -	void *p; - -	/* Start by showing the header */ -	if (!*pos) -		return (void *)FORMAT_HEADER; - -	p = (void *)FORMAT_HEADER; -	do { -		p = f_next(m, p, &l); -	} while (p && l < *pos); - -	return p; +	else +		return node;  }  static int f_show(struct seq_file *m, void *v)  { -	struct ftrace_event_call *call = m->private; +	struct ftrace_event_call *call = event_file_data(m->private);  	struct ftrace_event_field *field;  	const char *array_descriptor;  	switch ((unsigned long)v) {  	case FORMAT_HEADER: -		seq_printf(m, "name: %s\n", call->name); +		seq_printf(m, "name: %s\n", ftrace_event_name(call));  		seq_printf(m, "ID: %d\n", call->event.type);  		seq_printf(m, "format:\n");  		return 0; @@ -685,8 +929,7 @@ static int f_show(struct seq_file *m, void *v)  		return 0;  	} -	field = v; - +	field = list_entry(v, struct ftrace_event_field, link);  	/*  	 * Smartly shows the array type(except dynamic array).  	 
* Normal: @@ -713,8 +956,25 @@ static int f_show(struct seq_file *m, void *v)  	return 0;  } +static void *f_start(struct seq_file *m, loff_t *pos) +{ +	void *p = (void *)FORMAT_HEADER; +	loff_t l = 0; + +	/* ->stop() is called even if ->start() fails */ +	mutex_lock(&event_mutex); +	if (!event_file_data(m->private)) +		return ERR_PTR(-ENODEV); + +	while (l < *pos && p) +		p = f_next(m, p, &l); + +	return p; +} +  static void f_stop(struct seq_file *m, void *p)  { +	mutex_unlock(&event_mutex);  }  static const struct seq_operations trace_format_seq_ops = { @@ -726,7 +986,6 @@ static const struct seq_operations trace_format_seq_ops = {  static int trace_format_open(struct inode *inode, struct file *file)  { -	struct ftrace_event_call *call = inode->i_private;  	struct seq_file *m;  	int ret; @@ -735,7 +994,7 @@ static int trace_format_open(struct inode *inode, struct file *file)  		return ret;  	m = file->private_data; -	m->private = call; +	m->private = file;  	return 0;  } @@ -743,45 +1002,47 @@ static int trace_format_open(struct inode *inode, struct file *file)  static ssize_t  event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; -	struct trace_seq *s; -	int r; +	int id = (long)event_file_data(filp); +	char buf[32]; +	int len;  	if (*ppos)  		return 0; -	s = kmalloc(sizeof(*s), GFP_KERNEL); -	if (!s) -		return -ENOMEM; +	if (unlikely(!id)) +		return -ENODEV; -	trace_seq_init(s); -	trace_seq_printf(s, "%d\n", call->event.type); +	len = sprintf(buf, "%d\n", id); -	r = simple_read_from_buffer(ubuf, cnt, ppos, -				    s->buffer, s->len); -	kfree(s); -	return r; +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);  }  static ssize_t  event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_file *file;  	struct trace_seq *s; -	int r; +	int r = -ENODEV;  	if (*ppos)  		return 0;  	s = kmalloc(sizeof(*s), GFP_KERNEL); +  	if (!s)  		return -ENOMEM;  	trace_seq_init(s); -	print_event_filter(call, s); -	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); +	mutex_lock(&event_mutex); +	file = event_file_data(filp); +	if (file) +		print_event_filter(file, s); +	mutex_unlock(&event_mutex); + +	if (file) +		r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);  	kfree(s); @@ -792,9 +1053,9 @@ static ssize_t  event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_file *file;  	char *buf; -	int err; +	int err = -ENODEV;  	if (cnt >= PAGE_SIZE)  		return -EINVAL; @@ -809,7 +1070,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  	}  	buf[cnt] = '\0'; -	err = apply_event_filter(call, buf); +	mutex_lock(&event_mutex); +	file = event_file_data(filp); +	if (file) +		err = apply_event_filter(file, buf); +	mutex_unlock(&event_mutex); +  	free_page((unsigned long) buf);  	if (err < 0)  		return err; @@ -819,11 +1085,116 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  	return cnt;  } +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ +	struct event_subsystem *system = NULL; +	struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ +	struct trace_array *tr; +	int ret; + +	if (tracing_is_disabled()) +		return -ENODEV; + +	/* Make sure the system 
still exists */ +	mutex_lock(&trace_types_lock); +	mutex_lock(&event_mutex); +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		list_for_each_entry(dir, &tr->systems, list) { +			if (dir == inode->i_private) { +				/* Don't open systems with no events */ +				if (dir->nr_events) { +					__get_system_dir(dir); +					system = dir->subsystem; +				} +				goto exit_loop; +			} +		} +	} + exit_loop: +	mutex_unlock(&event_mutex); +	mutex_unlock(&trace_types_lock); + +	if (!system) +		return -ENODEV; + +	/* Some versions of gcc think dir can be uninitialized here */ +	WARN_ON(!dir); + +	/* Still need to increment the ref count of the system */ +	if (trace_array_get(tr) < 0) { +		put_system(dir); +		return -ENODEV; +	} + +	ret = tracing_open_generic(inode, filp); +	if (ret < 0) { +		trace_array_put(tr); +		put_system(dir); +	} + +	return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ +	struct ftrace_subsystem_dir *dir; +	struct trace_array *tr = inode->i_private; +	int ret; + +	if (tracing_is_disabled()) +		return -ENODEV; + +	if (trace_array_get(tr) < 0) +		return -ENODEV; + +	/* Make a temporary dir that has no system but points to tr */ +	dir = kzalloc(sizeof(*dir), GFP_KERNEL); +	if (!dir) { +		trace_array_put(tr); +		return -ENOMEM; +	} + +	dir->tr = tr; + +	ret = tracing_open_generic(inode, filp); +	if (ret < 0) { +		trace_array_put(tr); +		kfree(dir); +		return ret; +	} + +	filp->private_data = dir; + +	return 0; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ +	struct ftrace_subsystem_dir *dir = file->private_data; + +	trace_array_put(dir->tr); + +	/* +	 * If dir->subsystem is NULL, then this is a temporary +	 * descriptor that was made for a trace_array to enable +	 * all subsystems. 
+	 */ +	if (dir->subsystem) +		put_system(dir); +	else +		kfree(dir); + +	return 0; +} +  static ssize_t  subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  		      loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data; +	struct event_subsystem *system = dir->subsystem;  	struct trace_seq *s;  	int r; @@ -848,7 +1219,7 @@ static ssize_t  subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		       loff_t *ppos)  { -	struct event_subsystem *system = filp->private_data; +	struct ftrace_subsystem_dir *dir = filp->private_data;  	char *buf;  	int err; @@ -865,7 +1236,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  	}  	buf[cnt] = '\0'; -	err = apply_subsystem_event_filter(system, buf); +	err = apply_subsystem_event_filter(dir, buf);  	free_page((unsigned long) buf);  	if (err < 0)  		return err; @@ -899,6 +1270,10 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)  	return r;  } +static int ftrace_event_avail_open(struct inode *inode, struct file *file); +static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_release(struct inode *inode, struct file *file); +  static const struct seq_operations show_event_seq_ops = {  	.start = t_start,  	.next = t_next, @@ -914,18 +1289,18 @@ static const struct seq_operations show_set_event_seq_ops = {  };  static const struct file_operations ftrace_avail_fops = { -	.open = ftrace_event_seq_open, +	.open = ftrace_event_avail_open,  	.read = seq_read,  	.llseek = seq_lseek,  	.release = seq_release,  };  static const struct file_operations ftrace_set_event_fops = { -	.open = ftrace_event_seq_open, +	.open = ftrace_event_set_open,  	.read = seq_read,  	.write = ftrace_event_write,  	.llseek = seq_lseek, -	.release = seq_release, +	.release = ftrace_event_release,  };  static const struct file_operations ftrace_enable_fops = { @@ -943,7 +1318,6 @@ static const struct file_operations ftrace_event_format_fops = {  };  static const struct file_operations ftrace_event_id_fops = { -	.open = tracing_open_generic,  	.read = event_id_read,  	.llseek = default_llseek,  }; @@ -956,17 +1330,27 @@ static const struct file_operations ftrace_event_filter_fops = {  };  static const struct file_operations ftrace_subsystem_filter_fops = { -	.open = tracing_open_generic, +	.open = subsystem_open,  	.read = subsystem_filter_read,  	.write = subsystem_filter_write,  	.llseek = default_llseek, +	.release = subsystem_release,  };  static const struct file_operations ftrace_system_enable_fops = { -	.open = tracing_open_generic, +	.open = subsystem_open,  	.read = system_enable_read,  	.write = system_enable_write,  	.llseek = default_llseek, +	.release = subsystem_release, +}; + +static const struct file_operations ftrace_tr_enable_fops = { +	.open = system_tr_open, +	.read = system_enable_read, +	.write = system_enable_write, +	.llseek = default_llseek, +	.release = subsystem_release,  };  static const struct file_operations ftrace_show_header_fops = { @@ -975,125 +1359,212 @@ static const struct file_operations ftrace_show_header_fops = {  	.llseek = default_llseek,  }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, +		  const struct seq_operations *seq_ops)  { -	static struct dentry *d_tracer; -	static struct dentry *d_events; +	struct seq_file *m; +	int ret; -	if (d_events) -		
return d_events; +	ret = seq_open(file, seq_ops); +	if (ret < 0) +		return ret; +	m = file->private_data; +	/* copy tr over to seq ops */ +	m->private = inode->i_private; -	d_tracer = tracing_init_dentry(); -	if (!d_tracer) -		return NULL; +	return ret; +} -	d_events = debugfs_create_dir("events", d_tracer); -	if (!d_events) -		pr_warning("Could not create debugfs " -			   "'events' directory\n"); +static int ftrace_event_release(struct inode *inode, struct file *file) +{ +	struct trace_array *tr = inode->i_private; + +	trace_array_put(tr); -	return d_events; +	return seq_release(inode, file);  } -static LIST_HEAD(event_subsystems); +static int +ftrace_event_avail_open(struct inode *inode, struct file *file) +{ +	const struct seq_operations *seq_ops = &show_event_seq_ops; + +	return ftrace_event_open(inode, file, seq_ops); +} + +static int +ftrace_event_set_open(struct inode *inode, struct file *file) +{ +	const struct seq_operations *seq_ops = &show_set_event_seq_ops; +	struct trace_array *tr = inode->i_private; +	int ret; + +	if (trace_array_get(tr) < 0) +		return -ENODEV; + +	if ((file->f_mode & FMODE_WRITE) && +	    (file->f_flags & O_TRUNC)) +		ftrace_clear_events(tr); + +	ret = ftrace_event_open(inode, file, seq_ops); +	if (ret < 0) +		trace_array_put(tr); +	return ret; +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ +	struct event_subsystem *system; + +	/* need to create new entry */ +	system = kmalloc(sizeof(*system), GFP_KERNEL); +	if (!system) +		return NULL; + +	system->ref_count = 1; + +	/* Only allocate if dynamic (kprobes and modules) */ +	if (!core_kernel_data((unsigned long)name)) { +		system->ref_count |= SYSTEM_FL_FREE_NAME; +		system->name = kstrdup(name, GFP_KERNEL); +		if (!system->name) +			goto out_free; +	} else +		system->name = name; + +	system->filter = NULL; + +	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); +	if (!system->filter) +		goto out_free; + +	list_add(&system->list, &event_subsystems); + +	return system; + + out_free: +	if (system->ref_count & SYSTEM_FL_FREE_NAME) +		kfree(system->name); +	kfree(system); +	return NULL; +}  static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, +		    struct ftrace_event_file *file, struct dentry *parent)  { +	struct ftrace_subsystem_dir *dir;  	struct event_subsystem *system;  	struct dentry *entry;  	/* First see if we did not already create this dir */ -	list_for_each_entry(system, &event_subsystems, list) { +	list_for_each_entry(dir, &tr->systems, list) { +		system = dir->subsystem;  		if (strcmp(system->name, name) == 0) { -			system->nr_events++; -			return system->entry; +			dir->nr_events++; +			file->system = dir; +			return dir->entry;  		}  	} -	/* need to create new entry */ -	system = kmalloc(sizeof(*system), GFP_KERNEL); -	if (!system) { -		pr_warning("No memory to create event subsystem %s\n", -			   name); -		return d_events; +	/* Now see if the system itself exists. 
*/ +	list_for_each_entry(system, &event_subsystems, list) { +		if (strcmp(system->name, name) == 0) +			break;  	} +	/* Reset system variable when not found */ +	if (&system->list == &event_subsystems) +		system = NULL; -	system->entry = debugfs_create_dir(name, d_events); -	if (!system->entry) { -		pr_warning("Could not create event subsystem %s\n", -			   name); -		kfree(system); -		return d_events; -	} +	dir = kmalloc(sizeof(*dir), GFP_KERNEL); +	if (!dir) +		goto out_fail; -	system->nr_events = 1; -	system->name = kstrdup(name, GFP_KERNEL); -	if (!system->name) { -		debugfs_remove(system->entry); -		kfree(system); -		return d_events; +	if (!system) { +		system = create_new_subsystem(name); +		if (!system) +			goto out_free; +	} else +		__get_system(system); + +	dir->entry = debugfs_create_dir(name, parent); +	if (!dir->entry) { +		pr_warning("Failed to create system directory %s\n", name); +		__put_system(system); +		goto out_free;  	} -	list_add(&system->list, &event_subsystems); - -	system->filter = NULL; +	dir->tr = tr; +	dir->ref_count = 1; +	dir->nr_events = 1; +	dir->subsystem = system; +	file->system = dir; -	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); -	if (!system->filter) { -		pr_warning("Could not allocate filter for subsystem " -			   "'%s'\n", name); -		return system->entry; -	} - -	entry = debugfs_create_file("filter", 0644, system->entry, system, +	entry = debugfs_create_file("filter", 0644, dir->entry, dir,  				    &ftrace_subsystem_filter_fops);  	if (!entry) {  		kfree(system->filter);  		system->filter = NULL; -		pr_warning("Could not create debugfs " -			   "'%s/filter' entry\n", name); +		pr_warning("Could not create debugfs '%s/filter' entry\n", name);  	} -	trace_create_file("enable", 0644, system->entry, -			  (void *)system->name, +	trace_create_file("enable", 0644, dir->entry, dir,  			  &ftrace_system_enable_fops); -	return system->entry; +	list_add(&dir->list, &tr->systems); + +	return dir->entry; + + out_free: +	kfree(dir); + out_fail: +	/* Only print this message if failed on memory allocation */ +	if (!dir || !system) +		pr_warning("No memory to create event subsystem %s\n", +			   name); +	return NULL;  }  static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, -		 const struct file_operations *id, -		 const struct file_operations *enable, -		 const struct file_operations *filter, -		 const struct file_operations *format) +event_create_dir(struct dentry *parent, struct ftrace_event_file *file)  { +	struct ftrace_event_call *call = file->event_call; +	struct trace_array *tr = file->tr;  	struct list_head *head; +	struct dentry *d_events; +	const char *name;  	int ret;  	/*  	 * If the trace point header did not define TRACE_SYSTEM  	 * then the system would be called "TRACE_SYSTEM".  	 
*/ -	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) -		d_events = event_subsystem_dir(call->class->system, d_events); - -	call->dir = debugfs_create_dir(call->name, d_events); -	if (!call->dir) { -		pr_warning("Could not create debugfs " -			   "'%s' directory\n", call->name); +	if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { +		d_events = event_subsystem_dir(tr, call->class->system, file, parent); +		if (!d_events) +			return -ENOMEM; +	} else +		d_events = parent; + +	name = ftrace_event_name(call); +	file->dir = debugfs_create_dir(name, d_events); +	if (!file->dir) { +		pr_warning("Could not create debugfs '%s' directory\n", +			   name);  		return -1;  	} -	if (call->class->reg) -		trace_create_file("enable", 0644, call->dir, call, -				  enable); +	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) +		trace_create_file("enable", 0644, file->dir, file, +				  &ftrace_enable_fops);  #ifdef CONFIG_PERF_EVENTS  	if (call->event.type && call->class->reg) -		trace_create_file("id", 0444, call->dir, call, -		 		  id); +		trace_create_file("id", 0444, file->dir, +				  (void *)(long)call->event.type, +				  &ftrace_event_id_fops);  #endif  	/* @@ -1105,229 +1576,286 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,  		ret = call->class->define_fields(call);  		if (ret < 0) {  			pr_warning("Could not initialize trace point" -				   " events/%s\n", call->name); -			return ret; +				   " events/%s\n", name); +			return -1;  		}  	} -	trace_create_file("filter", 0644, call->dir, call, -			  filter); +	trace_create_file("filter", 0644, file->dir, file, +			  &ftrace_event_filter_fops); + +	trace_create_file("trigger", 0644, file->dir, file, +			  &event_trigger_fops); -	trace_create_file("format", 0444, call->dir, call, -			  format); +	trace_create_file("format", 0444, file->dir, call, +			  &ftrace_event_format_fops);  	return 0;  } -static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, -		       const struct file_operations *id, -		       const struct file_operations *enable, -		       const struct file_operations *filter, -		       const struct file_operations *format) +static void remove_event_from_tracers(struct ftrace_event_call *call)  { -	struct dentry *d_events; -	int ret; +	struct ftrace_event_file *file; +	struct trace_array *tr; + +	do_for_each_event_file_safe(tr, file) { +		if (file->event_call != call) +			continue; + +		remove_event_file_dir(file); +		/* +		 * The do_for_each_event_file_safe() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. +		 */ +		break; +	} while_for_each_event_file(); +} + +static void event_remove(struct ftrace_event_call *call) +{ +	struct trace_array *tr; +	struct ftrace_event_file *file; -	/* The linker may leave blanks */ -	if (!call->name) +	do_for_each_event_file(tr, file) { +		if (file->event_call != call) +			continue; +		ftrace_event_enable_disable(file, 0); +		destroy_preds(file); +		/* +		 * The do_for_each_event_file() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. 
+		 */ +		break; +	} while_for_each_event_file(); + +	if (call->event.funcs) +		__unregister_ftrace_event(&call->event); +	remove_event_from_tracers(call); +	list_del(&call->list); +} + +static int event_init(struct ftrace_event_call *call) +{ +	int ret = 0; +	const char *name; + +	name = ftrace_event_name(call); +	if (WARN_ON(!name))  		return -EINVAL;  	if (call->class->raw_init) {  		ret = call->class->raw_init(call); -		if (ret < 0) { -			if (ret != -ENOSYS) -				pr_warning("Could not initialize trace events/%s\n", -					   call->name); -			return ret; -		} +		if (ret < 0 && ret != -ENOSYS) +			pr_warn("Could not initialize trace events/%s\n", +				name);  	} -	d_events = event_trace_events_dir(); -	if (!d_events) -		return -ENOENT; +	return ret; +} + +static int +__register_event(struct ftrace_event_call *call, struct module *mod) +{ +	int ret; + +	ret = event_init(call); +	if (ret < 0) +		return ret; -	ret = event_create_dir(call, d_events, id, enable, filter, format); -	if (!ret) -		list_add(&call->list, &ftrace_events); +	list_add(&call->list, &ftrace_events);  	call->mod = mod; -	return ret; +	return 0;  } -/* Add an additional event_call dynamically */ -int trace_add_event_call(struct ftrace_event_call *call) +static struct ftrace_event_file * +trace_create_new_event(struct ftrace_event_call *call, +		       struct trace_array *tr)  { -	int ret; -	mutex_lock(&event_mutex); -	ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, -				     &ftrace_enable_fops, -				     &ftrace_event_filter_fops, -				     &ftrace_event_format_fops); -	mutex_unlock(&event_mutex); -	return ret; +	struct ftrace_event_file *file; + +	file = kmem_cache_alloc(file_cachep, GFP_TRACE); +	if (!file) +		return NULL; + +	file->event_call = call; +	file->tr = tr; +	atomic_set(&file->sm_ref, 0); +	atomic_set(&file->tm_ref, 0); +	INIT_LIST_HEAD(&file->triggers); +	list_add(&file->list, &tr->events); + +	return file;  } -static void remove_subsystem_dir(const char *name) +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)  { -	struct event_subsystem *system; +	struct ftrace_event_file *file; -	if (strcmp(name, TRACE_SYSTEM) == 0) -		return; +	file = trace_create_new_event(call, tr); +	if (!file) +		return -ENOMEM; -	list_for_each_entry(system, &event_subsystems, list) { -		if (strcmp(system->name, name) == 0) { -			if (!--system->nr_events) { -				struct event_filter *filter = system->filter; - -				debugfs_remove_recursive(system->entry); -				list_del(&system->list); -				if (filter) { -					kfree(filter->filter_string); -					kfree(filter); -				} -				kfree(system->name); -				kfree(system); -			} -			break; -		} -	} +	return event_create_dir(tr->event_dir, file);  }  /* - * Must be called under locking both of event_mutex and trace_event_mutex. + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized.   
*/ -static void __trace_remove_event_call(struct ftrace_event_call *call) +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, +			    struct trace_array *tr)  { -	ftrace_event_enable_disable(call, 0); -	if (call->event.funcs) -		__unregister_ftrace_event(&call->event); -	debugfs_remove_recursive(call->dir); -	list_del(&call->list); -	trace_destroy_fields(call); -	destroy_preds(call); -	remove_subsystem_dir(call->class->system); +	struct ftrace_event_file *file; + +	file = trace_create_new_event(call, tr); +	if (!file) +		return -ENOMEM; + +	return 0;  } -/* Remove an event_call */ -void trace_remove_event_call(struct ftrace_event_call *call) +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call); + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call)  { +	int ret; +	mutex_lock(&trace_types_lock);  	mutex_lock(&event_mutex); -	down_write(&trace_event_mutex); -	__trace_remove_event_call(call); -	up_write(&trace_event_mutex); -	mutex_unlock(&event_mutex); -} - -#define for_each_event(event, start, end)			\ -	for (event = start;					\ -	     (unsigned long)event < (unsigned long)end;		\ -	     event++) -#ifdef CONFIG_MODULES +	ret = __register_event(call, NULL); +	if (ret >= 0) +		__add_event_to_tracers(call); -static LIST_HEAD(ftrace_module_file_list); +	mutex_unlock(&event_mutex); +	mutex_unlock(&trace_types_lock); +	return ret; +}  /* - * Modules must own their file_operations to keep up with - * reference counting. + * Must be called under locking of trace_types_lock, event_mutex and + * trace_event_sem.   */ -struct ftrace_module_file_ops { -	struct list_head		list; -	struct module			*mod; -	struct file_operations		id; -	struct file_operations		enable; -	struct file_operations		format; -	struct file_operations		filter; -}; - -static struct ftrace_module_file_ops * -trace_create_file_ops(struct module *mod) +static void __trace_remove_event_call(struct ftrace_event_call *call)  { -	struct ftrace_module_file_ops *file_ops; +	event_remove(call); +	trace_destroy_fields(call); +	destroy_call_preds(call); +} -	/* -	 * This is a bit of a PITA. To allow for correct reference -	 * counting, modules must "own" their file_operations. -	 * To do this, we allocate the file operations that will be -	 * used in the event directory. -	 */ +static int probe_remove_event_call(struct ftrace_event_call *call) +{ +	struct trace_array *tr; +	struct ftrace_event_file *file; -	file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); -	if (!file_ops) -		return NULL; +#ifdef CONFIG_PERF_EVENTS +	if (call->perf_refcount) +		return -EBUSY; +#endif +	do_for_each_event_file(tr, file) { +		if (file->event_call != call) +			continue; +		/* +		 * We can't rely on ftrace_event_enable_disable(enable => 0) +		 * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress +		 * TRACE_REG_UNREGISTER. +		 */ +		if (file->flags & FTRACE_EVENT_FL_ENABLED) +			return -EBUSY; +		/* +		 * The do_for_each_event_file_safe() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. 
+		 */ +		break; +	} while_for_each_event_file(); -	file_ops->mod = mod; +	__trace_remove_event_call(call); -	file_ops->id = ftrace_event_id_fops; -	file_ops->id.owner = mod; +	return 0; +} -	file_ops->enable = ftrace_enable_fops; -	file_ops->enable.owner = mod; +/* Remove an event_call */ +int trace_remove_event_call(struct ftrace_event_call *call) +{ +	int ret; -	file_ops->filter = ftrace_event_filter_fops; -	file_ops->filter.owner = mod; +	mutex_lock(&trace_types_lock); +	mutex_lock(&event_mutex); +	down_write(&trace_event_sem); +	ret = probe_remove_event_call(call); +	up_write(&trace_event_sem); +	mutex_unlock(&event_mutex); +	mutex_unlock(&trace_types_lock); -	file_ops->format = ftrace_event_format_fops; -	file_ops->format.owner = mod; +	return ret; +} -	list_add(&file_ops->list, &ftrace_module_file_list); +#define for_each_event(event, start, end)			\ +	for (event = start;					\ +	     (unsigned long)event < (unsigned long)end;		\ +	     event++) -	return file_ops; -} +#ifdef CONFIG_MODULES  static void trace_module_add_events(struct module *mod)  { -	struct ftrace_module_file_ops *file_ops = NULL; -	struct ftrace_event_call *call, *start, *end; +	struct ftrace_event_call **call, **start, **end; -	start = mod->trace_events; -	end = mod->trace_events + mod->num_trace_events; - -	if (start == end) +	if (!mod->num_trace_events)  		return; -	file_ops = trace_create_file_ops(mod); -	if (!file_ops) +	/* Don't add infrastructure for mods without tracepoints */ +	if (trace_module_has_bad_taint(mod)) { +		pr_err("%s: module has bad taint, not creating trace events\n", +		       mod->name);  		return; +	} + +	start = mod->trace_events; +	end = mod->trace_events + mod->num_trace_events;  	for_each_event(call, start, end) { -		__trace_add_event_call(call, mod, -				       &file_ops->id, &file_ops->enable, -				       &file_ops->filter, &file_ops->format); +		__register_event(*call, mod); +		__add_event_to_tracers(*call);  	}  }  static void trace_module_remove_events(struct module *mod)  { -	struct ftrace_module_file_ops *file_ops;  	struct ftrace_event_call *call, *p; -	bool found = false; +	bool clear_trace = false; -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	list_for_each_entry_safe(call, p, &ftrace_events, list) {  		if (call->mod == mod) { -			found = true; +			if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) +				clear_trace = true;  			__trace_remove_event_call(call);  		}  	} - -	/* Now free the file_operations */ -	list_for_each_entry(file_ops, &ftrace_module_file_list, list) { -		if (file_ops->mod == mod) -			break; -	} -	if (&file_ops->list != &ftrace_module_file_list) { -		list_del(&file_ops->list); -		kfree(file_ops); -	} +	up_write(&trace_event_sem);  	/*  	 * It is safest to reset the ring buffer if the module being unloaded -	 * registered any events. +	 * registered any events that were used. The only worry is if +	 * a new module gets loaded, and takes on the same id as the events +	 * of this module. When printing out the buffer, traced events left +	 * over from this module may be passed to the new module events and +	 * unexpected results may occur.  	 
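
The comment above gives the reason the ring buffer is cleared when an unloaded module had events that were actually used: a record only carries an event id, so if a later module is assigned the same id with a different format, stale records decode as garbage. A toy illustration of that hazard, with a hypothetical id-to-format table:

/*
 * Toy illustration (not kernel code) of the stale-id hazard described
 * above: a raw record stores only an event id, so if the id is reused
 * by a later module with a different format, old records mis-decode.
 */
#include <stdio.h>

struct record { int id; long payload; };

static const char *format[8];	/* hypothetical id -> format table */

int main(void)
{
	format[3] = "old_module_event: count=%ld\n";
	struct record stale = { .id = 3, .payload = 42 };	/* left in the buffer */

	/* The module unloads; a new one registers and happens to get id 3. */
	format[3] = "new_module_event: ptr=0x%lx\n";

	/* Decoding the stale record now uses the wrong format string. */
	printf(format[stale.id], stale.payload);
	return 0;
}
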
*/ -	if (found) -		tracing_reset_current_online_cpus(); -	up_write(&trace_event_mutex); +	if (clear_trace) +		tracing_reset_all_online_cpus();  }  static int trace_module_notify(struct notifier_block *self, @@ -1335,6 +1863,7 @@ static int trace_module_notify(struct notifier_block *self,  {  	struct module *mod = data; +	mutex_lock(&trace_types_lock);  	mutex_lock(&event_mutex);  	switch (val) {  	case MODULE_STATE_COMING: @@ -1345,68 +1874,421 @@ static int trace_module_notify(struct notifier_block *self,  		break;  	}  	mutex_unlock(&event_mutex); +	mutex_unlock(&trace_types_lock);  	return 0;  } -#else -static int trace_module_notify(struct notifier_block *self, -			       unsigned long val, void *data) -{ -	return 0; -} -#endif /* CONFIG_MODULES */  static struct notifier_block trace_module_nb = {  	.notifier_call = trace_module_notify,  	.priority = 0,  }; +#endif /* CONFIG_MODULES */ + +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_call *call; +	int ret; + +	list_for_each_entry(call, &ftrace_events, list) { +		ret = __trace_add_new_event(call, tr); +		if (ret < 0) +			pr_warning("Could not create directory for event %s\n", +				   ftrace_event_name(call)); +	} +} + +struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system,  const char *event) +{ +	struct ftrace_event_file *file; +	struct ftrace_event_call *call; +	const char *name; + +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call; +		name = ftrace_event_name(call); + +		if (!name || !call->class || !call->class->reg) +			continue; + +		if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) +			continue; + +		if (strcmp(event, name) == 0 && +		    strcmp(system, call->class->system) == 0) +			return file; +	} +	return NULL; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR	"enable_event" +#define DISABLE_EVENT_STR	"disable_event" + +struct event_probe_data { +	struct ftrace_event_file	*file; +	unsigned long			count; +	int				ref; +	bool				enable; +}; + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (!data) +		return; + +	if (data->enable) +		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +	else +		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (!data) +		return; + +	if (!data->count) +		return; + +	/* Skip if the event is in a state we want to switch to */ +	if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) +		return; + +	if (data->count != -1) +		(data->count)--; + +	event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, +		      struct ftrace_probe_ops *ops, void *_data) +{ +	struct event_probe_data *data = _data; -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; +	seq_printf(m, "%ps:", (void *)ip); + +	seq_printf(m, "%s:%s:%s", +		   data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, +		   data->file->event_call->class->system, +		   ftrace_event_name(data->file->event_call)); + +	if (data->count == -1) +		seq_printf(m, ":unlimited\n"); +	else +		seq_printf(m, ":count=%ld\n", data->count); + +	return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, +		  void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	data->ref++; +	return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, +		  void **_data) +{ +	struct event_probe_data **pdata = (struct event_probe_data **)_data; +	struct event_probe_data *data = *pdata; + +	if (WARN_ON_ONCE(data->ref <= 0)) +		return; + +	data->ref--; +	if (!data->ref) { +		/* Remove the SOFT_MODE flag */ +		__ftrace_event_enable_disable(data->file, 0, 1); +		module_put(data->file->event_call->mod); +		kfree(data); +	} +	*pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { +	.func			= event_enable_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { +	.func			= event_enable_count_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { +	.func			= event_enable_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { +	.func			= event_enable_count_probe, +	.print			= event_enable_print, +	.init			= event_enable_init, +	.free			= event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, +		  char *glob, char *cmd, char *param, int enabled) +{ +	struct trace_array *tr = top_trace_array(); +	struct ftrace_event_file *file; +	struct ftrace_probe_ops *ops; +	struct event_probe_data *data; +	const char *system; +	const char *event; +	char *number; +	bool enable; +	int ret; + +	if (!tr) +		return -ENODEV; + +	/* hash funcs only work with set_ftrace_filter */ +	if (!enabled || !param) +		return -EINVAL; + +	system = strsep(¶m, ":"); +	if (!param) +		return -EINVAL; + +	event = strsep(¶m, ":"); + +	mutex_lock(&event_mutex); + +	ret = -EINVAL; +	file = find_event_file(tr, system, event); +	if (!file) +		goto out; + +	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + +	if (enable) +		ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; +	else +		ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; + +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		ret = 0; +		goto out; +	} + +	ret = -ENOMEM; +	data = kzalloc(sizeof(*data), GFP_KERNEL); +	if (!data) +		goto out; + +	data->enable = enable; +	data->count = -1; +	data->file = file; + +	if (!param) +		goto out_reg; + +	number = strsep(¶m, ":"); + +	ret = -EINVAL; +	if (!strlen(number)) +		goto out_free; + +	/* +	 * We use the callback data field (which is a pointer) +	 * as our counter. 
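
event_enable_func() above accepts its parameter as <system>:<event>[:<count>], splitting it with strsep() and treating a missing count as unlimited (-1). A small userspace sketch of the same parsing, with strtol() standing in for kstrtoul():

#define _DEFAULT_SOURCE			/* for strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char buf[] = "sched:sched_switch:5";	/* e.g. payload of an enable_event command */
	char *param = buf;
	char *system, *event, *number;
	long count = -1;			/* -1 is treated as "unlimited" above */

	system = strsep(&param, ":");
	if (!param)				/* an event name is required */
		return 1;
	event = strsep(&param, ":");
	number = param ? strsep(&param, ":") : NULL;
	if (number && *number)
		count = strtol(number, NULL, 0);

	printf("system=%s event=%s count=%ld\n", system, event, count);
	return 0;
}
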
+	 */ +	ret = kstrtoul(number, 0, &data->count); +	if (ret) +		goto out_free; + + out_reg: +	/* Don't let event modules unload while probe registered */ +	ret = try_module_get(file->event_call->mod); +	if (!ret) { +		ret = -EBUSY; +		goto out_free; +	} + +	ret = __ftrace_event_enable_disable(file, 1, 1); +	if (ret < 0) +		goto out_put; +	ret = register_ftrace_function_probe(glob, ops, data); +	/* +	 * The above returns on success the # of functions enabled, +	 * but if it didn't find any functions it returns zero. +	 * Consider no functions a failure too. +	 */ +	if (!ret) { +		ret = -ENOENT; +		goto out_disable; +	} else if (ret < 0) +		goto out_disable; +	/* Just return zero, not the number of enabled functions */ +	ret = 0; + out: +	mutex_unlock(&event_mutex); +	return ret; + + out_disable: +	__ftrace_event_enable_disable(file, 0, 1); + out_put: +	module_put(file->event_call->mod); + out_free: +	kfree(data); +	goto out; +} + +static struct ftrace_func_command event_enable_cmd = { +	.name			= ENABLE_EVENT_STR, +	.func			= event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { +	.name			= DISABLE_EVENT_STR, +	.func			= event_enable_func, +}; + +static __init int register_event_cmds(void) +{ +	int ret; + +	ret = register_ftrace_command(&event_enable_cmd); +	if (WARN_ON(ret < 0)) +		return ret; +	ret = register_ftrace_command(&event_disable_cmd); +	if (WARN_ON(ret < 0)) +		unregister_ftrace_command(&event_enable_cmd); +	return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the debugfs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_file *file; +	int ret; + + +	list_for_each_entry(file, &tr->events, list) { +		ret = event_create_dir(tr->event_dir, file); +		if (ret < 0) +			pr_warning("Could not create directory for event %s\n", +				   ftrace_event_name(file->event_call)); +	} +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ +	struct ftrace_event_call *call; +	int ret; + +	list_for_each_entry(call, &ftrace_events, list) { +		/* Early boot up should not have any modules loaded */ +		if (WARN_ON_ONCE(call->mod)) +			continue; + +		ret = __trace_early_add_new_event(call, tr); +		if (ret < 0) +			pr_warning("Could not create early event %s\n", +				   ftrace_event_name(call)); +	} +} + +/* Remove the event directory structure for a trace directory. 
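
The registration path above follows the usual goto-unwind style: each acquired resource (module reference, soft-enable, function probe) has a label that releases it, and error paths jump to the deepest label that undoes everything taken so far. A generic userspace sketch of the pattern, with placeholder resources:

/*
 * Generic sketch of the goto-unwind error handling used above.
 * acquire_a()/acquire_b() are placeholders, not kernel functions.
 */
#include <stdio.h>
#include <stdlib.h>

static void *acquire_a(void) { return malloc(16); }
static void *acquire_b(void) { return NULL; }	/* force a failure */

static int do_work(void)
{
	void *a, *b;
	int ret;

	a = acquire_a();
	if (!a)
		return -1;

	b = acquire_b();
	if (!b) {
		ret = -1;
		goto out_free_a;	/* undo only what was taken so far */
	}

	/* ... use a and b ... */
	ret = 0;
	free(b);
 out_free_a:
	free(a);
	return ret;
}

int main(void)
{
	printf("do_work() = %d\n", do_work());
	return 0;
}
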
*/ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ +	struct ftrace_event_file *file, *next; + +	list_for_each_entry_safe(file, next, &tr->events, list) +		remove_event_file_dir(file); +} + +static void __add_event_to_tracers(struct ftrace_event_call *call) +{ +	struct trace_array *tr; + +	list_for_each_entry(tr, &ftrace_trace_arrays, list) +		__trace_add_new_event(call, tr); +} + +extern struct ftrace_event_call *__start_ftrace_events[]; +extern struct ftrace_event_call *__stop_ftrace_events[];  static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;  static __init int setup_trace_event(char *str)  {  	strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); -	ring_buffer_expanded = 1; -	tracing_selftest_disabled = 1; +	ring_buffer_expanded = true; +	tracing_selftest_disabled = true;  	return 1;  }  __setup("trace_event=", setup_trace_event); -static __init int event_trace_init(void) +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)  { -	struct ftrace_event_call *call; -	struct dentry *d_tracer; -	struct dentry *entry;  	struct dentry *d_events; -	int ret; -	char *buf = bootup_event_buf; -	char *token; - -	d_tracer = tracing_init_dentry(); -	if (!d_tracer) -		return 0; - -	entry = debugfs_create_file("available_events", 0444, d_tracer, -				    (void *)&show_event_seq_ops, -				    &ftrace_avail_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'available_events' entry\n"); +	struct dentry *entry; -	entry = debugfs_create_file("set_event", 0644, d_tracer, -				    (void *)&show_set_event_seq_ops, -				    &ftrace_set_event_fops); -	if (!entry) -		pr_warning("Could not create debugfs " -			   "'set_event' entry\n"); +	entry = debugfs_create_file("set_event", 0644, parent, +				    tr, &ftrace_set_event_fops); +	if (!entry) { +		pr_warning("Could not create debugfs 'set_event' entry\n"); +		return -ENOMEM; +	} -	d_events = event_trace_events_dir(); -	if (!d_events) -		return 0; +	d_events = debugfs_create_dir("events", parent); +	if (!d_events) { +		pr_warning("Could not create debugfs 'events' directory\n"); +		return -ENOMEM; +	}  	/* ring buffer internal formats */  	trace_create_file("header_page", 0444, d_events, @@ -1418,18 +2300,128 @@ static __init int event_trace_init(void)  			  &ftrace_show_header_fops);  	trace_create_file("enable", 0644, d_events, -			  NULL, &ftrace_system_enable_fops); +			  tr, &ftrace_tr_enable_fops); -	if (trace_define_common_fields()) -		pr_warning("tracing: Failed to allocate common fields"); +	tr->event_dir = d_events; + +	return 0; +} -	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { -		__trace_add_event_call(call, NULL, &ftrace_event_id_fops, -				       &ftrace_enable_fops, -				       &ftrace_event_filter_fops, -				       &ftrace_event_format_fops); +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. 
+ */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ +	int ret; + +	mutex_lock(&event_mutex); + +	ret = create_event_toplevel_files(parent, tr); +	if (ret) +		goto out_unlock; + +	down_write(&trace_event_sem); +	__trace_add_event_dirs(tr); +	up_write(&trace_event_sem); + + out_unlock: +	mutex_unlock(&event_mutex); + +	return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. + */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ +	int ret; + +	mutex_lock(&event_mutex); + +	ret = create_event_toplevel_files(parent, tr); +	if (ret) +		goto out_unlock; + +	down_write(&trace_event_sem); +	__trace_early_add_event_dirs(tr); +	up_write(&trace_event_sem); + + out_unlock: +	mutex_unlock(&event_mutex); + +	return ret; +} + +int event_trace_del_tracer(struct trace_array *tr) +{ +	mutex_lock(&event_mutex); + +	/* Disable any event triggers and associated soft-disabled events */ +	clear_event_triggers(tr); + +	/* Disable any running events */ +	__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + +	/* Access to events are within rcu_read_lock_sched() */ +	synchronize_sched(); + +	down_write(&trace_event_sem); +	__trace_remove_event_dirs(tr); +	debugfs_remove_recursive(tr->event_dir); +	up_write(&trace_event_sem); + +	tr->event_dir = NULL; + +	mutex_unlock(&event_mutex); + +	return 0; +} + +static __init int event_trace_memsetup(void) +{ +	field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); +	file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); +	return 0; +} + +static __init int event_trace_enable(void) +{ +	struct trace_array *tr = top_trace_array(); +	struct ftrace_event_call **iter, *call; +	char *buf = bootup_event_buf; +	char *token; +	int ret; + +	if (!tr) +		return -ENODEV; + +	for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { + +		call = *iter; +		ret = event_init(call); +		if (!ret) +			list_add(&call->list, &ftrace_events);  	} +	/* +	 * We need the top trace array to have a working set of trace +	 * points at early init, before the debug files and directories +	 * are created. Create the file entries now, and attach them +	 * to the actual file dentries later. 
+	 */ +	__trace_early_add_events(tr); +  	while (true) {  		token = strsep(&buf, ","); @@ -1438,17 +2430,57 @@ static __init int event_trace_init(void)  		if (!*token)  			continue; -		ret = ftrace_set_clr_event(token, 1); +		ret = ftrace_set_clr_event(tr, token, 1);  		if (ret) -			pr_warning("Failed to enable trace event: %s\n", token); +			pr_warn("Failed to enable trace event: %s\n", token);  	} +	trace_printk_start_comm(); + +	register_event_cmds(); + +	register_trigger_cmds(); + +	return 0; +} + +static __init int event_trace_init(void) +{ +	struct trace_array *tr; +	struct dentry *d_tracer; +	struct dentry *entry; +	int ret; + +	tr = top_trace_array(); +	if (!tr) +		return -ENODEV; + +	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0; + +	entry = debugfs_create_file("available_events", 0444, d_tracer, +				    tr, &ftrace_avail_fops); +	if (!entry) +		pr_warning("Could not create debugfs " +			   "'available_events' entry\n"); + +	if (trace_define_common_fields()) +		pr_warning("tracing: Failed to allocate common fields"); + +	ret = early_event_add_tracer(d_tracer, tr); +	if (ret) +		return ret; + +#ifdef CONFIG_MODULES  	ret = register_module_notifier(&trace_module_nb);  	if (ret)  		pr_warning("Failed to register trace events module notifier\n"); - +#endif  	return 0;  } +early_initcall(event_trace_memsetup); +core_initcall(event_trace_enable);  fs_initcall(event_trace_init);  #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1507,13 +2539,22 @@ static __init void event_test_stuff(void)   */  static __init void event_trace_self_tests(void)  { +	struct ftrace_subsystem_dir *dir; +	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	struct event_subsystem *system; +	struct trace_array *tr;  	int ret; +	tr = top_trace_array(); +	if (!tr) +		return; +  	pr_info("Running tests on trace events:\n"); -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { + +		call = file->event_call;  		/* Only test those that have a probe */  		if (!call->class || !call->class->probe) @@ -1531,21 +2572,21 @@ static __init void event_trace_self_tests(void)  			continue;  #endif -		pr_info("Testing event %s: ", call->name); +		pr_info("Testing event %s: ", ftrace_event_name(call));  		/*  		 * If an event is already enabled, someone is using  		 * it and the self test should not be on.  		 
*/ -		if (call->flags & TRACE_EVENT_FL_ENABLED) { +		if (file->flags & FTRACE_EVENT_FL_ENABLED) {  			pr_warning("Enabled event during self test!\n");  			WARN_ON_ONCE(1);  			continue;  		} -		ftrace_event_enable_disable(call, 1); +		ftrace_event_enable_disable(file, 1);  		event_test_stuff(); -		ftrace_event_enable_disable(call, 0); +		ftrace_event_enable_disable(file, 0);  		pr_cont("OK\n");  	} @@ -1554,7 +2595,9 @@ static __init void event_trace_self_tests(void)  	pr_info("Running tests on trace event systems:\n"); -	list_for_each_entry(system, &event_subsystems, list) { +	list_for_each_entry(dir, &tr->systems, list) { + +		system = dir->subsystem;  		/* the ftrace system is special, skip it */  		if (strcmp(system->name, "ftrace") == 0) @@ -1562,7 +2605,7 @@ static __init void event_trace_self_tests(void)  		pr_info("Testing event system %s: ", system->name); -		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); +		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);  		if (WARN_ON_ONCE(ret)) {  			pr_warning("error enabling system %s\n",  				   system->name); @@ -1571,10 +2614,12 @@ static __init void event_trace_self_tests(void)  		event_test_stuff(); -		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); -		if (WARN_ON_ONCE(ret)) +		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); +		if (WARN_ON_ONCE(ret)) {  			pr_warning("error disabling system %s\n",  				   system->name); +			continue; +		}  		pr_cont("OK\n");  	} @@ -1584,7 +2629,7 @@ static __init void event_trace_self_tests(void)  	pr_info("Running tests on all trace events:\n");  	pr_info("Testing all events: "); -	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); +	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);  	if (WARN_ON_ONCE(ret)) {  		pr_warning("error enabling all events\n");  		return; @@ -1593,7 +2638,7 @@ static __init void event_trace_self_tests(void)  	event_test_stuff();  	/* reset sysname */ -	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); +	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);  	if (WARN_ON_ONCE(ret)) {  		pr_warning("error disabling all events\n");  		return; @@ -1607,7 +2652,8 @@ static __init void event_trace_self_tests(void)  static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);  static void -function_test_events_call(unsigned long ip, unsigned long parent_ip) +function_test_events_call(unsigned long ip, unsigned long parent_ip, +			  struct ftrace_ops *op, struct pt_regs *pt_regs)  {  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; @@ -1636,7 +2682,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)  	entry->ip			= ip;  	entry->parent_ip		= parent_ip; -	trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); +	trace_buffer_unlock_commit(buffer, event, flags, pc);   out:  	atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); @@ -1646,11 +2692,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)  static struct ftrace_ops trace_ops __initdata  =  {  	.func = function_test_events_call, +	.flags = FTRACE_OPS_FL_RECURSION_SAFE,  };  static __init void event_trace_self_test_with_function(void)  { -	register_ftrace_function(&trace_ops); +	int ret; +	ret = register_ftrace_function(&trace_ops); +	if (WARN_ON(ret < 0)) { +		pr_info("Failed to enable function tracer for event tests\n"); +		return; +	}  	pr_info("Running tests again, along with the function tracer\n");  	event_trace_self_tests();  	unregister_ftrace_function(&trace_ops); diff --git 
a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17..8a8631926a0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -27,6 +27,12 @@  #include "trace.h"  #include "trace_output.h" +#define DEFAULT_SYS_FILTER_MESSAGE					\ +	"### global filter ###\n"					\ +	"# Use this to set filters for multiple events.\n"		\ +	"# Only events with the given fields will be affected.\n"	\ +	"# If no events are modified, an error message will be displayed here" +  enum filter_op_ids  {  	OP_OR, @@ -38,6 +44,7 @@ enum filter_op_ids  	OP_LE,  	OP_GT,  	OP_GE, +	OP_BAND,  	OP_NONE,  	OP_OPEN_PAREN,  }; @@ -48,6 +55,7 @@ struct filter_op {  	int precedence;  }; +/* Order must be the same as enum filter_op_ids above */  static struct filter_op filter_ops[] = {  	{ OP_OR,	"||",		1 },  	{ OP_AND,	"&&",		2 }, @@ -58,6 +66,7 @@ static struct filter_op filter_ops[] = {  	{ OP_LE,	"<=",		5 },  	{ OP_GT,	">",		5 },  	{ OP_GE,	">=",		5 }, +	{ OP_BAND,	"&",		6 },  	{ OP_NONE,	"OP_NONE",	0 },  	{ OP_OPEN_PAREN, "(",		0 },  }; @@ -75,6 +84,7 @@ enum {  	FILT_ERR_TOO_MANY_PREDS,  	FILT_ERR_MISSING_FIELD,  	FILT_ERR_INVALID_FILTER, +	FILT_ERR_IP_FIELD_ONLY,  };  static char *err_text[] = { @@ -90,6 +100,7 @@ static char *err_text[] = {  	"Too many terms in predicate expression",  	"Missing field name and/or value",  	"Meaningless filter expression", +	"Only 'ip' field is supported for function trace",  };  struct opstack_op { @@ -123,9 +134,13 @@ struct filter_parse_state {  	} operand;  }; +struct pred_stack { +	struct filter_pred	**preds; +	int			index; +}; +  #define DEFINE_COMPARISON_PRED(type)					\ -static int filter_pred_##type(struct filter_pred *pred, void *event,	\ -			      int val1, int val2)			\ +static int filter_pred_##type(struct filter_pred *pred, void *event)	\  {									\  	type *addr = (type *)(event + pred->offset);			\  	type val = (type)pred->val;					\ @@ -144,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event,	\  	case OP_GE:							\  		match = (*addr >= val);					\  		break;							\ +	case OP_BAND:							\ +		match = (*addr & val);					\ +		break;							\  	default:							\  		break;							\  	}								\ @@ -152,8 +170,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event,	\  }  #define DEFINE_EQUALITY_PRED(size)					\ -static int filter_pred_##size(struct filter_pred *pred, void *event,	\ -			      int val1, int val2)			\ +static int filter_pred_##size(struct filter_pred *pred, void *event)	\  {									\  	u##size *addr = (u##size *)(event + pred->offset);		\  	u##size val = (u##size)pred->val;				\ @@ -178,23 +195,8 @@ DEFINE_EQUALITY_PRED(32);  DEFINE_EQUALITY_PRED(16);  DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), -			   void *event __attribute((unused)), -			   int val1, int val2) -{ -	return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), -			  void *event __attribute((unused)), -			  int val1, int val2) -{ -	return val1 || val2; -} -  /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, -			      int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event)  {  	char *addr = (char *)(event + pred->offset);  	int cmp, match; @@ -207,8 +209,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,  }  /* Filter predicate for char * pointers */ 
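
DEFINE_COMPARISON_PRED() above expands to one predicate per integer type that switches on the operator, and this patch adds an OP_BAND ('&') case next to the relational ones. A non-macro userspace sketch of roughly what one expansion does for a 64-bit field (enum values and names here are illustrative):

/*
 * Sketch of what one DEFINE_COMPARISON_PRED() expansion does for a
 * 64-bit field, including the OP_BAND case added above.  Simplified:
 * the real predicate loads the field from the event at pred->offset,
 * and the enum values below are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

enum op { OP_NE, OP_EQ, OP_LT, OP_LE, OP_GT, OP_GE, OP_BAND };

static int filter_pred_s64(enum op op, int64_t field, int64_t val)
{
	switch (op) {
	case OP_NE:	return field != val;
	case OP_EQ:	return field == val;
	case OP_LT:	return field < val;
	case OP_LE:	return field <= val;
	case OP_GT:	return field > val;
	case OP_GE:	return field >= val;
	case OP_BAND:	return (field & val) != 0;	/* new bitwise test */
	}
	return 0;
}

int main(void)
{
	printf("%d\n", filter_pred_s64(OP_BAND, 0x6, 0x4));	/* 1: "flags & 4" style filter */
	printf("%d\n", filter_pred_s64(OP_GE, 3, 5));		/* 0 */
	return 0;
}
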
-static int filter_pred_pchar(struct filter_pred *pred, void *event, -			     int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event)  {  	char **addr = (char **)(event + pred->offset);  	int cmp, match; @@ -231,8 +232,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,   * and add it to the address of the entry, and at last we have   * the address of the string.   */ -static int filter_pred_strloc(struct filter_pred *pred, void *event, -			      int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event)  {  	u32 str_item = *(u32 *)(event + pred->offset);  	int str_loc = str_item & 0xffff; @@ -247,8 +247,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,  	return match;  } -static int filter_pred_none(struct filter_pred *pred, void *event, -			    int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event)  {  	return 0;  } @@ -377,32 +376,196 @@ static void filter_build_regex(struct filter_pred *pred)  	pred->not ^= not;  } -/* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct event_filter *filter, void *rec) +enum move_type { +	MOVE_DOWN, +	MOVE_UP_FROM_LEFT, +	MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, +		int index, enum move_type *move)  { -	int match, top = 0, val1 = 0, val2 = 0; -	int stack[MAX_FILTER_PRED]; -	struct filter_pred *pred; -	int i; +	if (pred->parent & FILTER_PRED_IS_RIGHT) +		*move = MOVE_UP_FROM_RIGHT; +	else +		*move = MOVE_UP_FROM_LEFT; +	pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; -	for (i = 0; i < filter->n_preds; i++) { -		pred = filter->preds[i]; -		if (!pred->pop_n) { -			match = pred->fn(pred, rec, val1, val2); -			stack[top++] = match; +	return pred; +} + +enum walk_return { +	WALK_PRED_ABORT, +	WALK_PRED_PARENT, +	WALK_PRED_DEFAULT, +}; + +typedef int (*filter_pred_walkcb_t) (enum move_type move, +				     struct filter_pred *pred, +				     int *err, void *data); + +static int walk_pred_tree(struct filter_pred *preds, +			  struct filter_pred *root, +			  filter_pred_walkcb_t cb, void *data) +{ +	struct filter_pred *pred = root; +	enum move_type move = MOVE_DOWN; +	int done = 0; + +	if  (!preds) +		return -EINVAL; + +	do { +		int err = 0, ret; + +		ret = cb(move, pred, &err, data); +		if (ret == WALK_PRED_ABORT) +			return err; +		if (ret == WALK_PRED_PARENT) +			goto get_parent; + +		switch (move) { +		case MOVE_DOWN: +			if (pred->left != FILTER_PRED_INVALID) { +				pred = &preds[pred->left]; +				continue; +			} +			goto get_parent; +		case MOVE_UP_FROM_LEFT: +			pred = &preds[pred->right]; +			move = MOVE_DOWN; +			continue; +		case MOVE_UP_FROM_RIGHT: + get_parent: +			if (pred == root) +				break; +			pred = get_pred_parent(pred, preds, +					       pred->parent, +					       &move);  			continue;  		} -		if (pred->pop_n > top) { -			WARN_ON_ONCE(1); -			return 0; +		done = 1; +	} while (!done); + +	/* We are fine. */ +	return 0; +} + +/* + * A series of AND or ORs where found together. Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. 
+ */ +static int process_ops(struct filter_pred *preds, +		       struct filter_pred *op, void *rec) +{ +	struct filter_pred *pred; +	int match = 0; +	int type; +	int i; + +	/* +	 * Micro-optimization: We set type to true if op +	 * is an OR and false otherwise (AND). Then we +	 * just need to test if the match is equal to +	 * the type, and if it is, we can short circuit the +	 * rest of the checks: +	 * +	 * if ((match && op->op == OP_OR) || +	 *     (!match && op->op == OP_AND)) +	 *	  return match; +	 */ +	type = op->op == OP_OR; + +	for (i = 0; i < op->val; i++) { +		pred = &preds[op->ops[i]]; +		if (!WARN_ON_ONCE(!pred->fn)) +			match = pred->fn(pred, rec); +		if (!!match == type) +			return match; +	} +	return match; +} + +struct filter_match_preds_data { +	struct filter_pred *preds; +	int match; +	void *rec; +}; + +static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, +				 int *err, void *data) +{ +	struct filter_match_preds_data *d = data; + +	*err = 0; +	switch (move) { +	case MOVE_DOWN: +		/* only AND and OR have children */ +		if (pred->left != FILTER_PRED_INVALID) { +			/* If ops is set, then it was folded. */ +			if (!pred->ops) +				return WALK_PRED_DEFAULT; +			/* We can treat folded ops as a leaf node */ +			d->match = process_ops(d->preds, pred, d->rec); +		} else { +			if (!WARN_ON_ONCE(!pred->fn)) +				d->match = pred->fn(pred, d->rec);  		} -		val1 = stack[--top]; -		val2 = stack[--top]; -		match = pred->fn(pred, rec, val1, val2); -		stack[top++] = match; + +		return WALK_PRED_PARENT; +	case MOVE_UP_FROM_LEFT: +		/* +		 * Check for short circuits. +		 * +		 * Optimization: !!match == (pred->op == OP_OR) +		 *   is the same as: +		 * if ((match && pred->op == OP_OR) || +		 *     (!match && pred->op == OP_AND)) +		 */ +		if (!!d->match == (pred->op == OP_OR)) +			return WALK_PRED_PARENT; +		break; +	case MOVE_UP_FROM_RIGHT: +		break;  	} -	return stack[--top]; +	return WALK_PRED_DEFAULT; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct event_filter *filter, void *rec) +{ +	struct filter_pred *preds; +	struct filter_pred *root; +	struct filter_match_preds_data data = { +		/* match is currently meaningless */ +		.match = -1, +		.rec   = rec, +	}; +	int n_preds, ret; + +	/* no filter is considered a match */ +	if (!filter) +		return 1; + +	n_preds = filter->n_preds; +	if (!n_preds) +		return 1; + +	/* +	 * n_preds, root and filter->preds are protect with preemption disabled. 
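
process_ops() above evaluates a folded run of same-operator predicates with a short-circuit trick: type is 1 for OR and 0 for AND, so the loop can stop as soon as a predicate's result equals type. A standalone sketch of that loop with toy predicates:

/*
 * Userspace sketch of the short-circuit loop in process_ops() above:
 * for an OR group stop at the first true result, for an AND group at
 * the first false one.
 */
#include <stdio.h>

enum logical { GROUP_AND, GROUP_OR };

typedef int (*pred_fn)(const void *rec);

static int evaluate_group(enum logical op, pred_fn *preds, int n,
			  const void *rec)
{
	int type = (op == GROUP_OR);	/* 1 for OR, 0 for AND */
	int match = 0;

	for (int i = 0; i < n; i++) {
		match = preds[i](rec);
		if (!!match == type)	/* short circuit the rest */
			return match;
	}
	return match;
}

/* toy predicates over an int record */
static int is_positive(const void *rec) { return *(const int *)rec > 0; }
static int is_even(const void *rec)     { return (*(const int *)rec & 1) == 0; }

int main(void)
{
	int rec = 7;
	pred_fn preds[] = { is_positive, is_even };

	printf("OR  -> %d\n", evaluate_group(GROUP_OR, preds, 2, &rec));	/* 1 */
	printf("AND -> %d\n", evaluate_group(GROUP_AND, preds, 2, &rec));	/* 0 */
	return 0;
}
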
+	 */ +	root = rcu_dereference_sched(filter->root); +	if (!root) +		return 1; + +	data.preds = preds = rcu_dereference_sched(filter->preds); +	ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); +	WARN_ON(ret); +	return data.match;  }  EXPORT_SYMBOL_GPL(filter_match_preds); @@ -414,6 +577,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)  static void remove_filter_string(struct event_filter *filter)  { +	if (!filter) +		return; +  	kfree(filter->filter_string);  	filter->filter_string = NULL;  } @@ -471,214 +637,297 @@ static void append_filter_err(struct filter_parse_state *ps,  	free_page((unsigned long) buf);  } -void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +static inline struct event_filter *event_filter(struct ftrace_event_file *file)  { -	struct event_filter *filter = call->filter; +	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		return file->event_call->filter; +	else +		return file->filter; +} + +/* caller must hold event_mutex */ +void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) +{ +	struct event_filter *filter = event_filter(file); -	mutex_lock(&event_mutex);  	if (filter && filter->filter_string)  		trace_seq_printf(s, "%s\n", filter->filter_string);  	else -		trace_seq_printf(s, "none\n"); -	mutex_unlock(&event_mutex); +		trace_seq_puts(s, "none\n");  }  void print_subsystem_event_filter(struct event_subsystem *system,  				  struct trace_seq *s)  { -	struct event_filter *filter = system->filter; +	struct event_filter *filter;  	mutex_lock(&event_mutex); +	filter = system->filter;  	if (filter && filter->filter_string)  		trace_seq_printf(s, "%s\n", filter->filter_string);  	else -		trace_seq_printf(s, "none\n"); +		trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n");  	mutex_unlock(&event_mutex);  } -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)  { -	struct ftrace_event_field *field; - -	list_for_each_entry(field, head, link) { -		if (!strcmp(field->name, name)) -			return field; -	} +	stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); +	if (!stack->preds) +		return -ENOMEM; +	stack->index = n_preds; +	return 0; +} -	return NULL; +static void __free_pred_stack(struct pred_stack *stack) +{ +	kfree(stack->preds); +	stack->index = 0;  } -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) +static int __push_pred_stack(struct pred_stack *stack, +			     struct filter_pred *pred)  { -	struct ftrace_event_field *field; -	struct list_head *head; +	int index = stack->index; -	field = __find_event_field(&ftrace_common_fields, name); -	if (field) -		return field; +	if (WARN_ON(index == 0)) +		return -ENOSPC; -	head = trace_get_fields(call); -	return __find_event_field(head, name); +	stack->preds[--index] = pred; +	stack->index = index; +	return 0;  } -static void filter_free_pred(struct filter_pred *pred) +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack)  { +	struct filter_pred *pred; +	int index = stack->index; + +	pred = stack->preds[index++];  	if (!pred) -		return; +		return NULL; -	kfree(pred->field_name); -	kfree(pred); +	stack->index = index; +	return pred;  } -static void filter_clear_pred(struct filter_pred *pred) +static int filter_set_pred(struct event_filter *filter, +			   int idx, +			   struct pred_stack *stack, +			   struct filter_pred *src)  { -	
kfree(pred->field_name); -	pred->field_name = NULL; -	pred->regex.len = 0; -} +	struct filter_pred *dest = &filter->preds[idx]; +	struct filter_pred *left; +	struct filter_pred *right; -static int filter_set_pred(struct filter_pred *dest, -			   struct filter_pred *src, -			   filter_pred_fn_t fn) -{  	*dest = *src; -	if (src->field_name) { -		dest->field_name = kstrdup(src->field_name, GFP_KERNEL); -		if (!dest->field_name) -			return -ENOMEM; +	dest->index = idx; + +	if (dest->op == OP_OR || dest->op == OP_AND) { +		right = __pop_pred_stack(stack); +		left = __pop_pred_stack(stack); +		if (!left || !right) +			return -EINVAL; +		/* +		 * If both children can be folded +		 * and they are the same op as this op or a leaf, +		 * then this op can be folded. +		 */ +		if (left->index & FILTER_PRED_FOLD && +		    (left->op == dest->op || +		     left->left == FILTER_PRED_INVALID) && +		    right->index & FILTER_PRED_FOLD && +		    (right->op == dest->op || +		     right->left == FILTER_PRED_INVALID)) +			dest->index |= FILTER_PRED_FOLD; + +		dest->left = left->index & ~FILTER_PRED_FOLD; +		dest->right = right->index & ~FILTER_PRED_FOLD; +		left->parent = dest->index & ~FILTER_PRED_FOLD; +		right->parent = dest->index | FILTER_PRED_IS_RIGHT; +	} else { +		/* +		 * Make dest->left invalid to be used as a quick +		 * way to know this is a leaf node. +		 */ +		dest->left = FILTER_PRED_INVALID; + +		/* All leafs allow folding the parent ops. */ +		dest->index |= FILTER_PRED_FOLD;  	} -	dest->fn = fn; -	return 0; +	return __push_pred_stack(stack, dest);  } -static void filter_disable_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter)  { -	struct event_filter *filter = call->filter;  	int i; -	call->flags &= ~TRACE_EVENT_FL_FILTERED; +	if (filter->preds) { +		for (i = 0; i < filter->n_preds; i++) +			kfree(filter->preds[i].ops); +		kfree(filter->preds); +		filter->preds = NULL; +	} +	filter->a_preds = 0;  	filter->n_preds = 0; +} -	for (i = 0; i < MAX_FILTER_PRED; i++) -		filter->preds[i]->fn = filter_pred_none; +static void call_filter_disable(struct ftrace_event_call *call) +{ +	call->flags &= ~TRACE_EVENT_FL_FILTERED;  } -static void __free_preds(struct event_filter *filter) +static void filter_disable(struct ftrace_event_file *file)  { -	int i; +	struct ftrace_event_call *call = file->event_call; +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		call_filter_disable(call); +	else +		file->flags &= ~FTRACE_EVENT_FL_FILTERED; +} + +static void __free_filter(struct event_filter *filter) +{  	if (!filter)  		return; -	for (i = 0; i < MAX_FILTER_PRED; i++) { -		if (filter->preds[i]) -			filter_free_pred(filter->preds[i]); -	} -	kfree(filter->preds); +	__free_preds(filter);  	kfree(filter->filter_string);  	kfree(filter);  } -void destroy_preds(struct ftrace_event_call *call) +void free_event_filter(struct event_filter *filter) +{ +	__free_filter(filter); +} + +void destroy_call_preds(struct ftrace_event_call *call)  { -	__free_preds(call->filter); +	__free_filter(call->filter);  	call->filter = NULL; -	call->flags &= ~TRACE_EVENT_FL_FILTERED;  } -static struct event_filter *__alloc_preds(void) +static void destroy_file_preds(struct ftrace_event_file *file) +{ +	__free_filter(file->filter); +	file->filter = NULL; +} + +/* + * Called when destroying the ftrace_event_file. + * The file is being freed, so we do not need to worry about + * the file being currently used. This is for module code removing + * the tracepoints from within it. 
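
filter_set_pred() above converts the postfix predicate list into a binary tree using an explicit stack: leaves are pushed, and each AND/OR pops its right and then left child before being pushed back as their parent. A compact userspace sketch of that step over a postfix token array (pred indices and the FOLD flag omitted):

/*
 * Sketch of the stack-based postfix -> tree step performed by
 * filter_set_pred() above (pred indices and FILTER_PRED_FOLD omitted).
 * The postfix input "a b && c ||" builds the tree (a && b) || c.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	const char *tok;
	struct node *left, *right;	/* NULL for leaf predicates */
};

int main(void)
{
	const char *postfix[] = { "a", "b", "&&", "c", "||" };
	struct node *stack[8];
	int top = 0;

	for (size_t i = 0; i < sizeof(postfix) / sizeof(postfix[0]); i++) {
		struct node *n = calloc(1, sizeof(*n));

		if (!n)
			return 1;
		n->tok = postfix[i];
		if (!strcmp(n->tok, "&&") || !strcmp(n->tok, "||")) {
			n->right = stack[--top];	/* pop right, then left */
			n->left  = stack[--top];
		}
		stack[top++] = n;			/* push the result back */
	}

	/* exactly one node should remain: the root of the filter tree */
	struct node *root = stack[0];
	printf("root=%s left=%s right=%s\n",
	       root->tok, root->left->tok, root->right->tok);
	return 0;
}
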
+ */ +void destroy_preds(struct ftrace_event_file *file) +{ +	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		destroy_call_preds(file->event_call); +	else +		destroy_file_preds(file); +} + +static struct event_filter *__alloc_filter(void)  {  	struct event_filter *filter; + +	filter = kzalloc(sizeof(*filter), GFP_KERNEL); +	return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{  	struct filter_pred *pred;  	int i; -	filter = kzalloc(sizeof(*filter), GFP_KERNEL); -	if (!filter) -		return ERR_PTR(-ENOMEM); +	if (filter->preds) +		__free_preds(filter); -	filter->n_preds = 0; +	filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); -	filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);  	if (!filter->preds) -		goto oom; +		return -ENOMEM; -	for (i = 0; i < MAX_FILTER_PRED; i++) { -		pred = kzalloc(sizeof(*pred), GFP_KERNEL); -		if (!pred) -			goto oom; +	filter->a_preds = n_preds; +	filter->n_preds = 0; + +	for (i = 0; i < n_preds; i++) { +		pred = &filter->preds[i];  		pred->fn = filter_pred_none; -		filter->preds[i] = pred;  	} -	return filter; - -oom: -	__free_preds(filter); -	return ERR_PTR(-ENOMEM); +	return 0;  } -static int init_preds(struct ftrace_event_call *call) +static inline void __remove_filter(struct ftrace_event_file *file)  { -	if (call->filter) -		return 0; - -	call->flags &= ~TRACE_EVENT_FL_FILTERED; -	call->filter = __alloc_preds(); -	if (IS_ERR(call->filter)) -		return PTR_ERR(call->filter); +	struct ftrace_event_call *call = file->event_call; -	return 0; +	filter_disable(file); +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		remove_filter_string(call->filter); +	else +		remove_filter_string(file->filter);  } -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system, +					struct trace_array *tr)  { +	struct ftrace_event_file *file;  	struct ftrace_event_call *call; -	int err; -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (strcmp(call->class->system, system->name) != 0)  			continue; -		err = init_preds(call); -		if (err) -			return err; +		__remove_filter(file);  	} +} -	return 0; +static inline void __free_subsystem_filter(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { +		__free_filter(call->filter); +		call->filter = NULL; +	} else { +		__free_filter(file->filter); +		file->filter = NULL; +	}  } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system, +					  struct trace_array *tr)  { +	struct ftrace_event_file *file;  	struct ftrace_event_call *call; -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (strcmp(call->class->system, system->name) != 0)  			continue; - -		filter_disable_preds(call); -		remove_filter_string(call->filter); +		__free_subsystem_filter(file);  	}  } -static int filter_add_pred_fn(struct filter_parse_state *ps, -			      struct ftrace_event_call *call, -			      struct event_filter *filter, -			      struct filter_pred *pred, -			      filter_pred_fn_t fn) +static int filter_add_pred(struct filter_parse_state *ps, +			   struct event_filter *filter, +			   struct filter_pred *pred, +			   struct pred_stack *stack)  { -	int 
idx, err; +	int err; -	if (filter->n_preds == MAX_FILTER_PRED) { +	if (WARN_ON(filter->n_preds == filter->a_preds)) {  		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);  		return -ENOSPC;  	} -	idx = filter->n_preds; -	filter_clear_pred(filter->preds[idx]); -	err = filter_set_pred(filter->preds[idx], pred, fn); +	err = filter_set_pred(filter, filter->n_preds, stack, pred);  	if (err)  		return err; @@ -698,6 +947,11 @@ int filter_assign_type(const char *type)  	return FILTER_OTHER;  } +static bool is_function_field(struct ftrace_event_field *field) +{ +	return field->filter_type == FILTER_TRACE_FN; +} +  static bool is_string_field(struct ftrace_event_field *field)  {  	return field->filter_type == FILTER_DYN_STRING || @@ -759,35 +1013,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,  	return fn;  } -static int filter_add_pred(struct filter_parse_state *ps, -			   struct ftrace_event_call *call, -			   struct event_filter *filter, -			   struct filter_pred *pred, -			   bool dry_run) +static int init_pred(struct filter_parse_state *ps, +		     struct ftrace_event_field *field, +		     struct filter_pred *pred) +  { -	struct ftrace_event_field *field; -	filter_pred_fn_t fn; +	filter_pred_fn_t fn = filter_pred_none;  	unsigned long long val;  	int ret; -	pred->fn = filter_pred_none; - -	if (pred->op == OP_AND) { -		pred->pop_n = 2; -		fn = filter_pred_and; -		goto add_pred_fn; -	} else if (pred->op == OP_OR) { -		pred->pop_n = 2; -		fn = filter_pred_or; -		goto add_pred_fn; -	} - -	field = find_event_field(call, pred->field_name); -	if (!field) { -		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); -		return -EINVAL; -	} -  	pred->offset = field->offset;  	if (!is_legal_op(field, pred->op)) { @@ -805,11 +1039,16 @@ static int filter_add_pred(struct filter_parse_state *ps,  			fn = filter_pred_strloc;  		else  			fn = filter_pred_pchar; +	} else if (is_function_field(field)) { +		if (strcmp(field->name, "ip")) { +			parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); +			return -EINVAL; +		}  	} else {  		if (field->is_signed) -			ret = strict_strtoll(pred->regex.pattern, 0, &val); +			ret = kstrtoll(pred->regex.pattern, 0, &val);  		else -			ret = strict_strtoull(pred->regex.pattern, 0, &val); +			ret = kstrtoull(pred->regex.pattern, 0, &val);  		if (ret) {  			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);  			return -EINVAL; @@ -827,9 +1066,7 @@ static int filter_add_pred(struct filter_parse_state *ps,  	if (pred->op == OP_NE)  		pred->not = 1; -add_pred_fn: -	if (!dry_run) -		return filter_add_pred_fn(ps, call, filter, pred, fn); +	pred->fn = fn;  	return 0;  } @@ -1128,39 +1365,34 @@ parse_operand:  	return 0;  } -static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +static struct filter_pred *create_pred(struct filter_parse_state *ps, +				       struct ftrace_event_call *call, +				       int op, char *operand1, char *operand2)  { -	struct filter_pred *pred; +	struct ftrace_event_field *field; +	static struct filter_pred pred; -	pred = kzalloc(sizeof(*pred), GFP_KERNEL); -	if (!pred) -		return NULL; +	memset(&pred, 0, sizeof(pred)); +	pred.op = op; -	pred->field_name = kstrdup(operand1, GFP_KERNEL); -	if (!pred->field_name) { -		kfree(pred); +	if (op == OP_AND || op == OP_OR) +		return &pred; + +	if (!operand1 || !operand2) { +		parse_error(ps, FILT_ERR_MISSING_FIELD, 0);  		return NULL;  	} -	strcpy(pred->regex.pattern, operand2); -	pred->regex.len = strlen(pred->regex.pattern); - -	pred->op = op; - -	return pred; -} - -static struct filter_pred 
*create_logical_pred(int op) -{ -	struct filter_pred *pred; - -	pred = kzalloc(sizeof(*pred), GFP_KERNEL); -	if (!pred) +	field = trace_find_event_field(call, operand1); +	if (!field) { +		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);  		return NULL; +	} -	pred->op = op; - -	return pred; +	strcpy(pred.regex.pattern, operand2); +	pred.regex.len = strlen(pred.regex.pattern); +	pred.field = field; +	return init_pred(ps, field, &pred) ? NULL : &pred;  }  static int check_preds(struct filter_parse_state *ps) @@ -1187,6 +1419,166 @@ static int check_preds(struct filter_parse_state *ps)  	return 0;  } +static int count_preds(struct filter_parse_state *ps) +{ +	struct postfix_elt *elt; +	int n_preds = 0; + +	list_for_each_entry(elt, &ps->postfix, list) { +		if (elt->op == OP_NONE) +			continue; +		n_preds++; +	} + +	return n_preds; +} + +struct check_pred_data { +	int count; +	int max; +}; + +static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, +			      int *err, void *data) +{ +	struct check_pred_data *d = data; + +	if (WARN_ON(d->count++ > d->max)) { +		*err = -EINVAL; +		return WALK_PRED_ABORT; +	} +	return WALK_PRED_DEFAULT; +} + +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, +			   struct filter_pred *root) +{ +	struct check_pred_data data = { +		/* +		 * The max that we can hit a node is three times. +		 * Once going down, once coming up from left, and +		 * once coming up from right. This is more than enough +		 * since leafs are only hit a single time. +		 */ +		.max   = 3 * filter->n_preds, +		.count = 0, +	}; + +	return walk_pred_tree(filter->preds, root, +			      check_pred_tree_cb, &data); +} + +static int count_leafs_cb(enum move_type move, struct filter_pred *pred, +			  int *err, void *data) +{ +	int *count = data; + +	if ((move == MOVE_DOWN) && +	    (pred->left == FILTER_PRED_INVALID)) +		(*count)++; + +	return WALK_PRED_DEFAULT; +} + +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ +	int count = 0, ret; + +	ret = walk_pred_tree(preds, root, count_leafs_cb, &count); +	WARN_ON(ret); +	return count; +} + +struct fold_pred_data { +	struct filter_pred *root; +	int count; +	int children; +}; + +static int fold_pred_cb(enum move_type move, struct filter_pred *pred, +			int *err, void *data) +{ +	struct fold_pred_data *d = data; +	struct filter_pred *root = d->root; + +	if (move != MOVE_DOWN) +		return WALK_PRED_DEFAULT; +	if (pred->left != FILTER_PRED_INVALID) +		return WALK_PRED_DEFAULT; + +	if (WARN_ON(d->count == d->children)) { +		*err = -EINVAL; +		return WALK_PRED_ABORT; +	} + +	pred->index &= ~FILTER_PRED_FOLD; +	root->ops[d->count++] = pred->index; +	return WALK_PRED_DEFAULT; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ +	struct fold_pred_data data = { +		.root  = root, +		.count = 0, +	}; +	int children; + +	/* No need to keep the fold flag */ +	root->index &= ~FILTER_PRED_FOLD; + +	/* If the root is a leaf then do nothing */ +	if (root->left == FILTER_PRED_INVALID) +		return 0; + +	/* count the children */ +	children = count_leafs(preds, &preds[root->left]); +	children += count_leafs(preds, &preds[root->right]); + +	root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); +	if (!root->ops) +		return -ENOMEM; + +	root->val = children; +	data.children = children; +	return 
walk_pred_tree(preds, root, fold_pred_cb, &data); +} + +static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, +			     int *err, void *data) +{ +	struct filter_pred *preds = data; + +	if (move != MOVE_DOWN) +		return WALK_PRED_DEFAULT; +	if (!(pred->index & FILTER_PRED_FOLD)) +		return WALK_PRED_DEFAULT; + +	*err = fold_pred(preds, pred); +	if (*err) +		return WALK_PRED_ABORT; + +	/* eveyrhing below is folded, continue with parent */ +	return WALK_PRED_PARENT; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. + */ +static int fold_pred_tree(struct event_filter *filter, +			   struct filter_pred *root) +{ +	return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, +			      filter->preds); +} +  static int replace_preds(struct ftrace_event_call *call,  			 struct event_filter *filter,  			 struct filter_parse_state *ps, @@ -1195,14 +1587,32 @@ static int replace_preds(struct ftrace_event_call *call,  {  	char *operand1 = NULL, *operand2 = NULL;  	struct filter_pred *pred; +	struct filter_pred *root;  	struct postfix_elt *elt; +	struct pred_stack stack = { }; /* init to NULL */  	int err;  	int n_preds = 0; +	n_preds = count_preds(ps); +	if (n_preds >= MAX_FILTER_PRED) { +		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); +		return -ENOSPC; +	} +  	err = check_preds(ps);  	if (err)  		return err; +	if (!dry_run) { +		err = __alloc_pred_stack(&stack, n_preds); +		if (err) +			return err; +		err = __alloc_preds(filter, n_preds); +		if (err) +			goto fail; +	} + +	n_preds = 0;  	list_for_each_entry(elt, &ps->postfix, list) {  		if (elt->op == OP_NONE) {  			if (!operand1) @@ -1211,166 +1621,458 @@ static int replace_preds(struct ftrace_event_call *call,  				operand2 = elt->operand;  			else {  				parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); -				return -EINVAL; +				err = -EINVAL; +				goto fail;  			}  			continue;  		} -		if (n_preds++ == MAX_FILTER_PRED) { +		if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {  			parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); -			return -ENOSPC; +			err = -ENOSPC; +			goto fail;  		} -		if (elt->op == OP_AND || elt->op == OP_OR) { -			pred = create_logical_pred(elt->op); -			goto add_pred; +		pred = create_pred(ps, call, elt->op, operand1, operand2); +		if (!pred) { +			err = -EINVAL; +			goto fail;  		} -		if (!operand1 || !operand2) { -			parse_error(ps, FILT_ERR_MISSING_FIELD, 0); -			return -EINVAL; +		if (!dry_run) { +			err = filter_add_pred(ps, filter, pred, &stack); +			if (err) +				goto fail;  		} -		pred = create_pred(elt->op, operand1, operand2); -add_pred: +		operand1 = operand2 = NULL; +	} + +	if (!dry_run) { +		/* We should have one item left on the stack */ +		pred = __pop_pred_stack(&stack);  		if (!pred) -			return -ENOMEM; -		err = filter_add_pred(ps, call, filter, pred, dry_run); -		filter_free_pred(pred); +			return -EINVAL; +		/* This item is where we start from in matching */ +		root = pred; +		/* Make sure the stack is empty */ +		pred = __pop_pred_stack(&stack); +		if (WARN_ON(pred)) { +			err = -EINVAL; +			filter->root = NULL; +			goto fail; +		} +		err = check_pred_tree(filter, root);  		if (err) -			return err; +			goto fail; -		operand1 = operand2 = NULL; +		/* Optimize the tree */ +		err = fold_pred_tree(filter, root); +		if (err) +			goto fail; + +		/* We don't set root until we know it works */ +		barrier(); +		filter->root = root;  	} -	return 0; +	err = 0; +fail: +	
__free_pred_stack(&stack); +	return err; +} + +static inline void event_set_filtered_flag(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		call->flags |= TRACE_EVENT_FL_FILTERED; +	else +		file->flags |= FTRACE_EVENT_FL_FILTERED; +} + +static inline void event_set_filter(struct ftrace_event_file *file, +				    struct event_filter *filter) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		rcu_assign_pointer(call->filter, filter); +	else +		rcu_assign_pointer(file->filter, filter);  } +static inline void event_clear_filter(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		RCU_INIT_POINTER(call->filter, NULL); +	else +		RCU_INIT_POINTER(file->filter, NULL); +} + +static inline void +event_set_no_set_filter_flag(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; +	else +		file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline void +event_clear_no_set_filter_flag(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; +	else +		file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline bool +event_no_set_filter_flag(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) +		return true; + +	if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && +	    (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) +		return true; + +	return false; +} + +struct filter_list { +	struct list_head	list; +	struct event_filter	*filter; +}; +  static int replace_system_preds(struct event_subsystem *system, +				struct trace_array *tr,  				struct filter_parse_state *ps,  				char *filter_string)  { +	struct ftrace_event_file *file;  	struct ftrace_event_call *call; +	struct filter_list *filter_item; +	struct filter_list *tmp; +	LIST_HEAD(filter_list);  	bool fail = true;  	int err; -	list_for_each_entry(call, &ftrace_events, list) { -		struct event_filter *filter = call->filter; - +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (strcmp(call->class->system, system->name) != 0)  			continue; -		/* try to see if the filter can be applied */ -		err = replace_preds(call, filter, ps, filter_string, true); +		/* +		 * Try to see if the filter can be applied +		 *  (filter arg is ignored on dry_run) +		 */ +		err = replace_preds(call, NULL, ps, filter_string, true);  		if (err) +			event_set_no_set_filter_flag(file); +		else +			event_clear_no_set_filter_flag(file); +	} + +	list_for_each_entry(file, &tr->events, list) { +		struct event_filter *filter; + +		call = file->event_call; + +		if (strcmp(call->class->system, system->name) != 0)  			continue; -		/* really apply the filter */ -		filter_disable_preds(call); -		err = replace_preds(call, filter, ps, filter_string, false); +		if (event_no_set_filter_flag(file)) +			continue; + +		filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); +		if (!filter_item) +			goto fail_mem; + +		list_add_tail(&filter_item->list, &filter_list); + +		filter_item->filter = __alloc_filter(); +		if (!filter_item->filter) +			goto fail_mem; +		
filter = filter_item->filter; + +		/* Can only fail on no memory */ +		err = replace_filter_string(filter, filter_string);  		if (err) -			filter_disable_preds(call); -		else { -			call->flags |= TRACE_EVENT_FL_FILTERED; -			replace_filter_string(filter, filter_string); -		} +			goto fail_mem; + +		err = replace_preds(call, filter, ps, filter_string, false); +		if (err) { +			filter_disable(file); +			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); +			append_filter_err(ps, filter); +		} else +			event_set_filtered_flag(file); +		/* +		 * Regardless of if this returned an error, we still +		 * replace the filter for the call. +		 */ +		filter = event_filter(file); +		event_set_filter(file, filter_item->filter); +		filter_item->filter = filter; +  		fail = false;  	} -	if (fail) { -		parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); -		return -EINVAL; +	if (fail) +		goto fail; + +	/* +	 * The calls can still be using the old filters. +	 * Do a synchronize_sched() to ensure all calls are +	 * done with them before we free them. +	 */ +	synchronize_sched(); +	list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { +		__free_filter(filter_item->filter); +		list_del(&filter_item->list); +		kfree(filter_item);  	}  	return 0; + fail: +	/* No call succeeded */ +	list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { +		list_del(&filter_item->list); +		kfree(filter_item); +	} +	parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); +	return -EINVAL; + fail_mem: +	/* If any call succeeded, we still need to sync */ +	if (!fail) +		synchronize_sched(); +	list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { +		__free_filter(filter_item->filter); +		list_del(&filter_item->list); +		kfree(filter_item); +	} +	return -ENOMEM;  } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +static int create_filter_start(char *filter_str, bool set_str, +			       struct filter_parse_state **psp, +			       struct event_filter **filterp)  { +	struct event_filter *filter; +	struct filter_parse_state *ps = NULL; +	int err = 0; + +	WARN_ON_ONCE(*psp || *filterp); + +	/* allocate everything, and if any fails, free all and fail */ +	filter = __alloc_filter(); +	if (filter && set_str) +		err = replace_filter_string(filter, filter_str); + +	ps = kzalloc(sizeof(*ps), GFP_KERNEL); + +	if (!filter || !ps || err) { +		kfree(ps); +		__free_filter(filter); +		return -ENOMEM; +	} + +	/* we're committed to creating a new filter */ +	*filterp = filter; +	*psp = ps; + +	parse_init(ps, filter_ops, filter_str); +	err = filter_parse(ps); +	if (err && set_str) +		append_filter_err(ps, filter); +	return err; +} + +static void create_filter_finish(struct filter_parse_state *ps) +{ +	if (ps) { +		filter_opstack_clear(ps); +		postfix_clear(ps); +		kfree(ps); +	} +} + +/** + * create_filter - create a filter for a ftrace_event_call + * @call: ftrace_event_call to create a filter for + * @filter_str: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * + * Creates a filter for @call with @filter_str.  If @set_str is %true, + * @filter_str is copied and recorded in the new filter. + * + * On success, returns 0 and *@filterp points to the new filter.  On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter.  In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. 
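In practice the contract above gives callers a short, uniform pattern. A minimal sketch of such a caller (illustrative only, not part of the patch; the filter string is an arbitrary example):

	struct event_filter *filter = NULL;
	int err;

	err = create_filter(call, "common_pid != 0", true, &filter);
	if (err) {
		/* on failure, 'filter' may still hold the parse error text */
		__free_filter(filter);	/* NULL-safe, as in the self-test further down */
		return err;
	}
	/* on success, 'filter' is ready to be published, e.g. via event_set_filter() */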
+ */ +static int create_filter(struct ftrace_event_call *call, +			 char *filter_str, bool set_str, +			 struct event_filter **filterp) +{ +	struct event_filter *filter = NULL; +	struct filter_parse_state *ps = NULL;  	int err; -	struct filter_parse_state *ps; -	mutex_lock(&event_mutex); +	err = create_filter_start(filter_str, set_str, &ps, &filter); +	if (!err) { +		err = replace_preds(call, filter, ps, filter_str, false); +		if (err && set_str) +			append_filter_err(ps, filter); +	} +	create_filter_finish(ps); -	err = init_preds(call); -	if (err) -		goto out_unlock; +	*filterp = filter; +	return err; +} -	if (!strcmp(strstrip(filter_string), "0")) { -		filter_disable_preds(call); -		remove_filter_string(call->filter); -		goto out_unlock; +int create_event_filter(struct ftrace_event_call *call, +			char *filter_str, bool set_str, +			struct event_filter **filterp) +{ +	return create_filter(call, filter_str, set_str, filterp); +} + +/** + * create_system_filter - create a filter for an event_subsystem + * @system: event_subsystem to create a filter for + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. + */ +static int create_system_filter(struct event_subsystem *system, +				struct trace_array *tr, +				char *filter_str, struct event_filter **filterp) +{ +	struct event_filter *filter = NULL; +	struct filter_parse_state *ps = NULL; +	int err; + +	err = create_filter_start(filter_str, true, &ps, &filter); +	if (!err) { +		err = replace_system_preds(system, tr, ps, filter_str); +		if (!err) { +			/* System filters just show a default message */ +			kfree(filter->filter_string); +			filter->filter_string = NULL; +		} else { +			append_filter_err(ps, filter); +		}  	} +	create_filter_finish(ps); -	err = -ENOMEM; -	ps = kzalloc(sizeof(*ps), GFP_KERNEL); -	if (!ps) -		goto out_unlock; +	*filterp = filter; +	return err; +} -	filter_disable_preds(call); -	replace_filter_string(call->filter, filter_string); +/* caller must hold event_mutex */ +int apply_event_filter(struct ftrace_event_file *file, char *filter_string) +{ +	struct ftrace_event_call *call = file->event_call; +	struct event_filter *filter; +	int err; -	parse_init(ps, filter_ops, filter_string); -	err = filter_parse(ps); -	if (err) { -		append_filter_err(ps, call->filter); -		goto out; +	if (!strcmp(strstrip(filter_string), "0")) { +		filter_disable(file); +		filter = event_filter(file); + +		if (!filter) +			return 0; + +		event_clear_filter(file); + +		/* Make sure the filter is not being used */ +		synchronize_sched(); +		__free_filter(filter); + +		return 0;  	} -	err = replace_preds(call, call->filter, ps, filter_string, false); -	if (err) -		append_filter_err(ps, call->filter); -	else -		call->flags |= TRACE_EVENT_FL_FILTERED; -out: -	filter_opstack_clear(ps); -	postfix_clear(ps); -	kfree(ps); -out_unlock: -	mutex_unlock(&event_mutex); +	err = create_filter(call, filter_string, true, &filter); + +	/* +	 * Always swap the call filter with the new filter +	 * even if there was an error. 
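The swap performed below is safe for concurrent readers because the tracepoint side only dereferences the published filter pointer with preemption disabled; that is the grace period the later synchronize_sched() waits for. Roughly, the reader side looks like this (simplified sketch; the real check is inlined in the event commit path):

	/* tracepoint/commit side, preemption disabled */
	filter = rcu_dereference_sched(file->filter);
	if (filter && !filter_match_preds(filter, entry))
		return;			/* record is filtered out */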
If there was an error +	 * in the filter, we disable the filter and show the error +	 * string +	 */ +	if (filter) { +		struct event_filter *tmp; + +		tmp = event_filter(file); +		if (!err) +			event_set_filtered_flag(file); +		else +			filter_disable(file); + +		event_set_filter(file, filter); + +		if (tmp) { +			/* Make sure the call is done with the filter */ +			synchronize_sched(); +			__free_filter(tmp); +		} +	}  	return err;  } -int apply_subsystem_event_filter(struct event_subsystem *system, +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  				 char *filter_string)  { -	int err; -	struct filter_parse_state *ps; +	struct event_subsystem *system = dir->subsystem; +	struct trace_array *tr = dir->tr; +	struct event_filter *filter; +	int err = 0;  	mutex_lock(&event_mutex); -	err = init_subsystem_preds(system); -	if (err) +	/* Make sure the system still has events */ +	if (!dir->nr_events) { +		err = -ENODEV;  		goto out_unlock; +	}  	if (!strcmp(strstrip(filter_string), "0")) { -		filter_free_subsystem_preds(system); +		filter_free_subsystem_preds(system, tr);  		remove_filter_string(system->filter); +		filter = system->filter; +		system->filter = NULL; +		/* Ensure all filters are no longer used */ +		synchronize_sched(); +		filter_free_subsystem_filters(system, tr); +		__free_filter(filter);  		goto out_unlock;  	} -	err = -ENOMEM; -	ps = kzalloc(sizeof(*ps), GFP_KERNEL); -	if (!ps) -		goto out_unlock; - -	replace_filter_string(system->filter, filter_string); - -	parse_init(ps, filter_ops, filter_string); -	err = filter_parse(ps); -	if (err) { -		append_filter_err(ps, system->filter); -		goto out; +	err = create_system_filter(system, tr, filter_string, &filter); +	if (filter) { +		/* +		 * No event actually uses the system filter +		 * we can free it without synchronize_sched(). +		 */ +		__free_filter(system->filter); +		system->filter = filter;  	} - -	err = replace_system_preds(system, ps, filter_string); -	if (err) -		append_filter_err(ps, system->filter); - -out: -	filter_opstack_clear(ps); -	postfix_clear(ps); -	kfree(ps);  out_unlock:  	mutex_unlock(&event_mutex); @@ -1384,60 +2086,182 @@ void ftrace_profile_free_filter(struct perf_event *event)  	struct event_filter *filter = event->filter;  	event->filter = NULL; -	__free_preds(filter); +	__free_filter(filter); +} + +struct function_filter_data { +	struct ftrace_ops *ops; +	int first_filter; +	int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ +	char *str, *sep, **re; + +	str = kstrndup(buf, len, GFP_KERNEL); +	if (!str) +		return NULL; + +	/* +	 * The argv_split function takes white space +	 * as a separator, so convert ',' into spaces. +	 */ +	while ((sep = strchr(str, ','))) +		*sep = ' '; + +	re = argv_split(GFP_KERNEL, str, count); +	kfree(str); +	return re;  } +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, +				      int reset, char *re, int len) +{ +	int ret; + +	if (filter) +		ret = ftrace_set_filter(ops, re, len, reset); +	else +		ret = ftrace_set_notrace(ops, re, len, reset); + +	return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, +					struct function_filter_data *data) +{ +	int i, re_cnt, ret = -EINVAL; +	int *reset; +	char **re; + +	reset = filter ? &data->first_filter : &data->first_notrace; + +	/* +	 * The 'ip' field could have multiple filters set, separated +	 * either by space or comma. 
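A stand-alone user-space approximation of that comma handling (illustration only; the kernel helper above uses kstrndup() and argv_split() instead, and the function names in main() are just example patterns):

	#include <stdio.h>
	#include <string.h>

	/* Rough analogue of ftrace_function_filter_re(): treat ',' as
	 * whitespace, then split the buffer into individual patterns. */
	static int split_patterns(char *buf, char **out, int max)
	{
		char *sep, *tok, *save;
		int n = 0;

		while ((sep = strchr(buf, ',')))
			*sep = ' ';

		for (tok = strtok_r(buf, " \t", &save);
		     tok && n < max;
		     tok = strtok_r(NULL, " \t", &save))
			out[n++] = tok;

		return n;
	}

	int main(void)
	{
		char str[] = "do_fork,sys_clone vfs_read";
		char *re[8];
		int i, n = split_patterns(str, re, 8);

		for (i = 0; i < n; i++)
			printf("pattern %d: %s\n", i, re[i]);	/* do_fork, sys_clone, vfs_read */
		return 0;
	}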
We first cut the filter and apply +	 * all pieces separatelly. +	 */ +	re = ftrace_function_filter_re(buf, len, &re_cnt); +	if (!re) +		return -EINVAL; + +	for (i = 0; i < re_cnt; i++) { +		ret = ftrace_function_set_regexp(data->ops, filter, *reset, +						 re[i], strlen(re[i])); +		if (ret) +			break; + +		if (*reset) +			*reset = 0; +	} + +	argv_free(re); +	return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ +	struct ftrace_event_field *field = pred->field; + +	if (leaf) { +		/* +		 * Check the leaf predicate for function trace, verify: +		 *  - only '==' and '!=' is used +		 *  - the 'ip' field is used +		 */ +		if ((pred->op != OP_EQ) && (pred->op != OP_NE)) +			return -EINVAL; + +		if (strcmp(field->name, "ip")) +			return -EINVAL; +	} else { +		/* +		 * Check the non leaf predicate for function trace, verify: +		 *  - only '||' is used +		*/ +		if (pred->op != OP_OR) +			return -EINVAL; +	} + +	return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, +					 struct filter_pred *pred, +					 int *err, void *data) +{ +	/* Checking the node is valid for function trace. */ +	if ((move != MOVE_DOWN) || +	    (pred->left != FILTER_PRED_INVALID)) { +		*err = ftrace_function_check_pred(pred, 0); +	} else { +		*err = ftrace_function_check_pred(pred, 1); +		if (*err) +			return WALK_PRED_ABORT; + +		*err = __ftrace_function_set_filter(pred->op == OP_EQ, +						    pred->regex.pattern, +						    pred->regex.len, +						    data); +	} + +	return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; +} + +static int ftrace_function_set_filter(struct perf_event *event, +				      struct event_filter *filter) +{ +	struct function_filter_data data = { +		.first_filter  = 1, +		.first_notrace = 1, +		.ops           = &event->ftrace_ops, +	}; + +	return walk_pred_tree(filter->preds, filter->root, +			      ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, +				      struct event_filter *filter) +{ +	return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ +  int ftrace_profile_set_filter(struct perf_event *event, int event_id,  			      char *filter_str)  {  	int err;  	struct event_filter *filter; -	struct filter_parse_state *ps; -	struct ftrace_event_call *call = NULL; +	struct ftrace_event_call *call;  	mutex_lock(&event_mutex); -	list_for_each_entry(call, &ftrace_events, list) { -		if (call->event.type == event_id) -			break; -	} +	call = event->tp_event;  	err = -EINVAL; -	if (&call->list == &ftrace_events) +	if (!call)  		goto out_unlock;  	err = -EEXIST;  	if (event->filter)  		goto out_unlock; -	filter = __alloc_preds(); -	if (IS_ERR(filter)) { -		err = PTR_ERR(filter); -		goto out_unlock; -	} - -	err = -ENOMEM; -	ps = kzalloc(sizeof(*ps), GFP_KERNEL); -	if (!ps) -		goto free_preds; - -	parse_init(ps, filter_ops, filter_str); -	err = filter_parse(ps); +	err = create_filter(call, filter_str, false, &filter);  	if (err) -		goto free_ps; +		goto free_filter; -	err = replace_preds(call, filter, ps, filter_str, false); -	if (!err) +	if (ftrace_event_is_function(call)) +		err = ftrace_function_set_filter(event, filter); +	else  		event->filter = filter; -free_ps: -	filter_opstack_clear(ps); -	postfix_clear(ps); -	kfree(ps); - -free_preds: -	if (err) -		__free_preds(filter); +free_filter: +	if (err || ftrace_event_is_function(call)) +		__free_filter(filter);  out_unlock:  	mutex_unlock(&event_mutex); @@ -1447,3 +2271,179 @@ out_unlock:  #endif /* CONFIG_PERF_EVENTS */ +#ifdef 
CONFIG_FTRACE_STARTUP_TEST + +#include <linux/types.h> +#include <linux/tracepoint.h> + +#define CREATE_TRACE_POINTS +#include "trace_events_filter_test.h" + +#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ +{ \ +	.filter = FILTER, \ +	.rec    = { .a = va, .b = vb, .c = vc, .d = vd, \ +		    .e = ve, .f = vf, .g = vg, .h = vh }, \ +	.match  = m, \ +	.not_visited = nvisit, \ +} +#define YES 1 +#define NO  0 + +static struct test_filter_data_t { +	char *filter; +	struct ftrace_raw_ftrace_test_filter rec; +	int match; +	char *not_visited; +} test_filter_data[] = { +#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ +	       "e == 1 && f == 1 && g == 1 && h == 1" +	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), +	DATA_REC(NO,  0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), +	DATA_REC(NO,  1, 1, 1, 1, 1, 1, 1, 0, ""), +#undef FILTER +#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ +	       "e == 1 || f == 1 || g == 1 || h == 1" +	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 0, ""), +	DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), +	DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), +#undef FILTER +#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ +	       "(e == 1 || f == 1) && (g == 1 || h == 1)" +	DATA_REC(NO,  0, 0, 1, 1, 1, 1, 1, 1, "dfh"), +	DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), +	DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), +	DATA_REC(NO,  1, 0, 1, 0, 0, 1, 0, 0, "bd"), +#undef FILTER +#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ +	       "(e == 1 && f == 1) || (g == 1 && h == 1)" +	DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), +	DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), +	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 1, ""), +#undef FILTER +#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ +	       "(e == 1 && f == 1) || (g == 1 && h == 1)" +	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), +	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 1, ""), +	DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), +#undef FILTER +#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ +	       "(e == 1 || f == 1)) && (g == 1 || h == 1)" +	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), +	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 0, ""), +	DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), +#undef FILTER +#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ +	       "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" +	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), +	DATA_REC(NO,  0, 1, 0, 1, 0, 1, 0, 1, ""), +	DATA_REC(NO,  1, 0, 1, 0, 1, 0, 1, 0, ""), +#undef FILTER +#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ +	       "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" +	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), +	DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), +	DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), +}; + +#undef DATA_REC +#undef FILTER +#undef YES +#undef NO + +#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) + +static int test_pred_visited; + +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ +	struct ftrace_event_field *field = pred->field; + +	test_pred_visited = 1; +	printk(KERN_INFO "\npred visited %s\n", field->name); +	return 1; +} + +static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, +			     int *err, void *data) +{ +	char *fields = data; + +	if ((move == MOVE_DOWN) && +	    (pred->left == FILTER_PRED_INVALID)) { +		struct ftrace_event_field *field = pred->field; + +		if (!field) { +			WARN(1, "all leafs should have field 
defined"); +			return WALK_PRED_DEFAULT; +		} +		if (!strchr(fields, *field->name)) +			return WALK_PRED_DEFAULT; + +		WARN_ON(!pred->fn); +		pred->fn = test_pred_visited_fn; +	} +	return WALK_PRED_DEFAULT; +} + +static __init int ftrace_test_event_filter(void) +{ +	int i; + +	printk(KERN_INFO "Testing ftrace filter: "); + +	for (i = 0; i < DATA_CNT; i++) { +		struct event_filter *filter = NULL; +		struct test_filter_data_t *d = &test_filter_data[i]; +		int err; + +		err = create_filter(&event_ftrace_test_filter, d->filter, +				    false, &filter); +		if (err) { +			printk(KERN_INFO +			       "Failed to get filter for '%s', err %d\n", +			       d->filter, err); +			__free_filter(filter); +			break; +		} + +		/* +		 * The preemption disabling is not really needed for self +		 * tests, but the rcu dereference will complain without it. +		 */ +		preempt_disable(); +		if (*d->not_visited) +			walk_pred_tree(filter->preds, filter->root, +				       test_walk_pred_cb, +				       d->not_visited); + +		test_pred_visited = 0; +		err = filter_match_preds(filter, &d->rec); +		preempt_enable(); + +		__free_filter(filter); + +		if (test_pred_visited) { +			printk(KERN_INFO +			       "Failed, unwanted pred visited for filter %s\n", +			       d->filter); +			break; +		} + +		if (err != d->match) { +			printk(KERN_INFO +			       "Failed to match filter '%s', expected %d\n", +			       d->filter, d->match); +			break; +		} +	} + +	if (i == DATA_CNT) +		printk(KERN_CONT "OK\n"); + +	return 0; +} + +late_initcall(ftrace_test_event_filter); + +#endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 00000000000..bfd4dba0d60 --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h @@ -0,0 +1,50 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM test + +#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TEST_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(ftrace_test_filter, + +	TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), + +	TP_ARGS(a, b, c, d, e, f, g, h), + +	TP_STRUCT__entry( +		__field(int, a) +		__field(int, b) +		__field(int, c) +		__field(int, d) +		__field(int, e) +		__field(int, f) +		__field(int, g) +		__field(int, h) +	), + +	TP_fast_assign( +		__entry->a = a; +		__entry->b = b; +		__entry->c = c; +		__entry->d = d; +		__entry->e = e; +		__entry->f = f; +		__entry->g = g; +		__entry->h = h; +	), + +	TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", +		  __entry->a, __entry->b, __entry->c, __entry->d, +		  __entry->e, __entry->f, __entry->g, __entry->h) +); + +#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_events_filter_test + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c new file mode 100644 index 00000000000..4747b476a03 --- /dev/null +++ b/kernel/trace/trace_events_trigger.c @@ -0,0 +1,1437 @@ +/* + * trace_events_trigger - trace event triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/slab.h> + +#include "trace.h" + +static LIST_HEAD(trigger_commands); +static DEFINE_MUTEX(trigger_cmd_mutex); + +static void +trigger_data_free(struct event_trigger_data *data) +{ +	if (data->cmd_ops->set_filter) +		data->cmd_ops->set_filter(NULL, data, NULL); + +	synchronize_sched(); /* make sure current triggers exit before free */ +	kfree(data); +} + +/** + * event_triggers_call - Call triggers associated with a trace event + * @file: The ftrace_event_file associated with the event + * @rec: The trace entry for the event, NULL for unconditional invocation + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command.  If rec is + * non-NULL, it means that the trigger requires further processing and + * shouldn't be unconditionally invoked.  If rec is non-NULL and the + * trigger has a filter associated with it, rec will checked against + * the filter and if the record matches the trigger will be invoked. + * If the trigger is a 'post_trigger', meaning it shouldn't be invoked + * in any case until the current event is written, the trigger + * function isn't invoked but the bit associated with the deferred + * trigger is set in the return value. + * + * Returns an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + * + * Return: an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + */ +enum event_trigger_type +event_triggers_call(struct ftrace_event_file *file, void *rec) +{ +	struct event_trigger_data *data; +	enum event_trigger_type tt = ETT_NONE; +	struct event_filter *filter; + +	if (list_empty(&file->triggers)) +		return tt; + +	list_for_each_entry_rcu(data, &file->triggers, list) { +		if (!rec) { +			data->ops->func(data); +			continue; +		} +		filter = rcu_dereference_sched(data->filter); +		if (filter && !filter_match_preds(filter, rec)) +			continue; +		if (data->cmd_ops->post_trigger) { +			tt |= data->cmd_ops->trigger_type; +			continue; +		} +		data->ops->func(data); +	} +	return tt; +} +EXPORT_SYMBOL_GPL(event_triggers_call); + +/** + * event_triggers_post_call - Call 'post_triggers' for a trace event + * @file: The ftrace_event_file associated with the event + * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command, if the + * corresponding bit is set in the tt enum passed into this function. + * See @event_triggers_call for details on how those bits are set. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). 
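Roughly, the two calls are meant to bracket the write of the event record. A condensed sketch of that sequence (the real code lives in the generated event probes and the commit helpers, so this is illustrative only):

	enum event_trigger_type tt = ETT_NONE;

	if (test_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags))
		tt = event_triggers_call(file, entry);	/* filtered/post triggers */

	/* ... reserve, fill and commit the ring buffer event ... */

	if (tt)
		event_triggers_post_call(file, tt);	/* deferred triggers run here */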
+ */ +void +event_triggers_post_call(struct ftrace_event_file *file, +			 enum event_trigger_type tt) +{ +	struct event_trigger_data *data; + +	list_for_each_entry_rcu(data, &file->triggers, list) { +		if (data->cmd_ops->trigger_type & tt) +			data->ops->func(data); +	} +} +EXPORT_SYMBOL_GPL(event_triggers_post_call); + +#define SHOW_AVAILABLE_TRIGGERS	(void *)(1UL) + +static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) +{ +	struct ftrace_event_file *event_file = event_file_data(m->private); + +	if (t == SHOW_AVAILABLE_TRIGGERS) +		return NULL; + +	return seq_list_next(t, &event_file->triggers, pos); +} + +static void *trigger_start(struct seq_file *m, loff_t *pos) +{ +	struct ftrace_event_file *event_file; + +	/* ->stop() is called even if ->start() fails */ +	mutex_lock(&event_mutex); +	event_file = event_file_data(m->private); +	if (unlikely(!event_file)) +		return ERR_PTR(-ENODEV); + +	if (list_empty(&event_file->triggers)) +		return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; + +	return seq_list_start(&event_file->triggers, *pos); +} + +static void trigger_stop(struct seq_file *m, void *t) +{ +	mutex_unlock(&event_mutex); +} + +static int trigger_show(struct seq_file *m, void *v) +{ +	struct event_trigger_data *data; +	struct event_command *p; + +	if (v == SHOW_AVAILABLE_TRIGGERS) { +		seq_puts(m, "# Available triggers:\n"); +		seq_putc(m, '#'); +		mutex_lock(&trigger_cmd_mutex); +		list_for_each_entry_reverse(p, &trigger_commands, list) +			seq_printf(m, " %s", p->name); +		seq_putc(m, '\n'); +		mutex_unlock(&trigger_cmd_mutex); +		return 0; +	} + +	data = list_entry(v, struct event_trigger_data, list); +	data->ops->print(m, data->ops, data); + +	return 0; +} + +static const struct seq_operations event_triggers_seq_ops = { +	.start = trigger_start, +	.next = trigger_next, +	.stop = trigger_stop, +	.show = trigger_show, +}; + +static int event_trigger_regex_open(struct inode *inode, struct file *file) +{ +	int ret = 0; + +	mutex_lock(&event_mutex); + +	if (unlikely(!event_file_data(file))) { +		mutex_unlock(&event_mutex); +		return -ENODEV; +	} + +	if (file->f_mode & FMODE_READ) { +		ret = seq_open(file, &event_triggers_seq_ops); +		if (!ret) { +			struct seq_file *m = file->private_data; +			m->private = file; +		} +	} + +	mutex_unlock(&event_mutex); + +	return ret; +} + +static int trigger_process_regex(struct ftrace_event_file *file, char *buff) +{ +	char *command, *next = buff; +	struct event_command *p; +	int ret = -EINVAL; + +	command = strsep(&next, ": \t"); +	command = (command[0] != '!') ? 
command : command + 1; + +	mutex_lock(&trigger_cmd_mutex); +	list_for_each_entry(p, &trigger_commands, list) { +		if (strcmp(p->name, command) == 0) { +			ret = p->func(p, file, buff, command, next); +			goto out_unlock; +		} +	} + out_unlock: +	mutex_unlock(&trigger_cmd_mutex); + +	return ret; +} + +static ssize_t event_trigger_regex_write(struct file *file, +					 const char __user *ubuf, +					 size_t cnt, loff_t *ppos) +{ +	struct ftrace_event_file *event_file; +	ssize_t ret; +	char *buf; + +	if (!cnt) +		return 0; + +	if (cnt >= PAGE_SIZE) +		return -EINVAL; + +	buf = (char *)__get_free_page(GFP_TEMPORARY); +	if (!buf) +		return -ENOMEM; + +	if (copy_from_user(buf, ubuf, cnt)) { +		free_page((unsigned long)buf); +		return -EFAULT; +	} +	buf[cnt] = '\0'; +	strim(buf); + +	mutex_lock(&event_mutex); +	event_file = event_file_data(file); +	if (unlikely(!event_file)) { +		mutex_unlock(&event_mutex); +		free_page((unsigned long)buf); +		return -ENODEV; +	} +	ret = trigger_process_regex(event_file, buf); +	mutex_unlock(&event_mutex); + +	free_page((unsigned long)buf); +	if (ret < 0) +		goto out; + +	*ppos += cnt; +	ret = cnt; + out: +	return ret; +} + +static int event_trigger_regex_release(struct inode *inode, struct file *file) +{ +	mutex_lock(&event_mutex); + +	if (file->f_mode & FMODE_READ) +		seq_release(inode, file); + +	mutex_unlock(&event_mutex); + +	return 0; +} + +static ssize_t +event_trigger_write(struct file *filp, const char __user *ubuf, +		    size_t cnt, loff_t *ppos) +{ +	return event_trigger_regex_write(filp, ubuf, cnt, ppos); +} + +static int +event_trigger_open(struct inode *inode, struct file *filp) +{ +	return event_trigger_regex_open(inode, filp); +} + +static int +event_trigger_release(struct inode *inode, struct file *file) +{ +	return event_trigger_regex_release(inode, file); +} + +const struct file_operations event_trigger_fops = { +	.open = event_trigger_open, +	.read = seq_read, +	.write = event_trigger_write, +	.llseek = tracing_lseek, +	.release = event_trigger_release, +}; + +/* + * Currently we only register event commands from __init, so mark this + * __init too. + */ +static __init int register_event_command(struct event_command *cmd) +{ +	struct event_command *p; +	int ret = 0; + +	mutex_lock(&trigger_cmd_mutex); +	list_for_each_entry(p, &trigger_commands, list) { +		if (strcmp(cmd->name, p->name) == 0) { +			ret = -EBUSY; +			goto out_unlock; +		} +	} +	list_add(&cmd->list, &trigger_commands); + out_unlock: +	mutex_unlock(&trigger_cmd_mutex); + +	return ret; +} + +/* + * Currently we only unregister event commands from __init, so mark + * this __init too. + */ +static __init int unregister_event_command(struct event_command *cmd) +{ +	struct event_command *p, *n; +	int ret = -ENODEV; + +	mutex_lock(&trigger_cmd_mutex); +	list_for_each_entry_safe(p, n, &trigger_commands, list) { +		if (strcmp(cmd->name, p->name) == 0) { +			ret = 0; +			list_del_init(&p->list); +			goto out_unlock; +		} +	} + out_unlock: +	mutex_unlock(&trigger_cmd_mutex); + +	return ret; +} + +/** + * event_trigger_print - Generic event_trigger_ops @print implementation + * @name: The name of the event trigger + * @m: The seq_file being printed to + * @data: Trigger-specific data + * @filter_str: filter_str to print, if present + * + * Common implementation for event triggers to print themselves. + * + * Usually wrapped by a function that simply sets the @name of the + * trigger command and then invokes this. 
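For reference, the lines this helper produces in an event's 'trigger' file look like the following (the count and the filter expression are arbitrary examples):

	traceon:unlimited
	traceoff:count=5 if common_pid == 1234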
+ * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_print(const char *name, struct seq_file *m, +		    void *data, char *filter_str) +{ +	long count = (long)data; + +	seq_printf(m, "%s", name); + +	if (count == -1) +		seq_puts(m, ":unlimited"); +	else +		seq_printf(m, ":count=%ld", count); + +	if (filter_str) +		seq_printf(m, " if %s\n", filter_str); +	else +		seq_puts(m, "\n"); + +	return 0; +} + +/** + * event_trigger_init - Generic event_trigger_ops @init implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger initialization. + * + * Usually used directly as the @init method in event trigger + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_init(struct event_trigger_ops *ops, +		   struct event_trigger_data *data) +{ +	data->ref++; +	return 0; +} + +/** + * event_trigger_free - Generic event_trigger_ops @free implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger de-initialization. + * + * Usually used directly as the @free method in event trigger + * implementations. + */ +static void +event_trigger_free(struct event_trigger_ops *ops, +		   struct event_trigger_data *data) +{ +	if (WARN_ON_ONCE(data->ref <= 0)) +		return; + +	data->ref--; +	if (!data->ref) +		trigger_data_free(data); +} + +static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, +					      int trigger_enable) +{ +	int ret = 0; + +	if (trigger_enable) { +		if (atomic_inc_return(&file->tm_ref) > 1) +			return ret; +		set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); +		ret = trace_event_enable_disable(file, 1, 1); +	} else { +		if (atomic_dec_return(&file->tm_ref) > 0) +			return ret; +		clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); +		ret = trace_event_enable_disable(file, 0, 1); +	} + +	return ret; +} + +/** + * clear_event_triggers - Clear all triggers associated with a trace array + * @tr: The trace array to clear + * + * For each trigger, the triggering event has its tm_ref decremented + * via trace_event_trigger_enable_disable(), and any associated event + * (in the case of enable/disable_event triggers) will have its sm_ref + * decremented via free()->trace_event_enable_disable().  That + * combination effectively reverses the soft-mode/trigger state added + * by trigger registration. + * + * Must be called with event_mutex held. + */ +void +clear_event_triggers(struct trace_array *tr) +{ +	struct ftrace_event_file *file; + +	list_for_each_entry(file, &tr->events, list) { +		struct event_trigger_data *data; +		list_for_each_entry_rcu(data, &file->triggers, list) { +			trace_event_trigger_enable_disable(file, 0); +			if (data->ops->free) +				data->ops->free(data->ops, data); +		} +	} +} + +/** + * update_cond_flag - Set or reset the TRIGGER_COND bit + * @file: The ftrace_event_file associated with the event + * + * If an event has triggers and any of those triggers has a filter or + * a post_trigger, trigger invocation needs to be deferred until after + * the current event has logged its data, and the event should have + * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be + * cleared. 
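A few concrete cases of that rule, written as trigger strings (the filter shown is an arbitrary example):

	traceon                        -> no filter, not a post trigger: TRIGGER_COND stays clear
	traceon if common_pid == 1234  -> has a filter: TRIGGER_COND is set
	stacktrace                     -> post trigger (runs after the record is written): TRIGGER_COND is set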
+ */ +static void update_cond_flag(struct ftrace_event_file *file) +{ +	struct event_trigger_data *data; +	bool set_cond = false; + +	list_for_each_entry_rcu(data, &file->triggers, list) { +		if (data->filter || data->cmd_ops->post_trigger) { +			set_cond = true; +			break; +		} +	} + +	if (set_cond) +		set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); +	else +		clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); +} + +/** + * register_trigger - Generic event_command @reg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data to associate with the trigger + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger registration. + * + * Usually used directly as the @reg method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int register_trigger(char *glob, struct event_trigger_ops *ops, +			    struct event_trigger_data *data, +			    struct ftrace_event_file *file) +{ +	struct event_trigger_data *test; +	int ret = 0; + +	list_for_each_entry_rcu(test, &file->triggers, list) { +		if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { +			ret = -EEXIST; +			goto out; +		} +	} + +	if (data->ops->init) { +		ret = data->ops->init(data->ops, data); +		if (ret < 0) +			goto out; +	} + +	list_add_rcu(&data->list, &file->triggers); +	ret++; + +	if (trace_event_trigger_enable_disable(file, 1) < 0) { +		list_del_rcu(&data->list); +		ret--; +	} +	update_cond_flag(file); +out: +	return ret; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, +			       struct event_trigger_data *test, +			       struct ftrace_event_file *file) +{ +	struct event_trigger_data *data; +	bool unregistered = false; + +	list_for_each_entry_rcu(data, &file->triggers, list) { +		if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { +			unregistered = true; +			list_del_rcu(&data->list); +			update_cond_flag(file); +			trace_event_trigger_enable_disable(file, 0); +			break; +		} +	} + +	if (unregistered && data->ops->free) +		data->ops->free(data->ops, data); +} + +/** + * event_trigger_callback - Generic event_command @func implementation + * @cmd_ops: The command ops, used for trigger registration + * @file: The ftrace_event_file associated with the event + * @glob: The raw string used to register the trigger + * @cmd: The cmd portion of the string used to register the trigger + * @param: The params portion of the string used to register the trigger + * + * Common implementation for event command parsing and trigger + * instantiation. + * + * Usually used directly as the @func method in event command + * implementations. 
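The trigger strings that reach this callback follow the pattern '<command>[:count] [if <filter>]', with a leading '!' to remove an existing trigger. Some illustrative inputs (the filter is an example only):

	traceoff                          run every time the event fires
	traceoff:5                        run at most five times
	traceoff:5 if common_pid == 1234  same, but only for records matching the filter
	!traceoff                         remove the traceoff trigger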
+ * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_callback(struct event_command *cmd_ops, +		       struct ftrace_event_file *file, +		       char *glob, char *cmd, char *param) +{ +	struct event_trigger_data *trigger_data; +	struct event_trigger_ops *trigger_ops; +	char *trigger = NULL; +	char *number; +	int ret; + +	/* separate the trigger from the filter (t:n [if filter]) */ +	if (param && isdigit(param[0])) +		trigger = strsep(¶m, " \t"); + +	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + +	ret = -ENOMEM; +	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); +	if (!trigger_data) +		goto out; + +	trigger_data->count = -1; +	trigger_data->ops = trigger_ops; +	trigger_data->cmd_ops = cmd_ops; +	INIT_LIST_HEAD(&trigger_data->list); + +	if (glob[0] == '!') { +		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); +		kfree(trigger_data); +		ret = 0; +		goto out; +	} + +	if (trigger) { +		number = strsep(&trigger, ":"); + +		ret = -EINVAL; +		if (!strlen(number)) +			goto out_free; + +		/* +		 * We use the callback data field (which is a pointer) +		 * as our counter. +		 */ +		ret = kstrtoul(number, 0, &trigger_data->count); +		if (ret) +			goto out_free; +	} + +	if (!param) /* if param is non-empty, it's supposed to be a filter */ +		goto out_reg; + +	if (!cmd_ops->set_filter) +		goto out_reg; + +	ret = cmd_ops->set_filter(param, trigger_data, file); +	if (ret < 0) +		goto out_free; + + out_reg: +	ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); +	/* +	 * The above returns on success the # of functions enabled, +	 * but if it didn't find any functions it returns zero. +	 * Consider no functions a failure too. +	 */ +	if (!ret) { +		ret = -ENOENT; +		goto out_free; +	} else if (ret < 0) +		goto out_free; +	ret = 0; + out: +	return ret; + + out_free: +	if (cmd_ops->set_filter) +		cmd_ops->set_filter(NULL, trigger_data, NULL); +	kfree(trigger_data); +	goto out; +} + +/** + * set_trigger_filter - Generic event_command @set_filter implementation + * @filter_str: The filter string for the trigger, NULL to remove filter + * @trigger_data: Trigger-specific data + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event command filter parsing and filter + * instantiation. + * + * Usually used directly as the @set_filter method in event command + * implementations. + * + * Also used to remove a filter (if filter_str = NULL). 
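Note that the filter parsed here is matched against the fields of the event whose 'trigger' file was written, not against the event being acted on. For example (event and field names chosen only for illustration), writing the following to events/sched/sched_switch/trigger enables kmem:kmalloc once, and only when sched_switch's own prev_comm field matches:

	enable_event:kmem:kmalloc:1 if prev_comm == "bash"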
+ * + * Return: 0 on success, errno otherwise + */ +static int set_trigger_filter(char *filter_str, +			      struct event_trigger_data *trigger_data, +			      struct ftrace_event_file *file) +{ +	struct event_trigger_data *data = trigger_data; +	struct event_filter *filter = NULL, *tmp; +	int ret = -EINVAL; +	char *s; + +	if (!filter_str) /* clear the current filter */ +		goto assign; + +	s = strsep(&filter_str, " \t"); + +	if (!strlen(s) || strcmp(s, "if") != 0) +		goto out; + +	if (!filter_str) +		goto out; + +	/* The filter is for the 'trigger' event, not the triggered event */ +	ret = create_event_filter(file->event_call, filter_str, false, &filter); +	if (ret) +		goto out; + assign: +	tmp = rcu_access_pointer(data->filter); + +	rcu_assign_pointer(data->filter, filter); + +	if (tmp) { +		/* Make sure the call is done with the filter */ +		synchronize_sched(); +		free_event_filter(tmp); +	} + +	kfree(data->filter_str); +	data->filter_str = NULL; + +	if (filter_str) { +		data->filter_str = kstrdup(filter_str, GFP_KERNEL); +		if (!data->filter_str) { +			free_event_filter(rcu_access_pointer(data->filter)); +			data->filter = NULL; +			ret = -ENOMEM; +		} +	} + out: +	return ret; +} + +static void +traceon_trigger(struct event_trigger_data *data) +{ +	if (tracing_is_on()) +		return; + +	tracing_on(); +} + +static void +traceon_count_trigger(struct event_trigger_data *data) +{ +	if (tracing_is_on()) +		return; + +	if (!data->count) +		return; + +	if (data->count != -1) +		(data->count)--; + +	tracing_on(); +} + +static void +traceoff_trigger(struct event_trigger_data *data) +{ +	if (!tracing_is_on()) +		return; + +	tracing_off(); +} + +static void +traceoff_count_trigger(struct event_trigger_data *data) +{ +	if (!tracing_is_on()) +		return; + +	if (!data->count) +		return; + +	if (data->count != -1) +		(data->count)--; + +	tracing_off(); +} + +static int +traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, +		      struct event_trigger_data *data) +{ +	return event_trigger_print("traceon", m, (void *)data->count, +				   data->filter_str); +} + +static int +traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, +		       struct event_trigger_data *data) +{ +	return event_trigger_print("traceoff", m, (void *)data->count, +				   data->filter_str); +} + +static struct event_trigger_ops traceon_trigger_ops = { +	.func			= traceon_trigger, +	.print			= traceon_trigger_print, +	.init			= event_trigger_init, +	.free			= event_trigger_free, +}; + +static struct event_trigger_ops traceon_count_trigger_ops = { +	.func			= traceon_count_trigger, +	.print			= traceon_trigger_print, +	.init			= event_trigger_init, +	.free			= event_trigger_free, +}; + +static struct event_trigger_ops traceoff_trigger_ops = { +	.func			= traceoff_trigger, +	.print			= traceoff_trigger_print, +	.init			= event_trigger_init, +	.free			= event_trigger_free, +}; + +static struct event_trigger_ops traceoff_count_trigger_ops = { +	.func			= traceoff_count_trigger, +	.print			= traceoff_trigger_print, +	.init			= event_trigger_init, +	.free			= event_trigger_free, +}; + +static struct event_trigger_ops * +onoff_get_trigger_ops(char *cmd, char *param) +{ +	struct event_trigger_ops *ops; + +	/* we register both traceon and traceoff to this callback */ +	if (strcmp(cmd, "traceon") == 0) +		ops = param ? &traceon_count_trigger_ops : +			&traceon_trigger_ops; +	else +		ops = param ? 
&traceoff_count_trigger_ops : +			&traceoff_trigger_ops; + +	return ops; +} + +static struct event_command trigger_traceon_cmd = { +	.name			= "traceon", +	.trigger_type		= ETT_TRACE_ONOFF, +	.func			= event_trigger_callback, +	.reg			= register_trigger, +	.unreg			= unregister_trigger, +	.get_trigger_ops	= onoff_get_trigger_ops, +	.set_filter		= set_trigger_filter, +}; + +static struct event_command trigger_traceoff_cmd = { +	.name			= "traceoff", +	.trigger_type		= ETT_TRACE_ONOFF, +	.func			= event_trigger_callback, +	.reg			= register_trigger, +	.unreg			= unregister_trigger, +	.get_trigger_ops	= onoff_get_trigger_ops, +	.set_filter		= set_trigger_filter, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static void +snapshot_trigger(struct event_trigger_data *data) +{ +	tracing_snapshot(); +} + +static void +snapshot_count_trigger(struct event_trigger_data *data) +{ +	if (!data->count) +		return; + +	if (data->count != -1) +		(data->count)--; + +	snapshot_trigger(data); +} + +static int +register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, +			  struct event_trigger_data *data, +			  struct ftrace_event_file *file) +{ +	int ret = register_trigger(glob, ops, data, file); + +	if (ret > 0 && tracing_alloc_snapshot() != 0) { +		unregister_trigger(glob, ops, data, file); +		ret = 0; +	} + +	return ret; +} + +static int +snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, +		       struct event_trigger_data *data) +{ +	return event_trigger_print("snapshot", m, (void *)data->count, +				   data->filter_str); +} + +static struct event_trigger_ops snapshot_trigger_ops = { +	.func			= snapshot_trigger, +	.print			= snapshot_trigger_print, +	.init			= event_trigger_init, +	.free			= event_trigger_free, +}; + +static struct event_trigger_ops snapshot_count_trigger_ops = { +	.func			= snapshot_count_trigger, +	.print			= snapshot_trigger_print, +	.init			= event_trigger_init, +	.free			= event_trigger_free, +}; + +static struct event_trigger_ops * +snapshot_get_trigger_ops(char *cmd, char *param) +{ +	return param ? 
&snapshot_count_trigger_ops : &snapshot_trigger_ops; +} + +static struct event_command trigger_snapshot_cmd = { +	.name			= "snapshot", +	.trigger_type		= ETT_SNAPSHOT, +	.func			= event_trigger_callback, +	.reg			= register_snapshot_trigger, +	.unreg			= unregister_trigger, +	.get_trigger_ops	= snapshot_get_trigger_ops, +	.set_filter		= set_trigger_filter, +}; + +static __init int register_trigger_snapshot_cmd(void) +{ +	int ret; + +	ret = register_event_command(&trigger_snapshot_cmd); +	WARN_ON(ret < 0); + +	return ret; +} +#else +static __init int register_trigger_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ + +#ifdef CONFIG_STACKTRACE +/* + * Skip 3: + *   stacktrace_trigger() + *   event_triggers_post_call() + *   ftrace_raw_event_xxx() + */ +#define STACK_SKIP 3 + +static void +stacktrace_trigger(struct event_trigger_data *data) +{ +	trace_dump_stack(STACK_SKIP); +} + +static void +stacktrace_count_trigger(struct event_trigger_data *data) +{ +	if (!data->count) +		return; + +	if (data->count != -1) +		(data->count)--; + +	stacktrace_trigger(data); +} + +static int +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, +			 struct event_trigger_data *data) +{ +	return event_trigger_print("stacktrace", m, (void *)data->count, +				   data->filter_str); +} + +static struct event_trigger_ops stacktrace_trigger_ops = { +	.func			= stacktrace_trigger, +	.print			= stacktrace_trigger_print, +	.init			= event_trigger_init, +	.free			= event_trigger_free, +}; + +static struct event_trigger_ops stacktrace_count_trigger_ops = { +	.func			= stacktrace_count_trigger, +	.print			= stacktrace_trigger_print, +	.init			= event_trigger_init, +	.free			= event_trigger_free, +}; + +static struct event_trigger_ops * +stacktrace_get_trigger_ops(char *cmd, char *param) +{ +	return param ? 
&stacktrace_count_trigger_ops : &stacktrace_trigger_ops; +} + +static struct event_command trigger_stacktrace_cmd = { +	.name			= "stacktrace", +	.trigger_type		= ETT_STACKTRACE, +	.post_trigger		= true, +	.func			= event_trigger_callback, +	.reg			= register_trigger, +	.unreg			= unregister_trigger, +	.get_trigger_ops	= stacktrace_get_trigger_ops, +	.set_filter		= set_trigger_filter, +}; + +static __init int register_trigger_stacktrace_cmd(void) +{ +	int ret; + +	ret = register_event_command(&trigger_stacktrace_cmd); +	WARN_ON(ret < 0); + +	return ret; +} +#else +static __init int register_trigger_stacktrace_cmd(void) { return 0; } +#endif /* CONFIG_STACKTRACE */ + +static __init void unregister_trigger_traceon_traceoff_cmds(void) +{ +	unregister_event_command(&trigger_traceon_cmd); +	unregister_event_command(&trigger_traceoff_cmd); +} + +/* Avoid typos */ +#define ENABLE_EVENT_STR	"enable_event" +#define DISABLE_EVENT_STR	"disable_event" + +struct enable_trigger_data { +	struct ftrace_event_file	*file; +	bool				enable; +}; + +static void +event_enable_trigger(struct event_trigger_data *data) +{ +	struct enable_trigger_data *enable_data = data->private_data; + +	if (enable_data->enable) +		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); +	else +		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); +} + +static void +event_enable_count_trigger(struct event_trigger_data *data) +{ +	struct enable_trigger_data *enable_data = data->private_data; + +	if (!data->count) +		return; + +	/* Skip if the event is in a state we want to switch to */ +	if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) +		return; + +	if (data->count != -1) +		(data->count)--; + +	event_enable_trigger(data); +} + +static int +event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, +			   struct event_trigger_data *data) +{ +	struct enable_trigger_data *enable_data = data->private_data; + +	seq_printf(m, "%s:%s:%s", +		   enable_data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, +		   enable_data->file->event_call->class->system, +		   ftrace_event_name(enable_data->file->event_call)); + +	if (data->count == -1) +		seq_puts(m, ":unlimited"); +	else +		seq_printf(m, ":count=%ld", data->count); + +	if (data->filter_str) +		seq_printf(m, " if %s\n", data->filter_str); +	else +		seq_puts(m, "\n"); + +	return 0; +} + +static void +event_enable_trigger_free(struct event_trigger_ops *ops, +			  struct event_trigger_data *data) +{ +	struct enable_trigger_data *enable_data = data->private_data; + +	if (WARN_ON_ONCE(data->ref <= 0)) +		return; + +	data->ref--; +	if (!data->ref) { +		/* Remove the SOFT_MODE flag */ +		trace_event_enable_disable(enable_data->file, 0, 1); +		module_put(enable_data->file->event_call->mod); +		trigger_data_free(data); +		kfree(enable_data); +	} +} + +static struct event_trigger_ops event_enable_trigger_ops = { +	.func			= event_enable_trigger, +	.print			= event_enable_trigger_print, +	.init			= event_trigger_init, +	.free			= event_enable_trigger_free, +}; + +static struct event_trigger_ops event_enable_count_trigger_ops = { +	.func			= event_enable_count_trigger, +	.print			= event_enable_trigger_print, +	.init			= event_trigger_init, +	.free			= event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_trigger_ops = { +	.func			= event_enable_trigger, +	.print			= event_enable_trigger_print, +	.init			= event_trigger_init, +	.free			= event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_count_trigger_ops = { +	.func			= event_enable_count_trigger, +	.print			= event_enable_trigger_print, +	.init			= event_trigger_init, +	.free			= event_enable_trigger_free, +}; + +static int +event_enable_trigger_func(struct event_command *cmd_ops, +			  struct ftrace_event_file *file, +			  char *glob, char *cmd, char *param) +{ +	struct ftrace_event_file *event_enable_file; +	struct enable_trigger_data *enable_data; +	struct event_trigger_data *trigger_data; +	struct event_trigger_ops *trigger_ops; +	struct trace_array *tr = file->tr; +	const char *system; +	const char *event; +	char *trigger; +	char *number; +	bool enable; +	int ret; + +	if (!param) +		return -EINVAL; + +	/* separate the trigger from the filter (s:e:n [if filter]) */ +	trigger = strsep(¶m, " \t"); +	if (!trigger) +		return -EINVAL; + +	system = strsep(&trigger, ":"); +	if (!trigger) +		return -EINVAL; + +	event = strsep(&trigger, ":"); + +	ret = -EINVAL; +	event_enable_file = find_event_file(tr, system, event); +	if (!event_enable_file) +		goto out; + +	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + +	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + +	ret = -ENOMEM; +	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); +	if (!trigger_data) +		goto out; + +	enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); +	if (!enable_data) { +		kfree(trigger_data); +		goto out; +	} + +	trigger_data->count = -1; +	trigger_data->ops = trigger_ops; +	trigger_data->cmd_ops = cmd_ops; +	INIT_LIST_HEAD(&trigger_data->list); +	RCU_INIT_POINTER(trigger_data->filter, NULL); + +	enable_data->enable = enable; +	enable_data->file = event_enable_file; +	trigger_data->private_data = enable_data; + +	if (glob[0] == '!') { +		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); +		kfree(trigger_data); +		kfree(enable_data); +		ret = 0; +		goto out; +	} + +	if (trigger) { +		number = strsep(&trigger, ":"); + +		ret = -EINVAL; +		if (!strlen(number)) +			goto out_free; + +		/* +		 * We use the callback data 
field (which is a pointer) +		 * as our counter. +		 */ +		ret = kstrtoul(number, 0, &trigger_data->count); +		if (ret) +			goto out_free; +	} + +	if (!param) /* if param is non-empty, it's supposed to be a filter */ +		goto out_reg; + +	if (!cmd_ops->set_filter) +		goto out_reg; + +	ret = cmd_ops->set_filter(param, trigger_data, file); +	if (ret < 0) +		goto out_free; + + out_reg: +	/* Don't let event modules unload while probe registered */ +	ret = try_module_get(event_enable_file->event_call->mod); +	if (!ret) { +		ret = -EBUSY; +		goto out_free; +	} + +	ret = trace_event_enable_disable(event_enable_file, 1, 1); +	if (ret < 0) +		goto out_put; +	ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); +	/* +	 * The above returns on success the # of functions enabled, +	 * but if it didn't find any functions it returns zero. +	 * Consider no functions a failure too. +	 */ +	if (!ret) { +		ret = -ENOENT; +		goto out_disable; +	} else if (ret < 0) +		goto out_disable; +	/* Just return zero, not the number of enabled functions */ +	ret = 0; + out: +	return ret; + + out_disable: +	trace_event_enable_disable(event_enable_file, 0, 1); + out_put: +	module_put(event_enable_file->event_call->mod); + out_free: +	if (cmd_ops->set_filter) +		cmd_ops->set_filter(NULL, trigger_data, NULL); +	kfree(trigger_data); +	kfree(enable_data); +	goto out; +} + +static int event_enable_register_trigger(char *glob, +					 struct event_trigger_ops *ops, +					 struct event_trigger_data *data, +					 struct ftrace_event_file *file) +{ +	struct enable_trigger_data *enable_data = data->private_data; +	struct enable_trigger_data *test_enable_data; +	struct event_trigger_data *test; +	int ret = 0; + +	list_for_each_entry_rcu(test, &file->triggers, list) { +		test_enable_data = test->private_data; +		if (test_enable_data && +		    (test_enable_data->file == enable_data->file)) { +			ret = -EEXIST; +			goto out; +		} +	} + +	if (data->ops->init) { +		ret = data->ops->init(data->ops, data); +		if (ret < 0) +			goto out; +	} + +	list_add_rcu(&data->list, &file->triggers); +	ret++; + +	if (trace_event_trigger_enable_disable(file, 1) < 0) { +		list_del_rcu(&data->list); +		ret--; +	} +	update_cond_flag(file); +out: +	return ret; +} + +static void event_enable_unregister_trigger(char *glob, +					    struct event_trigger_ops *ops, +					    struct event_trigger_data *test, +					    struct ftrace_event_file *file) +{ +	struct enable_trigger_data *test_enable_data = test->private_data; +	struct enable_trigger_data *enable_data; +	struct event_trigger_data *data; +	bool unregistered = false; + +	list_for_each_entry_rcu(data, &file->triggers, list) { +		enable_data = data->private_data; +		if (enable_data && +		    (enable_data->file == test_enable_data->file)) { +			unregistered = true; +			list_del_rcu(&data->list); +			update_cond_flag(file); +			trace_event_trigger_enable_disable(file, 0); +			break; +		} +	} + +	if (unregistered && data->ops->free) +		data->ops->free(data->ops, data); +} + +static struct event_trigger_ops * +event_enable_get_trigger_ops(char *cmd, char *param) +{ +	struct event_trigger_ops *ops; +	bool enable; + +	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + +	if (enable) +		ops = param ? &event_enable_count_trigger_ops : +			&event_enable_trigger_ops; +	else +		ops = param ? 
&event_disable_count_trigger_ops : +			&event_disable_trigger_ops; + +	return ops; +} + +static struct event_command trigger_enable_cmd = { +	.name			= ENABLE_EVENT_STR, +	.trigger_type		= ETT_EVENT_ENABLE, +	.func			= event_enable_trigger_func, +	.reg			= event_enable_register_trigger, +	.unreg			= event_enable_unregister_trigger, +	.get_trigger_ops	= event_enable_get_trigger_ops, +	.set_filter		= set_trigger_filter, +}; + +static struct event_command trigger_disable_cmd = { +	.name			= DISABLE_EVENT_STR, +	.trigger_type		= ETT_EVENT_ENABLE, +	.func			= event_enable_trigger_func, +	.reg			= event_enable_register_trigger, +	.unreg			= event_enable_unregister_trigger, +	.get_trigger_ops	= event_enable_get_trigger_ops, +	.set_filter		= set_trigger_filter, +}; + +static __init void unregister_trigger_enable_disable_cmds(void) +{ +	unregister_event_command(&trigger_enable_cmd); +	unregister_event_command(&trigger_disable_cmd); +} + +static __init int register_trigger_enable_disable_cmds(void) +{ +	int ret; + +	ret = register_event_command(&trigger_enable_cmd); +	if (WARN_ON(ret < 0)) +		return ret; +	ret = register_event_command(&trigger_disable_cmd); +	if (WARN_ON(ret < 0)) +		unregister_trigger_enable_disable_cmds(); + +	return ret; +} + +static __init int register_trigger_traceon_traceoff_cmds(void) +{ +	int ret; + +	ret = register_event_command(&trigger_traceon_cmd); +	if (WARN_ON(ret < 0)) +		return ret; +	ret = register_event_command(&trigger_traceoff_cmd); +	if (WARN_ON(ret < 0)) +		unregister_trigger_traceon_traceoff_cmds(); + +	return ret; +} + +__init int register_trigger_cmds(void) +{ +	register_trigger_traceon_traceoff_cmds(); +	register_trigger_snapshot_cmd(); +	register_trigger_stacktrace_cmd(); +	register_trigger_enable_disable_cmds(); + +	return 0; +} diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac2..d4ddde28a81 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,6 +18,16 @@  #undef TRACE_SYSTEM  #define TRACE_SYSTEM	ftrace +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accesible via perf. + */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ +			 filter, regfn) \ +	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter) +  /* not needed for this file */  #undef __field_struct  #define __field_struct(type, item) @@ -44,21 +54,22 @@  #define F_printk(fmt, args...) 
fmt, args  #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\ -struct ____ftrace_##name {					\ -	tstruct							\ -};								\ -static void __always_unused ____ftrace_check_##name(void)	\ -{								\ -	struct ____ftrace_##name *__entry = NULL;		\ -								\ -	/* force compile-time check on F_printk() */		\ -	printk(print);						\ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\ +struct ____ftrace_##name {						\ +	tstruct								\ +};									\ +static void __always_unused ____ftrace_check_##name(void)		\ +{									\ +	struct ____ftrace_##name *__entry = NULL;			\ +									\ +	/* force compile-time check on F_printk() */			\ +	printk(print);							\  }  #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print)	\ -	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter)	\ +	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter)  #include "trace_entries.h" @@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void)	\  	ret = trace_define_field(event_call, #type, #item,		\  				 offsetof(typeof(field), item),		\  				 sizeof(field.item),			\ -				 is_signed_type(type), FILTER_OTHER);	\ +				 is_signed_type(type), filter_type);	\  	if (ret)							\  		return ret; @@ -77,19 +88,22 @@ static void __always_unused ____ftrace_check_##name(void)	\  				 offsetof(typeof(field),		\  					  container.item),		\  				 sizeof(field.container.item),		\ -				 is_signed_type(type), FILTER_OTHER);	\ +				 is_signed_type(type), filter_type);	\  	if (ret)							\  		return ret;  #undef __array  #define __array(type, item, len)					\ -	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\ -	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\ +	do {								\ +		char *type_str = #type"["__stringify(len)"]";		\ +		BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);			\ +		ret = trace_define_field(event_call, type_str, #item,	\  				 offsetof(typeof(field), item),		\  				 sizeof(field.item),			\ -				 is_signed_type(type), FILTER_OTHER);	\ -	if (ret)							\ -		return ret; +				 is_signed_type(type), filter_type);	\ +		if (ret)						\ +			return ret;					\ +	} while (0);  #undef __array_desc  #define __array_desc(type, container, item, len)			\ @@ -98,7 +112,7 @@ static void __always_unused ____ftrace_check_##name(void)	\  				 offsetof(typeof(field),		\  					  container.item),		\  				 sizeof(field.container.item),		\ -				 is_signed_type(type), FILTER_OTHER);	\ +				 is_signed_type(type), filter_type);	\  	if (ret)							\  		return ret; @@ -106,17 +120,18 @@ static void __always_unused ____ftrace_check_##name(void)	\  #define __dynamic_array(type, item)					\  	ret = trace_define_field(event_call, #type, #item,		\  				 offsetof(typeof(field), item),		\ -				 0, is_signed_type(type), FILTER_OTHER);\ +				 0, is_signed_type(type), filter_type);\  	if (ret)							\  		return ret;  #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)		\ -int									\ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\ +static int __init							\  ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  {									\  	struct struct_name field;					\  	int ret;							\ +	int filter_type = filter;					\  									\  	tstruct;							\  									\ @@ -144,24 +159,39 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  #define __dynamic_array(type, 
item)  #undef F_printk -#define F_printk(fmt, args...) #fmt ", "  __stringify(args) +#define F_printk(fmt, args...) __stringify(fmt) ", "  __stringify(args) -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print)		\ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ +			 regfn)						\  									\ -struct ftrace_event_class event_class_ftrace_##call = {			\ +struct ftrace_event_class __refdata event_class_ftrace_##call = {	\  	.system			= __stringify(TRACE_SYSTEM),		\  	.define_fields		= ftrace_define_fields_##call,		\  	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ +	.reg			= regfn,				\  };									\  									\ -struct ftrace_event_call __used						\ -__attribute__((__aligned__(4)))						\ -__attribute__((section("_ftrace_events"))) event_##call = {		\ -	.name			= #call,				\ -	.event.type		= etype,				\ +struct ftrace_event_call __used event_##call = {			\  	.class			= &event_class_ftrace_##call,		\ +	{								\ +		.name			= #call,			\ +	},								\ +	.event.type		= etype,				\  	.print_fmt		= print,				\ +	.flags			= TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \  };									\ +struct ftrace_event_call __used						\ +__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter)	\ +	FTRACE_ENTRY_REG(call, struct_name, etype,			\ +			 PARAMS(tstruct), PARAMS(print), filter, NULL) + +int ftrace_event_is_function(struct ftrace_event_call *call) +{ +	return call == &event_function; +}  #include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 16aee4d44e8..57f0ec962d2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -7,115 +7,164 @@   * Based on code from the latency_tracer, that is:   *   *  Copyright (C) 2004-2006 Ingo Molnar - *  Copyright (C) 2004 William Lee Irwin III + *  Copyright (C) 2004 Nadia Yvette Chambers   */  #include <linux/ring_buffer.h>  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/ftrace.h> +#include <linux/slab.h>  #include <linux/fs.h>  #include "trace.h" -/* function tracing enabled */ -static int			ftrace_function_enabled; +static void tracing_start_function_trace(struct trace_array *tr); +static void tracing_stop_function_trace(struct trace_array *tr); +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, +		    struct ftrace_ops *op, struct pt_regs *pt_regs); +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, +			  struct ftrace_ops *op, struct pt_regs *pt_regs); +static struct tracer_flags func_flags; + +/* Our option */ +enum { +	TRACE_FUNC_OPT_STACK	= 0x1, +}; + +static int allocate_ftrace_ops(struct trace_array *tr) +{ +	struct ftrace_ops *ops; + +	ops = kzalloc(sizeof(*ops), GFP_KERNEL); +	if (!ops) +		return -ENOMEM; -static struct trace_array	*func_trace; +	/* Currently only the non stack verision is supported */ +	ops->func = function_trace_call; +	ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; -static void tracing_start_function_trace(void); -static void tracing_stop_function_trace(void); +	tr->ops = ops; +	ops->private = tr; +	return 0; +} + + +int ftrace_create_function_files(struct trace_array *tr, +				 struct dentry *parent) +{ +	int ret; + +	/* +	 * The top level array uses the "global_ops", and the files are +	 * created on boot up. 
+	 */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) +		return 0; + +	ret = allocate_ftrace_ops(tr); +	if (ret) +		return ret; + +	ftrace_create_filter_files(tr->ops, parent); + +	return 0; +} + +void ftrace_destroy_function_files(struct trace_array *tr) +{ +	ftrace_destroy_filter_files(tr->ops); +	kfree(tr->ops); +	tr->ops = NULL; +}  static int function_trace_init(struct trace_array *tr)  { -	func_trace = tr; -	tr->cpu = get_cpu(); +	ftrace_func_t func; + +	/* +	 * Instance trace_arrays get their ops allocated +	 * at instance creation. Unless it failed +	 * the allocation. +	 */ +	if (!tr->ops) +		return -ENOMEM; + +	/* Currently only the global instance can do stack tracing */ +	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && +	    func_flags.val & TRACE_FUNC_OPT_STACK) +		func = function_stack_trace_call; +	else +		func = function_trace_call; + +	ftrace_init_array_ops(tr, func); + +	tr->trace_buffer.cpu = get_cpu();  	put_cpu();  	tracing_start_cmdline_record(); -	tracing_start_function_trace(); +	tracing_start_function_trace(tr);  	return 0;  }  static void function_trace_reset(struct trace_array *tr)  { -	tracing_stop_function_trace(); +	tracing_stop_function_trace(tr);  	tracing_stop_cmdline_record(); +	ftrace_reset_array_ops(tr);  }  static void function_trace_start(struct trace_array *tr)  { -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  }  static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +function_trace_call(unsigned long ip, unsigned long parent_ip, +		    struct ftrace_ops *op, struct pt_regs *pt_regs)  { -	struct trace_array *tr = func_trace; +	struct trace_array *tr = op->private;  	struct trace_array_cpu *data;  	unsigned long flags; -	long disabled; +	int bit;  	int cpu;  	int pc; -	if (unlikely(!ftrace_function_enabled)) +	if (unlikely(!tr->function_enabled))  		return;  	pc = preempt_count();  	preempt_disable_notrace(); -	local_save_flags(flags); -	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disabled = atomic_inc_return(&data->disabled); - -	if (likely(disabled == 1)) -		trace_function(tr, ip, parent_ip, flags, pc); - -	atomic_dec(&data->disabled); -	preempt_enable_notrace(); -} - -static void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ -	struct trace_array *tr = func_trace; -	struct trace_array_cpu *data; -	unsigned long flags; -	long disabled; -	int cpu; -	int pc; - -	if (unlikely(!ftrace_function_enabled)) -		return; -	/* -	 * Need to use raw, since this must be called before the -	 * recursive protection is performed. 
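The replacement function_trace_call() just below drops the irqs-off/atomic-counter dance referred to in the comment above: it recovers its trace instance from op->private (set up by allocate_ftrace_ops() earlier in this hunk) and protects itself with trace_test_and_set_recursion()/trace_clear_recursion(). A minimal userspace sketch of that shape, with invented names (my_ops, my_instance) and a plain flag standing in for the kernel's per-CPU, per-context recursion bits:

    #include <stdio.h>
    #include <stdlib.h>

    struct my_ops {
            void (*func)(unsigned long ip, struct my_ops *op);
            void *private;          /* back-pointer to the owning instance */
    };

    struct my_instance {
            const char *name;
            int enabled;
    };

    static int in_callback;         /* stand-in for the per-context recursion bit */

    static void my_trace_call(unsigned long ip, struct my_ops *op)
    {
            struct my_instance *inst = op->private;  /* like tr = op->private above */

            if (!inst->enabled)
                    return;
            if (in_callback)        /* trace_test_and_set_recursion() analogue */
                    return;
            in_callback = 1;

            printf("[%s] hit %#lx\n", inst->name, ip);

            in_callback = 0;        /* trace_clear_recursion() analogue */
    }

    int main(void)
    {
            struct my_instance inst = { .name = "instance0", .enabled = 1 };
            struct my_ops *ops = calloc(1, sizeof(*ops));

            if (!ops)
                    return 1;
            ops->func = my_trace_call;
            ops->private = &inst;   /* allocate_ftrace_ops() stashes the trace_array here */

            ops->func(0x1234, ops);
            free(ops);
            return 0;
    }

The point of the back-pointer is that one callback body can serve any number of trace instances without relying on a single global trace_array.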
-	 */ -	local_irq_save(flags); -	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; -	disabled = atomic_inc_return(&data->disabled); +	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); +	if (bit < 0) +		goto out; -	if (likely(disabled == 1)) { -		pc = preempt_count(); +	cpu = smp_processor_id(); +	data = per_cpu_ptr(tr->trace_buffer.data, cpu); +	if (!atomic_read(&data->disabled)) { +		local_save_flags(flags);  		trace_function(tr, ip, parent_ip, flags, pc);  	} +	trace_clear_recursion(bit); -	atomic_dec(&data->disabled); -	local_irq_restore(flags); + out: +	preempt_enable_notrace();  }  static void -function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, +			  struct ftrace_ops *op, struct pt_regs *pt_regs)  { -	struct trace_array *tr = func_trace; +	struct trace_array *tr = op->private;  	struct trace_array_cpu *data;  	unsigned long flags;  	long disabled;  	int cpu;  	int pc; -	if (unlikely(!ftrace_function_enabled)) +	if (unlikely(!tr->function_enabled))  		return;  	/* @@ -124,7 +173,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)  	 */  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) { @@ -145,22 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)  	local_irq_restore(flags);  } - -static struct ftrace_ops trace_ops __read_mostly = -{ -	.func = function_trace_call, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ -	.func = function_stack_trace_call, -}; - -/* Our two options */ -enum { -	TRACE_FUNC_OPT_STACK = 0x1, -}; -  static struct tracer_opt func_opts[] = {  #ifdef CONFIG_STACKTRACE  	{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, @@ -173,129 +206,159 @@ static struct tracer_flags func_flags = {  	.opts = func_opts  }; -static void tracing_start_function_trace(void) +static void tracing_start_function_trace(struct trace_array *tr)  { -	ftrace_function_enabled = 0; - -	if (trace_flags & TRACE_ITER_PREEMPTONLY) -		trace_ops.func = function_trace_call_preempt_only; -	else -		trace_ops.func = function_trace_call; - -	if (func_flags.val & TRACE_FUNC_OPT_STACK) -		register_ftrace_function(&trace_stack_ops); -	else -		register_ftrace_function(&trace_ops); - -	ftrace_function_enabled = 1; +	tr->function_enabled = 0; +	register_ftrace_function(tr->ops); +	tr->function_enabled = 1;  } -static void tracing_stop_function_trace(void) +static void tracing_stop_function_trace(struct trace_array *tr)  { -	ftrace_function_enabled = 0; - -	if (func_flags.val & TRACE_FUNC_OPT_STACK) -		unregister_ftrace_function(&trace_stack_ops); -	else -		unregister_ftrace_function(&trace_ops); +	tr->function_enabled = 0; +	unregister_ftrace_function(tr->ops);  } -static int func_set_flag(u32 old_flags, u32 bit, int set) +static int +func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  { -	if (bit == TRACE_FUNC_OPT_STACK) { +	switch (bit) { +	case TRACE_FUNC_OPT_STACK:  		/* do nothing if already set */  		if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) -			return 0; +			break; + +		unregister_ftrace_function(tr->ops);  		if (set) { -			unregister_ftrace_function(&trace_ops); -			register_ftrace_function(&trace_stack_ops); +			tr->ops->func = function_stack_trace_call; +			register_ftrace_function(tr->ops);  		} else { -			
unregister_ftrace_function(&trace_stack_ops); -			register_ftrace_function(&trace_ops); +			tr->ops->func = function_trace_call; +			register_ftrace_function(tr->ops);  		} -		return 0; +		break; +	default: +		return -EINVAL;  	} -	return -EINVAL; +	return 0;  } -static struct tracer function_trace __read_mostly = +static struct tracer function_trace __tracer_data =  {  	.name		= "function",  	.init		= function_trace_init,  	.reset		= function_trace_reset,  	.start		= function_trace_start, -	.wait_pipe	= poll_wait_pipe,  	.flags		= &func_flags,  	.set_flag	= func_set_flag, +	.allow_instances = true,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest	= trace_selftest_startup_function,  #endif  };  #ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +static int update_count(void **data)  { -	long *count = (long *)data; - -	if (tracing_is_on()) -		return; +	unsigned long *count = (long *)data;  	if (!*count) -		return; +		return 0;  	if (*count != -1)  		(*count)--; -	tracing_on(); +	return 1;  }  static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)  { -	long *count = (long *)data; +	if (tracing_is_on()) +		return; +	if (update_count(data)) +		tracing_on(); +} + +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +{  	if (!tracing_is_on())  		return; -	if (!*count) +	if (update_count(data)) +		tracing_off(); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (tracing_is_on())  		return; -	if (*count != -1) -		(*count)--; +	tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (!tracing_is_on()) +		return;  	tracing_off();  } -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, -			 struct ftrace_probe_ops *ops, void *data); +/* + * Skip 4: + *   ftrace_stacktrace() + *   function_trace_probe_call() + *   ftrace_ops_list_func() + *   ftrace_call() + */ +#define STACK_SKIP 4 -static struct ftrace_probe_ops traceon_probe_ops = { -	.func			= ftrace_traceon, -	.print			= ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ +	trace_dump_stack(STACK_SKIP); +} -static struct ftrace_probe_ops traceoff_probe_ops = { -	.func			= ftrace_traceoff, -	.print			= ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (!tracing_is_on()) +		return; + +	if (update_count(data)) +		trace_dump_stack(STACK_SKIP); +} + +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (update_count(data)) +		ftrace_dump(DUMP_ALL); +} + +/* Only dump the current CPU buffer. 
*/ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (update_count(data)) +		ftrace_dump(DUMP_ORIG); +}  static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, -			 struct ftrace_probe_ops *ops, void *data) +ftrace_probe_print(const char *name, struct seq_file *m, +		   unsigned long ip, void *data)  {  	long count = (long)data; -	seq_printf(m, "%ps:", (void *)ip); - -	if (ops == &traceon_probe_ops) -		seq_printf(m, "traceon"); -	else -		seq_printf(m, "traceoff"); +	seq_printf(m, "%ps:%s", (void *)ip, name);  	if (count == -1)  		seq_printf(m, ":unlimited\n"); @@ -306,25 +369,85 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,  }  static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +ftrace_traceon_print(struct seq_file *m, unsigned long ip, +			 struct ftrace_probe_ops *ops, void *data)  { -	struct ftrace_probe_ops *ops; +	return ftrace_probe_print("traceon", m, ip, data); +} -	/* we register both traceon and traceoff to this callback */ -	if (strcmp(cmd, "traceon") == 0) -		ops = &traceon_probe_ops; -	else -		ops = &traceoff_probe_ops; +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, +			 struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("traceoff", m, ip, data); +} -	unregister_ftrace_function_probe_func(glob, ops); +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, +			struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("stacktrace", m, ip, data); +} -	return 0; +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, +			struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("dump", m, ip, data);  }  static int -ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, +			struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("cpudump", m, ip, data); +} + +static struct ftrace_probe_ops traceon_count_probe_ops = { +	.func			= ftrace_traceon_count, +	.print			= ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { +	.func			= ftrace_traceoff_count, +	.print			= ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { +	.func			= ftrace_stacktrace_count, +	.print			= ftrace_stacktrace_print, +}; + +static struct ftrace_probe_ops dump_probe_ops = { +	.func			= ftrace_dump_probe, +	.print			= ftrace_dump_print, +}; + +static struct ftrace_probe_ops cpudump_probe_ops = { +	.func			= ftrace_cpudump_probe, +	.print			= ftrace_cpudump_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { +	.func			= ftrace_traceon, +	.print			= ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { +	.func			= ftrace_traceoff, +	.print			= ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { +	.func			= ftrace_stacktrace, +	.print			= ftrace_stacktrace_print, +}; + +static int +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, +			    struct ftrace_hash *hash, char *glob, +			    char *cmd, char *param, int enable)  { -	struct ftrace_probe_ops *ops;  	void *count = (void *)-1;  	char *number;  	int ret; @@ -333,14 +456,10 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)  	if (!enable)  		return -EINVAL; -	if (glob[0] == '!') -		return ftrace_trace_onoff_unreg(glob+1, cmd, param); - -	/* we register both traceon 
and traceoff to this callback */ -	if (strcmp(cmd, "traceon") == 0) -		ops = &traceon_probe_ops; -	else -		ops = &traceoff_probe_ops; +	if (glob[0] == '!') { +		unregister_ftrace_function_probe_func(glob+1, ops); +		return 0; +	}  	if (!param)  		goto out_reg; @@ -354,7 +473,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)  	 * We use the callback data field (which is a pointer)  	 * as our counter.  	 */ -	ret = strict_strtoul(number, 0, (unsigned long *)&count); +	ret = kstrtoul(number, 0, (unsigned long *)&count);  	if (ret)  		return ret; @@ -364,6 +483,60 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)  	return ret < 0 ? ret : 0;  } +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, +			    char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	/* we register both traceon and traceoff to this callback */ +	if (strcmp(cmd, "traceon") == 0) +		ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; +	else +		ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, +			   char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; + +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   param, enable); +} + +static int +ftrace_dump_callback(struct ftrace_hash *hash, +			   char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	ops = &dump_probe_ops; + +	/* Only dump once. */ +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   "1", enable); +} + +static int +ftrace_cpudump_callback(struct ftrace_hash *hash, +			   char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	ops = &cpudump_probe_ops; + +	/* Only dump once. 
*/ +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   "1", enable); +} +  static struct ftrace_func_command ftrace_traceon_cmd = {  	.name			= "traceon",  	.func			= ftrace_trace_onoff_callback, @@ -374,6 +547,21 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {  	.func			= ftrace_trace_onoff_callback,  }; +static struct ftrace_func_command ftrace_stacktrace_cmd = { +	.name			= "stacktrace", +	.func			= ftrace_stacktrace_callback, +}; + +static struct ftrace_func_command ftrace_dump_cmd = { +	.name			= "dump", +	.func			= ftrace_dump_callback, +}; + +static struct ftrace_func_command ftrace_cpudump_cmd = { +	.name			= "cpudump", +	.func			= ftrace_cpudump_callback, +}; +  static int __init init_func_cmd_traceon(void)  {  	int ret; @@ -384,7 +572,31 @@ static int __init init_func_cmd_traceon(void)  	ret = register_ftrace_command(&ftrace_traceon_cmd);  	if (ret) -		unregister_ftrace_command(&ftrace_traceoff_cmd); +		goto out_free_traceoff; + +	ret = register_ftrace_command(&ftrace_stacktrace_cmd); +	if (ret) +		goto out_free_traceon; + +	ret = register_ftrace_command(&ftrace_dump_cmd); +	if (ret) +		goto out_free_stacktrace; + +	ret = register_ftrace_command(&ftrace_cpudump_cmd); +	if (ret) +		goto out_free_dump; + +	return 0; + + out_free_dump: +	unregister_ftrace_command(&ftrace_dump_cmd); + out_free_stacktrace: +	unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: +	unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: +	unregister_ftrace_command(&ftrace_traceoff_cmd); +  	return ret;  }  #else @@ -399,5 +611,4 @@ static __init int init_function_trace(void)  	init_func_cmd_traceon();  	return register_tracer(&function_trace);  } -device_initcall(init_function_trace); - +core_initcall(init_function_trace); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 76b05980225..4de3e57f723 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -38,14 +38,7 @@ struct fgraph_data {  #define TRACE_GRAPH_INDENT	2 -/* Flag options */ -#define TRACE_GRAPH_PRINT_OVERRUN	0x1 -#define TRACE_GRAPH_PRINT_CPU		0x2 -#define TRACE_GRAPH_PRINT_OVERHEAD	0x4 -#define TRACE_GRAPH_PRINT_PROC		0x8 -#define TRACE_GRAPH_PRINT_DURATION	0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME	0x20 -#define TRACE_GRAPH_PRINT_IRQS		0x40 +static unsigned int max_depth;  static struct tracer_opt trace_opts[] = {  	/* Display overruns? (for self-debug purpose) */ @@ -62,11 +55,13 @@ static struct tracer_opt trace_opts[] = {  	{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },  	/* Display interrupts */  	{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, +	/* Display function name after trailing } */ +	{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },  	{ } /* Empty entry */  };  static struct tracer_flags tracer_flags = { -	/* Don't display overruns and proc by default */ +	/* Don't display overruns, proc, or tail by default */  	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |  	       TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,  	.opts = trace_opts @@ -74,6 +69,20 @@ static struct tracer_flags tracer_flags = {  static struct trace_array *graph_array; +/* + * DURATION column is being also used to display IRQ signs, + * following values are used by print_graph_irq and others + * to fill in space into DURATION column. 
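init_func_cmd_traceon() earlier in this hunk now registers five commands and, when a later registration fails, unregisters the ones already registered in reverse order. A compact userspace sketch of that goto-unwind idiom (the command names are placeholders and registration is only simulated):

    #include <stdio.h>

    static int register_cmd(const char *name)
    {
            printf("registered %s\n", name);
            return 0;               /* pretend registration always succeeds */
    }

    static void unregister_cmd(const char *name)
    {
            printf("unregistered %s\n", name);
    }

    static int init_cmds(void)
    {
            int ret;

            ret = register_cmd("traceoff");
            if (ret)
                    return ret;
            ret = register_cmd("traceon");
            if (ret)
                    goto out_free_traceoff;
            ret = register_cmd("stacktrace");
            if (ret)
                    goto out_free_traceon;

            return 0;

     out_free_traceon:
            unregister_cmd("traceon");
     out_free_traceoff:
            unregister_cmd("traceoff");
            return ret;
    }

    int main(void)
    {
            return init_cmds();
    }

Keeping the labels in reverse registration order means each failure path releases exactly what was acquired before the failing step.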
+ */ +enum { +	FLAGS_FILL_FULL  = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, +	FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, +	FLAGS_FILL_END   = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, +}; + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s, +		     u32 flags);  /* Add a function return address to the trace stack on thread info.*/  int @@ -98,16 +107,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,  		return -EBUSY;  	} +	/* +	 * The curr_ret_stack is an index to ftrace return stack of +	 * current task.  Its value should be in [0, FTRACE_RETFUNC_ +	 * DEPTH) when the function graph tracer is used.  To support +	 * filtering out specific functions, it makes the index +	 * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) +	 * so when it sees a negative index the ftrace will ignore +	 * the record.  And the index gets recovered when returning +	 * from the filtered function by adding the FTRACE_NOTRACE_ +	 * DEPTH and then it'll continue to record functions normally. +	 * +	 * The curr_ret_stack is initialized to -1 and get increased +	 * in this function.  So it can be less than -1 only if it was +	 * filtered out via ftrace_graph_notrace_addr() which can be +	 * set from set_graph_notrace file in debugfs by user. +	 */ +	if (current->curr_ret_stack < -1) +		return -EBUSY; +  	calltime = trace_clock_local();  	index = ++current->curr_ret_stack; +	if (ftrace_graph_notrace_addr(func)) +		current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;  	barrier();  	current->ret_stack[index].ret = ret;  	current->ret_stack[index].func = func;  	current->ret_stack[index].calltime = calltime;  	current->ret_stack[index].subtime = 0;  	current->ret_stack[index].fp = frame_pointer; -	*depth = index; +	*depth = current->curr_ret_stack;  	return 0;  } @@ -121,7 +151,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,  	index = current->curr_ret_stack; -	if (unlikely(index < 0)) { +	/* +	 * A negative index here means that it's just returned from a +	 * notrace'd function.  Recover index to get an original +	 * return address.  See ftrace_push_return_trace(). +	 * +	 * TODO: Need to check whether the stack gets corrupted. +	 */ +	if (index < 0) +		index += FTRACE_NOTRACE_DEPTH; + +	if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {  		ftrace_graph_stop();  		WARN_ON(1);  		/* Might as well panic, otherwise we have no where to go */ @@ -129,7 +169,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,  		return;  	} -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)  	/*  	 * The arch may choose to record the frame pointer used  	 * and check it here to make sure that it is what we expect it @@ -140,6 +180,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,  	 *  	 * Currently, x86_32 with optimize for size (-Os) makes the latest  	 * gcc do the above. +	 * +	 * Note, -mfentry does not use frame pointers, and this test +	 *  is not needed if CC_USING_FENTRY is set.  	 
*/  	if (unlikely(current->ret_stack[index].fp != frame_pointer)) {  		ftrace_graph_stop(); @@ -172,9 +215,24 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)  	ftrace_pop_return_trace(&trace, &ret, frame_pointer);  	trace.rettime = trace_clock_local(); -	ftrace_graph_return(&trace);  	barrier();  	current->curr_ret_stack--; +	/* +	 * The curr_ret_stack can be less than -1 only if it was +	 * filtered out and it's about to return from the function. +	 * Recover the index and continue to trace normal functions. +	 */ +	if (current->curr_ret_stack < -1) { +		current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; +		return ret; +	} + +	/* +	 * The trace should run after decrementing the ret counter +	 * in case an interrupt were to come in. We don't want to +	 * lose the interrupt if max_depth is set. +	 */ +	ftrace_graph_return(&trace);  	if (unlikely(!ret)) {  		ftrace_graph_stop(); @@ -193,7 +251,7 @@ int __trace_graph_entry(struct trace_array *tr,  {  	struct ftrace_event_call *call = &event_funcgraph_entry;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ftrace_graph_ent_entry *entry;  	if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -205,15 +263,15 @@ int __trace_graph_entry(struct trace_array *tr,  		return 0;  	entry	= ring_buffer_event_data(event);  	entry->graph_ent			= *trace; -	if (!filter_current_check_discard(buffer, call, entry, event)) -		ring_buffer_unlock_commit(buffer, event); +	if (!call_filter_check_discard(call, entry, buffer, event)) +		__buffer_unlock_commit(buffer, event);  	return 1;  }  static inline int ftrace_graph_ignore_irqs(void)  { -	if (!ftrace_graph_skip_irqs) +	if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))  		return 0;  	return in_irq(); @@ -233,13 +291,24 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  		return 0;  	/* trace it when it is-nested-in or is a function enabled. */ -	if (!(trace->depth || ftrace_graph_addr(trace->func)) || -	      ftrace_graph_ignore_irqs()) +	if ((!(trace->depth || ftrace_graph_addr(trace->func)) || +	     ftrace_graph_ignore_irqs()) || (trace->depth < 0) || +	    (max_depth && trace->depth >= max_depth))  		return 0; +	/* +	 * Do not trace a function if it's filtered by set_graph_notrace. +	 * Make the index of ret stack negative to indicate that it should +	 * ignore further functions.  But it needs its own ret stack entry +	 * to recover the original index in order to continue tracing after +	 * returning from the function. 
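The comments in this hunk describe how set_graph_notrace hides a function by pushing curr_ret_stack far below zero, and how the pop/return paths add the offset back to recover the real index. A toy round-trip of that encoding, where NOTRACE_OFFSET is an invented stand-in for FTRACE_NOTRACE_DEPTH:

    #include <assert.h>
    #include <stdio.h>

    #define NOTRACE_OFFSET 65536    /* stand-in for FTRACE_NOTRACE_DEPTH */

    /* Mark an index as "filtered" by pushing it far below zero. */
    static int mark_notrace(int index)
    {
            return index - NOTRACE_OFFSET;
    }

    /* Recover the original index when the function returns. */
    static int recover_index(int index)
    {
            if (index < -1)         /* only filtered entries go this negative */
                    index += NOTRACE_OFFSET;
            return index;
    }

    int main(void)
    {
            int idx = 3;            /* some valid slot on the return stack */
            int hidden = mark_notrace(idx);

            printf("marked: %d\n", hidden);         /* clearly negative: ignored by the tracer */
            assert(recover_index(hidden) == idx);   /* round-trips back to the real index */
            printf("recovered: %d\n", recover_index(hidden));
            return 0;
    }

Because a legitimate index is never below -1, any value smaller than that can safely be read as "filtered; real index = value + offset".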
+	 */ +	if (ftrace_graph_notrace_addr(trace->func)) +		return 1; +  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) {  		pc = preempt_count(); @@ -297,7 +366,7 @@ void __trace_graph_return(struct trace_array *tr,  {  	struct ftrace_event_call *call = &event_funcgraph_exit;  	struct ring_buffer_event *event; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ftrace_graph_ret_entry *entry;  	if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -309,8 +378,8 @@ void __trace_graph_return(struct trace_array *tr,  		return;  	entry	= ring_buffer_event_data(event);  	entry->ret				= *trace; -	if (!filter_current_check_discard(buffer, call, entry, event)) -		ring_buffer_unlock_commit(buffer, event); +	if (!call_filter_check_discard(call, entry, buffer, event)) +		__buffer_unlock_commit(buffer, event);  }  void trace_graph_return(struct ftrace_graph_ret *trace) @@ -324,7 +393,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) {  		pc = preempt_count(); @@ -420,7 +489,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)  	/* First spaces to align center */  	for (i = 0; i < spaces / 2; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -431,7 +500,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)  	/* Last spaces to align center */  	for (i = 0; i < spaces - (spaces / 2); i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -477,7 +546,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)   ------------------------------------------   */ -	ret = trace_seq_printf(s, +	ret = trace_seq_puts(s,  		" ------------------------------------------\n");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -490,7 +559,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)  	if (ret == TRACE_TYPE_PARTIAL_LINE)  		return TRACE_TYPE_PARTIAL_LINE; -	ret = trace_seq_printf(s, " => "); +	ret = trace_seq_puts(s, " => ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -498,7 +567,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)  	if (ret == TRACE_TYPE_PARTIAL_LINE)  		return TRACE_TYPE_PARTIAL_LINE; -	ret = trace_seq_printf(s, +	ret = trace_seq_puts(s,  		"\n ------------------------------------------\n\n");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -524,7 +593,7 @@ get_return_for_leaf(struct trace_iterator *iter,  		next = &data->ret;  	} else { -		ring_iter = iter->buffer_iter[iter->cpu]; +		ring_iter = trace_buffer_iter(iter, iter->cpu);  		/* First peek to compare current entry and the next one */  		if (ring_iter) @@ -534,9 +603,9 @@ get_return_for_leaf(struct trace_iterator *iter,  			 * We need to consume the current entry to see  			 * the next one.  			 
*/ -			ring_buffer_consume(iter->tr->buffer, iter->cpu, +			ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,  					    NULL, NULL); -			event = ring_buffer_peek(iter->tr->buffer, iter->cpu, +			event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,  						 NULL, NULL);  		} @@ -577,32 +646,6 @@ get_return_for_leaf(struct trace_iterator *iter,  	return next;  } -/* Signal a overhead of time execution to the output */ -static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s, -		     u32 flags) -{ -	/* If duration disappear, we don't need anything */ -	if (!(flags & TRACE_GRAPH_PRINT_DURATION)) -		return 1; - -	/* Non nested entry or return */ -	if (duration == -1) -		return trace_seq_printf(s, "  "); - -	if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { -		/* Duration exceeded 100 msecs */ -		if (duration > 100000ULL) -			return trace_seq_printf(s, "! "); - -		/* Duration exceeded 10 msecs */ -		if (duration > 10000ULL) -			return trace_seq_printf(s, "+ "); -	} - -	return trace_seq_printf(s, "  "); -} -  static int print_graph_abs_time(u64 t, struct trace_seq *s)  {  	unsigned long usecs_rem; @@ -625,47 +668,50 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,  		addr >= (unsigned long)__irqentry_text_end)  		return TRACE_TYPE_UNHANDLED; -	/* Absolute time */ -	if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { -		ret = print_graph_abs_time(iter->ts, s); -		if (!ret) -			return TRACE_TYPE_PARTIAL_LINE; -	} +	if (trace_flags & TRACE_ITER_CONTEXT_INFO) { +		/* Absolute time */ +		if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { +			ret = print_graph_abs_time(iter->ts, s); +			if (!ret) +				return TRACE_TYPE_PARTIAL_LINE; +		} -	/* Cpu */ -	if (flags & TRACE_GRAPH_PRINT_CPU) { -		ret = print_graph_cpu(s, cpu); -		if (ret == TRACE_TYPE_PARTIAL_LINE) -			return TRACE_TYPE_PARTIAL_LINE; -	} +		/* Cpu */ +		if (flags & TRACE_GRAPH_PRINT_CPU) { +			ret = print_graph_cpu(s, cpu); +			if (ret == TRACE_TYPE_PARTIAL_LINE) +				return TRACE_TYPE_PARTIAL_LINE; +		} -	/* Proc */ -	if (flags & TRACE_GRAPH_PRINT_PROC) { -		ret = print_graph_proc(s, pid); -		if (ret == TRACE_TYPE_PARTIAL_LINE) -			return TRACE_TYPE_PARTIAL_LINE; -		ret = trace_seq_printf(s, " | "); -		if (!ret) -			return TRACE_TYPE_PARTIAL_LINE; +		/* Proc */ +		if (flags & TRACE_GRAPH_PRINT_PROC) { +			ret = print_graph_proc(s, pid); +			if (ret == TRACE_TYPE_PARTIAL_LINE) +				return TRACE_TYPE_PARTIAL_LINE; +			ret = trace_seq_puts(s, " | "); +			if (!ret) +				return TRACE_TYPE_PARTIAL_LINE; +		}  	}  	/* No overhead */ -	ret = print_graph_overhead(-1, s, flags); -	if (!ret) -		return TRACE_TYPE_PARTIAL_LINE; +	ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); +	if (ret != TRACE_TYPE_HANDLED) +		return ret;  	if (type == TRACE_GRAPH_ENT) -		ret = trace_seq_printf(s, "==========>"); +		ret = trace_seq_puts(s, "==========>");  	else -		ret = trace_seq_printf(s, "<=========="); +		ret = trace_seq_puts(s, "<==========");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; -	/* Don't close the duration column if haven't one */ -	if (flags & TRACE_GRAPH_PRINT_DURATION) -		trace_seq_printf(s, " |"); -	ret = trace_seq_printf(s, "\n"); +	ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); +	if (ret != TRACE_TYPE_HANDLED) +		return ret; + +	ret = trace_seq_putc(s, '\n');  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -702,13 +748,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)  		len += strlen(nsecs_str);  	} -	ret = trace_seq_printf(s, " us "); +	ret = 
trace_seq_puts(s, " us ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE;  	/* Print remaining spaces to fit the row's width */  	for (i = len; i < 7; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -716,15 +762,55 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)  }  static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +print_graph_duration(unsigned long long duration, struct trace_seq *s, +		     u32 flags)  { -	int ret; +	int ret = -1; + +	if (!(flags & TRACE_GRAPH_PRINT_DURATION) || +	    !(trace_flags & TRACE_ITER_CONTEXT_INFO)) +			return TRACE_TYPE_HANDLED; + +	/* No real adata, just filling the column with spaces */ +	switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { +	case FLAGS_FILL_FULL: +		ret = trace_seq_puts(s, "              |  "); +		return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +	case FLAGS_FILL_START: +		ret = trace_seq_puts(s, "  "); +		return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +	case FLAGS_FILL_END: +		ret = trace_seq_puts(s, " |"); +		return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +	} + +	/* Signal a overhead of time execution to the output */ +	if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { +		/* Duration exceeded 100 msecs */ +		if (duration > 100000ULL) +			ret = trace_seq_puts(s, "! "); +		/* Duration exceeded 10 msecs */ +		else if (duration > 10000ULL) +			ret = trace_seq_puts(s, "+ "); +	} + +	/* +	 * The -1 means we either did not exceed the duration tresholds +	 * or we dont want to print out the overhead. Either way we need +	 * to fill out the space. +	 */ +	if (ret == -1) +		ret = trace_seq_puts(s, "  "); + +	/* Catching here any failure happenned above */ +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE;  	ret = trace_print_graph_duration(duration, s);  	if (ret != TRACE_TYPE_HANDLED)  		return ret; -	ret = trace_seq_printf(s, "|  "); +	ret = trace_seq_puts(s, "|  ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -767,21 +853,14 @@ print_graph_entry_leaf(struct trace_iterator *iter,  			cpu_data->enter_funcs[call->depth] = 0;  	} -	/* Overhead */ -	ret = print_graph_overhead(duration, s, flags); -	if (!ret) +	/* Overhead and duration */ +	ret = print_graph_duration(duration, s, flags); +	if (ret == TRACE_TYPE_PARTIAL_LINE)  		return TRACE_TYPE_PARTIAL_LINE; -	/* Duration */ -	if (flags & TRACE_GRAPH_PRINT_DURATION) { -		ret = print_graph_duration(duration, s); -		if (ret == TRACE_TYPE_PARTIAL_LINE) -			return TRACE_TYPE_PARTIAL_LINE; -	} -  	/* Function */  	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -815,21 +894,14 @@ print_graph_entry_nested(struct trace_iterator *iter,  			cpu_data->enter_funcs[call->depth] = call->func;  	} -	/* No overhead */ -	ret = print_graph_overhead(-1, s, flags); -	if (!ret) -		return TRACE_TYPE_PARTIAL_LINE; -  	/* No time */ -	if (flags & TRACE_GRAPH_PRINT_DURATION) { -		ret = trace_seq_printf(s, "            |  "); -		if (!ret) -			return TRACE_TYPE_PARTIAL_LINE; -	} +	ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); +	if (ret != TRACE_TYPE_HANDLED) +		return ret;  	/* Function */  	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -865,6 +937,9 @@ 
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,  			return TRACE_TYPE_PARTIAL_LINE;  	} +	if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) +		return 0; +  	/* Absolute time */  	if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {  		ret = print_graph_abs_time(iter->ts, s); @@ -885,7 +960,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,  		if (ret == TRACE_TYPE_PARTIAL_LINE)  			return TRACE_TYPE_PARTIAL_LINE; -		ret = trace_seq_printf(s, " | "); +		ret = trace_seq_puts(s, " | ");  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -905,7 +980,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,   *   * returns 1 if   *  - we are inside irq code - *  - we just extered irq code + *  - we just entered irq code   *   * retunns 0 if   *  - funcgraph-interrupts option is set @@ -1078,21 +1153,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,  	if (print_graph_prologue(iter, s, 0, 0, flags))  		return TRACE_TYPE_PARTIAL_LINE; -	/* Overhead */ -	ret = print_graph_overhead(duration, s, flags); -	if (!ret) +	/* Overhead and duration */ +	ret = print_graph_duration(duration, s, flags); +	if (ret == TRACE_TYPE_PARTIAL_LINE)  		return TRACE_TYPE_PARTIAL_LINE; -	/* Duration */ -	if (flags & TRACE_GRAPH_PRINT_DURATION) { -		ret = print_graph_duration(duration, s); -		if (ret == TRACE_TYPE_PARTIAL_LINE) -			return TRACE_TYPE_PARTIAL_LINE; -	} -  	/* Closing brace */  	for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -1101,10 +1169,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,  	 * If the return function does not have a matching entry,  	 * then the entry was lost. Instead of just printing  	 * the '}' and letting the user guess what function this -	 * belongs to, write out the function name. +	 * belongs to, write out the function name. Always do +	 * that if the funcgraph-tail option is enabled.  	 
*/ -	if (func_match) { -		ret = trace_seq_printf(s, "}\n"); +	if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { +		ret = trace_seq_puts(s, "}\n");  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} else { @@ -1146,28 +1215,21 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,  	if (print_graph_prologue(iter, s, 0, 0, flags))  		return TRACE_TYPE_PARTIAL_LINE; -	/* No overhead */ -	ret = print_graph_overhead(-1, s, flags); -	if (!ret) -		return TRACE_TYPE_PARTIAL_LINE; -  	/* No time */ -	if (flags & TRACE_GRAPH_PRINT_DURATION) { -		ret = trace_seq_printf(s, "            |  "); -		if (!ret) -			return TRACE_TYPE_PARTIAL_LINE; -	} +	ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); +	if (ret != TRACE_TYPE_HANDLED) +		return ret;  	/* Indentation */  	if (depth > 0)  		for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { -			ret = trace_seq_printf(s, " "); +			ret = trace_seq_putc(s, ' ');  			if (!ret)  				return TRACE_TYPE_PARTIAL_LINE;  		}  	/* The comment */ -	ret = trace_seq_printf(s, "/* "); +	ret = trace_seq_puts(s, "/* ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -1198,7 +1260,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,  		s->len--;  	} -	ret = trace_seq_printf(s, " */\n"); +	ret = trace_seq_puts(s, " */\n");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -1207,7 +1269,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,  enum print_line_t -__print_graph_function_flags(struct trace_iterator *iter, u32 flags) +print_graph_function_flags(struct trace_iterator *iter, u32 flags)  {  	struct ftrace_graph_ent_entry *field;  	struct fgraph_data *data = iter->private; @@ -1270,18 +1332,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)  static enum print_line_t  print_graph_function(struct trace_iterator *iter)  { -	return __print_graph_function_flags(iter, tracer_flags.val); -} - -enum print_line_t print_graph_function_flags(struct trace_iterator *iter, -					     u32 flags) -{ -	if (trace_flags & TRACE_ITER_LATENCY_FMT) -		flags |= TRACE_GRAPH_PRINT_DURATION; -	else -		flags |= TRACE_GRAPH_PRINT_ABS_TIME; - -	return __print_graph_function_flags(iter, flags); +	return print_graph_function_flags(iter, tracer_flags.val);  }  static enum print_line_t @@ -1309,8 +1360,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)  	seq_printf(s, "#%.*s / _----=> need-resched    \n", size, spaces);  	seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);  	seq_printf(s, "#%.*s|| / _--=> preempt-depth   \n", size, spaces); -	seq_printf(s, "#%.*s||| / _-=> lock-depth      \n", size, spaces); -	seq_printf(s, "#%.*s|||| /                     \n", size, spaces); +	seq_printf(s, "#%.*s||| /                      \n", size, spaces);  }  static void __print_graph_headers_flags(struct seq_file *s, u32 flags) @@ -1329,7 +1379,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)  	if (flags & TRACE_GRAPH_PRINT_PROC)  		seq_printf(s, "  TASK/PID       ");  	if (lat) -		seq_printf(s, "|||||"); +		seq_printf(s, "||||");  	if (flags & TRACE_GRAPH_PRINT_DURATION)  		seq_printf(s, "  DURATION   ");  	seq_printf(s, "               FUNCTION CALLS\n"); @@ -1343,7 +1393,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)  	if (flags & TRACE_GRAPH_PRINT_PROC)  		seq_printf(s, "   |    |        ");  	if (lat) -		seq_printf(s, "|||||"); +		seq_printf(s, "||||");  	if (flags & TRACE_GRAPH_PRINT_DURATION)  		seq_printf(s, "   |   |      ");  	
seq_printf(s, "               |   |   |   |\n"); @@ -1358,15 +1408,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)  {  	struct trace_iterator *iter = s->private; +	if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) +		return; +  	if (trace_flags & TRACE_ITER_LATENCY_FMT) {  		/* print nothing if the buffers are empty */  		if (trace_empty(iter))  			return;  		print_trace_header(s, iter); -		flags |= TRACE_GRAPH_PRINT_DURATION; -	} else -		flags |= TRACE_GRAPH_PRINT_ABS_TIME; +	}  	__print_graph_headers_flags(s, flags);  } @@ -1419,7 +1470,8 @@ void graph_trace_close(struct trace_iterator *iter)  	}  } -static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +static int +func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  {  	if (bit == TRACE_GRAPH_PRINT_IRQS)  		ftrace_graph_skip_irqs = !set; @@ -1441,13 +1493,12 @@ static struct trace_event graph_trace_ret_event = {  	.funcs		= &graph_functions  }; -static struct tracer graph_trace __read_mostly = { +static struct tracer graph_trace __tracer_data = {  	.name		= "function_graph",  	.open		= graph_trace_open,  	.pipe_open	= graph_trace_open,  	.close		= graph_trace_close,  	.pipe_close	= graph_trace_close, -	.wait_pipe	= poll_wait_pipe,  	.init		= graph_trace_init,  	.reset		= graph_trace_reset,  	.print_line	= print_graph_function, @@ -1459,6 +1510,59 @@ static struct tracer graph_trace __read_mostly = {  #endif  }; + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, +		  loff_t *ppos) +{ +	unsigned long val; +	int ret; + +	ret = kstrtoul_from_user(ubuf, cnt, 10, &val); +	if (ret) +		return ret; + +	max_depth = val; + +	*ppos += cnt; + +	return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, +		 loff_t *ppos) +{ +	char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ +	int n; + +	n = sprintf(buf, "%d\n", max_depth); + +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { +	.open		= tracing_open_generic, +	.write		= graph_depth_write, +	.read		= graph_depth_read, +	.llseek		= generic_file_llseek, +}; + +static __init int init_graph_debugfs(void) +{ +	struct dentry *d_tracer; + +	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0; + +	trace_create_file("max_graph_depth", 0644, d_tracer, +			  NULL, &graph_depth_fops); + +	return 0; +} +fs_initcall(init_graph_debugfs); +  static __init int init_graph_trace(void)  {  	max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); @@ -1476,4 +1580,4 @@ static __init int init_graph_trace(void)  	return register_tracer(&graph_trace);  } -device_initcall(init_graph_trace); +core_initcall(init_graph_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b88..9bb104f748d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -7,7 +7,7 @@   * From code in the latency_tracer, that is:   *   *  Copyright (C) 2004-2006 Ingo Molnar - *  Copyright (C) 2004 William Lee Irwin III + *  Copyright (C) 2004 Nadia Yvette Chambers   */  #include <linux/kallsyms.h>  #include <linux/debugfs.h> @@ -23,7 +23,7 @@ static int				tracer_enabled __read_mostly;  static DEFINE_PER_CPU(int, tracing_cpu); -static DEFINE_SPINLOCK(max_trace_lock); +static DEFINE_RAW_SPINLOCK(max_trace_lock);  enum {  	TRACER_IRQS_OFF		= (1 << 1), @@ -32,7 +32,8 @@ enum {  static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; 
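graph_depth_write()/graph_depth_read() above expose the new max_graph_depth knob as a simple parse-on-write, format-on-read pair. A userspace approximation of that pair (string in, string out; no debugfs, with strtoul()/snprintf() standing in for kstrtoul_from_user()/simple_read_from_buffer()):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    static unsigned long max_depth; /* the knob the handlers expose */

    /* Userspace analogue of graph_depth_write(): parse a decimal value. */
    static int depth_write(const char *buf)
    {
            char *end;
            unsigned long val = strtoul(buf, &end, 10);

            if (end == buf || (*end != '\0' && *end != '\n'))
                    return -EINVAL;
            max_depth = val;
            return 0;
    }

    /* Userspace analogue of graph_depth_read(): format the current value. */
    static int depth_read(char *buf, size_t len)
    {
            return snprintf(buf, len, "%lu\n", max_depth);
    }

    int main(void)
    {
            char out[32];

            depth_write("5\n");     /* like: echo 5 > .../max_graph_depth */
            depth_read(out, sizeof(out));
            fputs(out, stdout);     /* like: cat .../max_graph_depth */
            return 0;
    }

A value of 0 leaves the depth unlimited, matching the default of the static max_depth and the "max_depth && trace->depth >= max_depth" cutoff added in trace_graph_entry().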
+static bool function_enabled;  static void stop_irqsoff_tracer(struct trace_array *tr, int graph);  static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -80,7 +81,7 @@ static struct tracer_flags tracer_flags = {   * skip the latency if the sequence has changed - some other section   * did a maximum and could disturb our measurement with serial console   * printouts, etc. Truly coinciding maximum latencies should be rare - * and what happens together happens separately as well, so this doesnt + * and what happens together happens separately as well, so this doesn't   * decrease the validity of the maximum found:   */  static __cacheline_aligned_in_smp	unsigned long max_sequence; @@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,  	if (!irqs_disabled_flags(*flags))  		return 0; -	*data = tr->data[cpu]; +	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&(*data)->disabled);  	if (likely(disabled == 1)) @@ -136,7 +137,8 @@ static int func_prolog_dec(struct trace_array *tr,   * irqsoff uses its own tracer function to keep the overhead down:   */  static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, +		    struct ftrace_ops *op, struct pt_regs *pt_regs)  {  	struct trace_array *tr = irqsoff_trace;  	struct trace_array_cpu *data; @@ -149,15 +151,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)  	atomic_dec(&data->disabled);  } - -static struct ftrace_ops trace_ops __read_mostly = -{ -	.func = irqsoff_tracer_call, -};  #endif /* CONFIG_FUNCTION_TRACER */  #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  {  	int cpu; @@ -172,8 +170,8 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)  	for_each_possible_cpu(cpu)  		per_cpu(tracing_cpu, cpu) = 0; -	tracing_max_latency = 0; -	tracing_reset_online_cpus(irqsoff_trace); +	tr->max_latency = 0; +	tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);  	return start_irqsoff_tracer(irqsoff_trace, set);  } @@ -225,7 +223,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)  }  #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ -			    TRACE_GRAPH_PRINT_PROC) +			    TRACE_GRAPH_PRINT_PROC | \ +			    TRACE_GRAPH_PRINT_ABS_TIME | \ +			    TRACE_GRAPH_PRINT_DURATION)  static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)  { @@ -261,7 +261,8 @@ __trace_function(struct trace_array *tr,  #else  #define __trace_function trace_function -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  {  	return -EINVAL;  } @@ -277,21 +278,32 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)  }  static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } -static void irqsoff_print_header(struct seq_file *s) { }  static void irqsoff_trace_open(struct trace_iterator *iter) { }  static void irqsoff_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_print_header(struct seq_file *s) +{ +	trace_default_header(s); +} +#else +static void irqsoff_print_header(struct seq_file *s) +{ +	trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */  /*   * Should this new latency be reported/recorded?   
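check_critical_timing() below calls report_latency() once outside max_trace_lock to filter the common case cheaply, then again under the lock because another CPU may have recorded a larger latency in the meantime. A simplified userspace sketch of that double-checked update, with a pthread mutex standing in for the raw spinlock and the tracing_thresh branch ignored:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t max_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long long max_latency;  /* analogue of tr->max_latency */

    static int report_latency(unsigned long long delta)
    {
            return delta > max_latency;     /* only interesting if it beats the record */
    }

    static void check_critical_timing(unsigned long long delta)
    {
            if (!report_latency(delta))     /* cheap check without the lock */
                    return;

            pthread_mutex_lock(&max_lock);
            if (report_latency(delta))      /* re-check: another CPU may have won */
                    max_latency = delta;
            pthread_mutex_unlock(&max_lock);
    }

    int main(void)
    {
            check_critical_timing(120);
            check_critical_timing(80);      /* smaller latency: filtered out early */
            printf("max latency: %llu\n", max_latency);
            return 0;
    }

Only a delta that still beats the recorded maximum once the lock is held actually updates it.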
*/ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta)  {  	if (tracing_thresh) {  		if (delta < tracing_thresh)  			return 0;  	} else { -		if (delta <= tracing_max_latency) +		if (delta <= tr->max_latency)  			return 0;  	}  	return 1; @@ -315,13 +327,13 @@ check_critical_timing(struct trace_array *tr,  	pc = preempt_count(); -	if (!report_latency(delta)) +	if (!report_latency(tr, delta))  		goto out; -	spin_lock_irqsave(&max_trace_lock, flags); +	raw_spin_lock_irqsave(&max_trace_lock, flags);  	/* check if we are still the max latency */ -	if (!report_latency(delta)) +	if (!report_latency(tr, delta))  		goto out_unlock;  	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); @@ -334,14 +346,14 @@ check_critical_timing(struct trace_array *tr,  	data->critical_end = parent_ip;  	if (likely(!is_tracing_stopped())) { -		tracing_max_latency = delta; +		tr->max_latency = delta;  		update_max_tr_single(tr, current, cpu);  	}  	max_sequence++;  out_unlock: -	spin_unlock_irqrestore(&max_trace_lock, flags); +	raw_spin_unlock_irqrestore(&max_trace_lock, flags);  out:  	data->critical_sequence = max_sequence; @@ -357,7 +369,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)  	struct trace_array_cpu *data;  	unsigned long flags; -	if (likely(!tracer_enabled)) +	if (!tracer_enabled || !tracing_is_enabled())  		return;  	cpu = raw_smp_processor_id(); @@ -365,7 +377,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)  	if (per_cpu(tracing_cpu, cpu))  		return; -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (unlikely(!data) || atomic_read(&data->disabled))  		return; @@ -400,10 +412,10 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)  	else  		return; -	if (!tracer_enabled) +	if (!tracer_enabled || !tracing_is_enabled())  		return; -	data = tr->data[cpu]; +	data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	if (unlikely(!data) ||  	    !data->critical_start || atomic_read(&data->disabled)) @@ -453,14 +465,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)   * Stubs:   */ -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} -  void trace_softirqs_on(unsigned long ip)  {  } @@ -490,14 +494,14 @@ void trace_hardirqs_off(void)  }  EXPORT_SYMBOL(trace_hardirqs_off); -void trace_hardirqs_on_caller(unsigned long caller_addr) +__visible void trace_hardirqs_on_caller(unsigned long caller_addr)  {  	if (!preempt_trace() && irq_trace())  		stop_critical_timing(CALLER_ADDR0, caller_addr);  }  EXPORT_SYMBOL(trace_hardirqs_on_caller); -void trace_hardirqs_off_caller(unsigned long caller_addr) +__visible void trace_hardirqs_off_caller(unsigned long caller_addr)  {  	if (!preempt_trace() && irq_trace())  		start_critical_timing(CALLER_ADDR0, caller_addr); @@ -510,26 +514,73 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);  #ifdef CONFIG_PREEMPT_TRACER  void trace_preempt_on(unsigned long a0, unsigned long a1)  { -	if (preempt_trace()) +	if (preempt_trace() && !irq_trace())  		stop_critical_timing(a0, a1);  }  void trace_preempt_off(unsigned long a0, unsigned long a1)  { -	if (preempt_trace()) +	if (preempt_trace() && !irq_trace())  		start_critical_timing(a0, a1);  }  #endif /* CONFIG_PREEMPT_TRACER */ -static int start_irqsoff_tracer(struct trace_array *tr, int graph) +static int register_irqsoff_function(struct trace_array *tr, int graph, int set)  { -	int ret = 0; +	int ret; -	if (!graph) -		ret = register_ftrace_function(&trace_ops); -	
else +	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ +	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) +		return 0; + +	if (graph)  		ret = register_ftrace_graph(&irqsoff_graph_return,  					    &irqsoff_graph_entry); +	else +		ret = register_ftrace_function(tr->ops); + +	if (!ret) +		function_enabled = true; + +	return ret; +} + +static void unregister_irqsoff_function(struct trace_array *tr, int graph) +{ +	if (!function_enabled) +		return; + +	if (graph) +		unregister_ftrace_graph(); +	else +		unregister_ftrace_function(tr->ops); + +	function_enabled = false; +} + +static void irqsoff_function_set(struct trace_array *tr, int set) +{ +	if (set) +		register_irqsoff_function(tr, is_graph(), 1); +	else +		unregister_irqsoff_function(tr, is_graph()); +} + +static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) +{ +	struct tracer *tracer = tr->current_trace; + +	if (mask & TRACE_ITER_FUNCTION) +		irqsoff_function_set(tr, set); + +	return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ +	int ret; + +	ret = register_irqsoff_function(tr, graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -543,33 +594,51 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)  {  	tracer_enabled = 0; -	if (!graph) -		unregister_ftrace_function(&trace_ops); -	else -		unregister_ftrace_graph(); +	unregister_irqsoff_function(tr, graph);  } -static void __irqsoff_tracer_init(struct trace_array *tr) +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr)  { -	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; -	trace_flags |= TRACE_ITER_LATENCY_FMT; +	if (irqsoff_busy) +		return -EBUSY; + +	save_flags = trace_flags; + +	/* non overwrite screws up the latency tracers */ +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); -	tracing_max_latency = 0; +	tr->max_latency = 0;  	irqsoff_trace = tr;  	/* make sure that the tracer is visible */  	smp_wmb(); -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer); -	if (start_irqsoff_tracer(tr, is_graph())) +	ftrace_init_array_ops(tr, irqsoff_tracer_call); + +	/* Only toplevel instance supports graph tracing */ +	if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && +				      is_graph())))  		printk(KERN_ERR "failed to start irqsoff tracer\n"); + +	irqsoff_busy = true; +	return 0;  }  static void irqsoff_tracer_reset(struct trace_array *tr)  { +	int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; +	int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; +  	stop_irqsoff_tracer(tr, is_graph()); -	if (!save_lat_flag) -		trace_flags &= ~TRACE_ITER_LATENCY_FMT; +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); +	ftrace_reset_array_ops(tr); + +	irqsoff_busy = false;  }  static void irqsoff_tracer_start(struct trace_array *tr) @@ -587,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)  {  	trace_type = TRACER_IRQS_OFF; -	__irqsoff_tracer_init(tr); -	return 0; +	return __irqsoff_tracer_init(tr);  }  static struct tracer irqsoff_tracer __read_mostly =  { @@ -597,17 +665,19 @@ static struct tracer irqsoff_tracer __read_mostly =  	.reset		= irqsoff_tracer_reset,  	.start		= irqsoff_tracer_start,  	.stop		= irqsoff_tracer_stop, -	.print_max	= 1, +	.print_max	= true,  	.print_header   = irqsoff_print_header,  	.print_line     = 
irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_irqsoff,  #endif  	.open           = irqsoff_trace_open,  	.close          = irqsoff_trace_close, -	.use_max_tr	= 1, +	.allow_instances = true, +	.use_max_tr	= true,  };  # define register_irqsoff(trace) register_tracer(&trace)  #else @@ -619,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)  {  	trace_type = TRACER_PREEMPT_OFF; -	__irqsoff_tracer_init(tr); -	return 0; +	return __irqsoff_tracer_init(tr);  }  static struct tracer preemptoff_tracer __read_mostly = @@ -630,17 +699,19 @@ static struct tracer preemptoff_tracer __read_mostly =  	.reset		= irqsoff_tracer_reset,  	.start		= irqsoff_tracer_start,  	.stop		= irqsoff_tracer_stop, -	.print_max	= 1, +	.print_max	= true,  	.print_header   = irqsoff_print_header,  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_preemptoff,  #endif  	.open		= irqsoff_trace_open,  	.close		= irqsoff_trace_close, -	.use_max_tr	= 1, +	.allow_instances = true, +	.use_max_tr	= true,  };  # define register_preemptoff(trace) register_tracer(&trace)  #else @@ -654,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)  {  	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; -	__irqsoff_tracer_init(tr); -	return 0; +	return __irqsoff_tracer_init(tr);  }  static struct tracer preemptirqsoff_tracer __read_mostly = @@ -665,17 +735,19 @@ static struct tracer preemptirqsoff_tracer __read_mostly =  	.reset		= irqsoff_tracer_reset,  	.start		= irqsoff_tracer_start,  	.stop		= irqsoff_tracer_stop, -	.print_max	= 1, +	.print_max	= true,  	.print_header   = irqsoff_print_header,  	.print_line     = irqsoff_print_line,  	.flags		= &tracer_flags,  	.set_flag	= irqsoff_set_flag, +	.flag_changed	= irqsoff_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_preemptirqsoff,  #endif  	.open		= irqsoff_trace_open,  	.close		= irqsoff_trace_close, -	.use_max_tr	= 1, +	.allow_instances = true, +	.use_max_tr	= true,  };  # define register_preemptirqsoff(trace) register_tracer(&trace) @@ -691,4 +763,4 @@ __init static int init_irqsoff_tracer(void)  	return 0;  } -device_initcall(init_irqsoff_tracer); +core_initcall(init_irqsoff_tracer); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b..bd90e1b0608 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  	trace_init_global_iter(&iter);  	for_each_tracing_cpu(cpu) { -		atomic_inc(&iter.tr->data[cpu]->disabled); +		atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	}  	old_userobj = trace_flags; @@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)  	iter.iter_flags |= TRACE_FILE_LAT_FMT;  	iter.pos = -1; -	if (cpu_file == TRACE_PIPE_ALL_CPU) { +	if (cpu_file == RING_BUFFER_ALL_CPUS) {  		for_each_tracing_cpu(cpu) {  			iter.buffer_iter[cpu] = -			ring_buffer_read_prepare(iter.tr->buffer, cpu); +			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);  			ring_buffer_read_start(iter.buffer_iter[cpu]);  			tracing_iter_reset(&iter, cpu);  		}  	} else {  		iter.cpu_file = cpu_file;  		iter.buffer_iter[cpu_file] = -			ring_buffer_read_prepare(iter.tr->buffer, 
cpu_file); +			ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);  		ring_buffer_read_start(iter.buffer_iter[cpu_file]);  		tracing_iter_reset(&iter, cpu_file);  	} @@ -83,7 +83,7 @@ out:  	trace_flags = old_userobj;  	for_each_tracing_cpu(cpu) { -		atomic_dec(&iter.tr->data[cpu]->disabled); +		atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);  	}  	for_each_tracing_cpu(cpu) @@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)  		    !cpu_online(cpu_file))  			return KDB_BADINT;  	} else { -		cpu_file = TRACE_PIPE_ALL_CPU; +		cpu_file = RING_BUFFER_ALL_CPUS;  	}  	kdb_trap_printk++; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b..282f6e4e553 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -19,186 +19,134 @@  #include <linux/module.h>  #include <linux/uaccess.h> -#include <linux/kprobes.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/smp.h> -#include <linux/debugfs.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/ctype.h> -#include <linux/ptrace.h> -#include <linux/perf_event.h> -#include <linux/stringify.h> -#include <linux/limits.h> -#include <asm/bitsperlong.h> - -#include "trace.h" -#include "trace_output.h" - -#define MAX_TRACE_ARGS 128 -#define MAX_ARGSTR_LEN 63 -#define MAX_EVENT_NAME_LEN 64 -#define MAX_STRING_SIZE PATH_MAX + +#include "trace_probe.h" +  #define KPROBE_EVENT_SYSTEM "kprobes" -/* Reserved field names */ -#define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_RETIP "__probe_ret_ip" -#define FIELD_STRING_FUNC "__probe_func" - -const char *reserved_field_names[] = { -	"common_type", -	"common_flags", -	"common_preempt_count", -	"common_pid", -	"common_tgid", -	"common_lock_depth", -	FIELD_STRING_IP, -	FIELD_STRING_RETIP, -	FIELD_STRING_FUNC, +/** + * Kprobe event core functions + */ +struct trace_kprobe { +	struct list_head	list; +	struct kretprobe	rp;	/* Use rp.kp for kprobe use */ +	unsigned long 		nhit; +	const char		*symbol;	/* symbol name */ +	struct trace_probe	tp;  }; -/* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, -				 void *); -#define PRINT_TYPE_FUNC_NAME(type)	print_type_##type -#define PRINT_TYPE_FMT_NAME(type)	print_type_format_##type - -/* Printing  in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast)			\ -static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s,	\ -						const char *name,	\ -						void *data, void *ent)\ -{									\ -	return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ -}									\ -static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) - -/* data_rloc: data relative location, compatible with u32 */ -#define make_data_rloc(len, roffs)	\ -	(((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) -#define get_rloc_len(dl)	((u32)(dl) >> 16) -#define get_rloc_offs(dl)	((u32)(dl) & 0xffff) - -static inline void *get_rloc_data(u32 *dl) +#define SIZEOF_TRACE_KPROBE(n)				\ +	(offsetof(struct trace_kprobe, tp.args) +	\ +	(sizeof(struct probe_arg) * 
(n))) + + +static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)  { -	return (u8 *)dl + get_rloc_offs(*dl); +	return tk->rp.handler != NULL;  } -/* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) +static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)  { -	return (u8 *)ent + get_rloc_offs(*dl); +	return tk->symbol ? tk->symbol : "unknown";  } -/* - * Convert data_rloc to data_loc: - *  data_rloc stores the offset from data_rloc itself, but data_loc - *  stores the offset from event entry. - */ -#define convert_rloc_to_loc(dl, offs)	((u32)(dl) + (offs)) +static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk) +{ +	return tk->rp.kp.offset; +} -/* For defining macros, define string/string_size types */ -typedef u32 string; -typedef u32 string_size; +static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) +{ +	return !!(kprobe_gone(&tk->rp.kp)); +} -/* Print type function for string type */ -static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, -						  const char *name, -						  void *data, void *ent) +static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, +						 struct module *mod)  { -	int len = *(u32 *)data >> 16; +	int len = strlen(mod->name); +	const char *name = trace_kprobe_symbol(tk); +	return strncmp(mod->name, name, len) == 0 && name[len] == ':'; +} -	if (!len) -		return trace_seq_printf(s, " %s=(fault)", name); -	else -		return trace_seq_printf(s, " %s=\"%s\"", name, -					(const char *)get_loc_data(data, ent)); +static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +{ +	return !!strchr(trace_kprobe_symbol(tk), ':');  } -static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; -/* Data fetch function type */ -typedef	void (*fetch_func_t)(struct pt_regs *, void *, void *); +static int register_kprobe_event(struct trace_kprobe *tk); +static int unregister_kprobe_event(struct trace_kprobe *tk); -struct fetch_param { -	fetch_func_t	fn; -	void *data; +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, +				struct pt_regs *regs); + +/* Memory fetching by symbol */ +struct symbol_cache { +	char		*symbol; +	long		offset; +	unsigned long	addr;  }; -static __kprobes void call_fetch(struct fetch_param *fprm, -				 struct pt_regs *regs, void *dest) +unsigned long update_symbol_cache(struct symbol_cache *sc)  { -	return fprm->fn(regs, fprm->data, dest); +	sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + +	if (sc->addr) +		sc->addr += sc->offset; + +	return sc->addr;  } -#define FETCH_FUNC_NAME(method, type)	fetch_##method##_##type -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. 
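The symbol_cache helpers added above resolve a symbol-plus-offset probe target once through kallsyms_lookup_name() and cache the resulting address for later fetches. A rough in-file usage sketch (the symbol name and demo function are illustrative, error handling is trimmed, and it assumes this file's own headers):

static int __init symbol_cache_demo(void)
{
	/* something like a "@do_sys_open+0x10" target; the symbol is illustrative */
	struct symbol_cache *sc = alloc_symbol_cache("do_sys_open", 0x10);

	if (sc && sc->addr)	/* kallsyms_lookup_name() result plus the offset */
		pr_info("probe target cached at 0x%lx\n", sc->addr);

	if (sc)
		free_symbol_cache(sc);	/* frees the duplicated name and the cache */
	return 0;
}
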
- */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8)		\ -DEFINE_FETCH_##method(u16)		\ -DEFINE_FETCH_##method(u32)		\ -DEFINE_FETCH_##method(u64) - -#define CHECK_FETCH_FUNCS(method, fn)			\ -	(((FETCH_FUNC_NAME(method, u8) == fn) ||	\ -	  (FETCH_FUNC_NAME(method, u16) == fn) ||	\ -	  (FETCH_FUNC_NAME(method, u32) == fn) ||	\ -	  (FETCH_FUNC_NAME(method, u64) == fn) ||	\ -	  (FETCH_FUNC_NAME(method, string) == fn) ||	\ -	  (FETCH_FUNC_NAME(method, string_size) == fn)) \ -	 && (fn != NULL)) - -/* Data fetch function templates */ -#define DEFINE_FETCH_reg(type)						\ -static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs,	\ -					void *offset, void *dest)	\ -{									\ -	*(type *)dest = (type)regs_get_register(regs,			\ -				(unsigned int)((unsigned long)offset));	\ +void free_symbol_cache(struct symbol_cache *sc) +{ +	kfree(sc->symbol); +	kfree(sc);  } -DEFINE_BASIC_FETCH_FUNCS(reg) -/* No string on the register */ -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ +	struct symbol_cache *sc; + +	if (!sym || strlen(sym) == 0) +		return NULL; + +	sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); +	if (!sc) +		return NULL; + +	sc->symbol = kstrdup(sym, GFP_KERNEL); +	if (!sc->symbol) { +		kfree(sc); +		return NULL; +	} +	sc->offset = offset; +	update_symbol_cache(sc); + +	return sc; +} + +/* + * Kprobes-specific fetch functions + */  #define DEFINE_FETCH_stack(type)					\ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,		\  					  void *offset, void *dest)	\  {									\  	*(type *)dest = (type)regs_get_kernel_stack_nth(regs,		\  				(unsigned int)((unsigned long)offset));	\ -} +}									\ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); +  DEFINE_BASIC_FETCH_FUNCS(stack)  /* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_retval(type)					\ -static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ -					  void *dummy, void *dest)	\ -{									\ -	*(type *)dest = (type)regs_return_value(regs);			\ -} -DEFINE_BASIC_FETCH_FUNCS(retval) -/* No string on the retval */ -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL +#define fetch_stack_string	NULL +#define fetch_stack_string_size	NULL  #define DEFINE_FETCH_memory(type)					\ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,		\  					  void *addr, void *dest)	\  {									\  	type retval;							\ @@ -206,31 +154,37 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\  		*(type *)dest = 0;					\  	else								\  		*(type *)dest = retval;					\ -} +}									\ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); +  DEFINE_BASIC_FETCH_FUNCS(memory)  /*   * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max   * length and relative data location.   
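That u32 is the data_rloc encoding visible earlier in this diff (this patch drops the make_data_rloc()/get_rloc_len()/get_rloc_offs() macros from this file, presumably into the shared trace_probe.h it starts including): the upper 16 bits carry the available length, the lower 16 bits the relative offset of the string data. A worked user-space check of the packing, using nothing beyond those shifts and masks:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* make_data_rloc(100, 40): 100 bytes available, data starts 40 bytes in */
	uint32_t dl = ((uint32_t)100 << 16) | (40 & 0xffff);

	assert((dl >> 16) == 100);	/* get_rloc_len():  length field */
	assert((dl & 0xffff) == 40);	/* get_rloc_offs(): offset field */
	return 0;
}
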
*/ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, -						      void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, +					    void *addr, void *dest)  {  	long ret;  	int maxlen = get_rloc_len(*(u32 *)dest);  	u8 *dst = get_rloc_data(dest);  	u8 *src = addr;  	mm_segment_t old_fs = get_fs(); +  	if (!maxlen)  		return; +  	/*  	 * Try to get string again, since the string can be changed while  	 * probing.  	 */  	set_fs(KERNEL_DS);  	pagefault_disable(); +  	do  		ret = __copy_from_user_inatomic(dst++, src++, 1);  	while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); +  	dst[-1] = '\0';  	pagefault_enable();  	set_fs(old_fs); @@ -238,24 +192,30 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,  	if (ret < 0) {	/* Failed to fetch string */  		((u8 *)get_rloc_data(dest))[0] = '\0';  		*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); -	} else +	} else {  		*(u32 *)dest = make_data_rloc(src - (u8 *)addr,  					      get_rloc_offs(*(u32 *)dest)); +	}  } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); +  /* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, -							void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, +						 void *addr, void *dest)  { +	mm_segment_t old_fs;  	int ret, len = 0;  	u8 c; -	mm_segment_t old_fs = get_fs(); +	old_fs = get_fs();  	set_fs(KERNEL_DS);  	pagefault_disable(); +  	do {  		ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);  		len++;  	} while (c && ret == 0 && len < MAX_STRING_SIZE); +  	pagefault_enable();  	set_fs(old_fs); @@ -264,149 +224,33 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,  	else  		*(u32 *)dest = len;  } - -/* Memory fetching by symbol */ -struct symbol_cache { -	char *symbol; -	long offset; -	unsigned long addr; -}; - -static unsigned long update_symbol_cache(struct symbol_cache *sc) -{ -	sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); -	if (sc->addr) -		sc->addr += sc->offset; -	return sc->addr; -} - -static void free_symbol_cache(struct symbol_cache *sc) -{ -	kfree(sc->symbol); -	kfree(sc); -} - -static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ -	struct symbol_cache *sc; - -	if (!sym || strlen(sym) == 0) -		return NULL; -	sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); -	if (!sc) -		return NULL; - -	sc->symbol = kstrdup(sym, GFP_KERNEL); -	if (!sc->symbol) { -		kfree(sc); -		return NULL; -	} -	sc->offset = offset; - -	update_symbol_cache(sc); -	return sc; -} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));  #define DEFINE_FETCH_symbol(type)					\ -static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ -					  void *data, void *dest)	\ +void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\  {									\  	struct symbol_cache *sc = data;					\  	if (sc->addr)							\  		fetch_memory_##type(regs, (void *)sc->addr, dest);	\  	else								\  		*(type *)dest = 0;					\ -} +}									\ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); +  DEFINE_BASIC_FETCH_FUNCS(symbol)  DEFINE_FETCH_symbol(string)  DEFINE_FETCH_symbol(string_size) -/* Dereference memory access function */ -struct deref_fetch_param { -	struct fetch_param orig; -	long offset; -}; - -#define DEFINE_FETCH_deref(type)					\ -static __kprobes void FETCH_FUNC_NAME(deref, type)(struct 
pt_regs *regs,\ -					    void *data, void *dest)	\ -{									\ -	struct deref_fetch_param *dprm = data;				\ -	unsigned long addr;						\ -	call_fetch(&dprm->orig, regs, &addr);				\ -	if (addr) {							\ -		addr += dprm->offset;					\ -		fetch_memory_##type(regs, (void *)addr, dest);		\ -	} else								\ -		*(type *)dest = 0;					\ -} -DEFINE_BASIC_FETCH_FUNCS(deref) -DEFINE_FETCH_deref(string) -DEFINE_FETCH_deref(string_size) - -static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) -{ -	if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) -		free_deref_fetch_param(data->orig.data); -	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) -		free_symbol_cache(data->orig.data); -	kfree(data); -} - -/* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t -#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) -#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) -#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) - -/* Fetch types */ -enum { -	FETCH_MTD_reg = 0, -	FETCH_MTD_stack, -	FETCH_MTD_retval, -	FETCH_MTD_memory, -	FETCH_MTD_symbol, -	FETCH_MTD_deref, -	FETCH_MTD_END, -}; - -#define ASSIGN_FETCH_FUNC(method, type)	\ -	[FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) - -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype)	\ -	{.name = _name,				\ -	 .size = _size,					\ -	 .is_signed = sign,				\ -	 .print = PRINT_TYPE_FUNC_NAME(ptype),		\ -	 .fmt = PRINT_TYPE_FMT_NAME(ptype),		\ -	 .fmttype = _fmttype,				\ -	 .fetch = {					\ -ASSIGN_FETCH_FUNC(reg, ftype),				\ -ASSIGN_FETCH_FUNC(stack, ftype),			\ -ASSIGN_FETCH_FUNC(retval, ftype),			\ -ASSIGN_FETCH_FUNC(memory, ftype),			\ -ASSIGN_FETCH_FUNC(symbol, ftype),			\ -ASSIGN_FETCH_FUNC(deref, ftype),			\ -	  }						\ -	} - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign)			\ -	__ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 +/* kprobes don't support file_offset fetch methods */ +#define fetch_file_offset_u8		NULL +#define fetch_file_offset_u16		NULL +#define fetch_file_offset_u32		NULL +#define fetch_file_offset_u64		NULL +#define fetch_file_offset_string	NULL +#define fetch_file_offset_string_size	NULL  /* Fetch type information table */ -static const struct fetch_type { -	const char	*name;		/* Name of type */ -	size_t		size;		/* Byte size of type */ -	int		is_signed;	/* Signed flag */ -	print_type_func_t	print;	/* Print functions */ -	const char	*fmt;		/* Fromat string */ -	const char	*fmttype;	/* Name in format file */ -	/* Fetch functions */ -	fetch_func_t	fetch[FETCH_MTD_END]; -} fetch_type_table[] = { +const struct fetch_type kprobes_fetch_type_table[] = {  	/* Special types */  	[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,  					sizeof(u32), 1, "__data_loc char[]"), @@ -421,149 +265,49 @@ static const struct fetch_type {  	ASSIGN_FETCH_TYPE(s16, u16, 1),  	ASSIGN_FETCH_TYPE(s32, u32, 1),  	ASSIGN_FETCH_TYPE(s64, u64, 1), -}; - -static const struct fetch_type *find_fetch_type(const char *type) -{ -	int i; -	if (!type) -		type = DEFAULT_FETCH_TYPE_STR; - -	for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) -		if (strcmp(type, fetch_type_table[i].name) == 0) -			return &fetch_type_table[i]; -	return NULL; -} - -/* Special function : only accept unsigned long */ -static __kprobes void fetch_stack_address(struct pt_regs *regs, -					  void *dummy, void *dest) -{ -	*(unsigned long *)dest = kernel_stack_pointer(regs); -} - -static fetch_func_t 
get_fetch_size_function(const struct fetch_type *type, -					    fetch_func_t orig_fn) -{ -	int i; - -	if (type != &fetch_type_table[FETCH_TYPE_STRING]) -		return NULL;	/* Only string type needs size function */ -	for (i = 0; i < FETCH_MTD_END; i++) -		if (type->fetch[i] == orig_fn) -			return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; - -	WARN_ON(1);	/* This should not happen */ -	return NULL; -} - -/** - * Kprobe event core functions - */ - -struct probe_arg { -	struct fetch_param	fetch; -	struct fetch_param	fetch_size; -	unsigned int		offset;	/* Offset from argument entry */ -	const char		*name;	/* Name of this argument */ -	const char		*comm;	/* Command of this argument */ -	const struct fetch_type	*type;	/* Type of this argument */ -}; - -/* Flags for trace_probe */ -#define TP_FLAG_TRACE	1 -#define TP_FLAG_PROFILE	2 - -struct trace_probe { -	struct list_head	list; -	struct kretprobe	rp;	/* Use rp.kp for kprobe use */ -	unsigned long 		nhit; -	unsigned int		flags;	/* For TP_FLAG_* */ -	const char		*symbol;	/* symbol name */ -	struct ftrace_event_class	class; -	struct ftrace_event_call	call; -	ssize_t			size;		/* trace entry size */ -	unsigned int		nr_args; -	struct probe_arg	args[]; +	ASSIGN_FETCH_TYPE_END  }; -#define SIZEOF_TRACE_PROBE(n)			\ -	(offsetof(struct trace_probe, args) +	\ -	(sizeof(struct probe_arg) * (n))) - - -static __kprobes int probe_is_return(struct trace_probe *tp) -{ -	return tp->rp.handler != NULL; -} - -static __kprobes const char *probe_symbol(struct trace_probe *tp) -{ -	return tp->symbol ? tp->symbol : "unknown"; -} - -static int register_probe_event(struct trace_probe *tp); -static void unregister_probe_event(struct trace_probe *tp); - -static DEFINE_MUTEX(probe_lock); -static LIST_HEAD(probe_list); - -static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); -static int kretprobe_dispatcher(struct kretprobe_instance *ri, -				struct pt_regs *regs); - -/* Check the name is good for event/group/fields */ -static int is_good_name(const char *name) -{ -	if (!isalpha(*name) && *name != '_') -		return 0; -	while (*++name != '\0') { -		if (!isalpha(*name) && !isdigit(*name) && *name != '_') -			return 0; -	} -	return 1; -} -  /*   * Allocate new trace_probe and initialize it (including kprobes).   
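The allocation in the function below is sized with SIZEOF_TRACE_KPROBE(), the usual offsetof()-based idiom for a structure whose last member is a flexible array of probe_arg. A stand-alone illustration of the same idiom with generic names (a sketch, not the kernel structures themselves):

#include <stddef.h>
#include <stdlib.h>

struct arg { const char *name; };

struct probe_like {
	unsigned int	nr_args;
	struct arg	args[];		/* flexible array member */
};

/* header plus n trailing elements, which is what SIZEOF_TRACE_KPROBE(n) computes */
static struct probe_like *alloc_probe_like(unsigned int nargs)
{
	struct probe_like *p = calloc(1, offsetof(struct probe_like, args) +
					 sizeof(struct arg) * nargs);

	if (p)
		p->nr_args = nargs;
	return p;
}
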
*/ -static struct trace_probe *alloc_trace_probe(const char *group, +static struct trace_kprobe *alloc_trace_kprobe(const char *group,  					     const char *event,  					     void *addr,  					     const char *symbol,  					     unsigned long offs, -					     int nargs, int is_return) +					     int nargs, bool is_return)  { -	struct trace_probe *tp; +	struct trace_kprobe *tk;  	int ret = -ENOMEM; -	tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); -	if (!tp) +	tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); +	if (!tk)  		return ERR_PTR(ret);  	if (symbol) { -		tp->symbol = kstrdup(symbol, GFP_KERNEL); -		if (!tp->symbol) +		tk->symbol = kstrdup(symbol, GFP_KERNEL); +		if (!tk->symbol)  			goto error; -		tp->rp.kp.symbol_name = tp->symbol; -		tp->rp.kp.offset = offs; +		tk->rp.kp.symbol_name = tk->symbol; +		tk->rp.kp.offset = offs;  	} else -		tp->rp.kp.addr = addr; +		tk->rp.kp.addr = addr;  	if (is_return) -		tp->rp.handler = kretprobe_dispatcher; +		tk->rp.handler = kretprobe_dispatcher;  	else -		tp->rp.kp.pre_handler = kprobe_dispatcher; +		tk->rp.kp.pre_handler = kprobe_dispatcher;  	if (!event || !is_good_name(event)) {  		ret = -EINVAL;  		goto error;  	} -	tp->call.class = &tp->class; -	tp->call.name = kstrdup(event, GFP_KERNEL); -	if (!tp->call.name) +	tk->tp.call.class = &tk->tp.class; +	tk->tp.call.name = kstrdup(event, GFP_KERNEL); +	if (!tk->tp.call.name)  		goto error;  	if (!group || !is_good_name(group)) { @@ -571,299 +315,283 @@ static struct trace_probe *alloc_trace_probe(const char *group,  		goto error;  	} -	tp->class.system = kstrdup(group, GFP_KERNEL); -	if (!tp->class.system) +	tk->tp.class.system = kstrdup(group, GFP_KERNEL); +	if (!tk->tp.class.system)  		goto error; -	INIT_LIST_HEAD(&tp->list); -	return tp; +	INIT_LIST_HEAD(&tk->list); +	INIT_LIST_HEAD(&tk->tp.files); +	return tk;  error: -	kfree(tp->call.name); -	kfree(tp->symbol); -	kfree(tp); +	kfree(tk->tp.call.name); +	kfree(tk->symbol); +	kfree(tk);  	return ERR_PTR(ret);  } -static void free_probe_arg(struct probe_arg *arg) -{ -	if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) -		free_deref_fetch_param(arg->fetch.data); -	else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) -		free_symbol_cache(arg->fetch.data); -	kfree(arg->name); -	kfree(arg->comm); -} - -static void free_trace_probe(struct trace_probe *tp) +static void free_trace_kprobe(struct trace_kprobe *tk)  {  	int i; -	for (i = 0; i < tp->nr_args; i++) -		free_probe_arg(&tp->args[i]); +	for (i = 0; i < tk->tp.nr_args; i++) +		traceprobe_free_probe_arg(&tk->tp.args[i]); -	kfree(tp->call.class->system); -	kfree(tp->call.name); -	kfree(tp->symbol); -	kfree(tp); +	kfree(tk->tp.call.class->system); +	kfree(tk->tp.call.name); +	kfree(tk->symbol); +	kfree(tk);  } -static struct trace_probe *find_probe_event(const char *event, -					    const char *group) +static struct trace_kprobe *find_trace_kprobe(const char *event, +					      const char *group)  { -	struct trace_probe *tp; +	struct trace_kprobe *tk; -	list_for_each_entry(tp, &probe_list, list) -		if (strcmp(tp->call.name, event) == 0 && -		    strcmp(tp->call.class->system, group) == 0) -			return tp; +	list_for_each_entry(tk, &probe_list, list) +		if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 && +		    strcmp(tk->tp.call.class->system, group) == 0) +			return tk;  	return NULL;  } -/* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static void unregister_trace_probe(struct trace_probe *tp) +/* + * Enable trace_probe + * if the file is NULL, enable "perf" 
handler, or enable "trace" handler. + */ +static int +enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)  { -	if (probe_is_return(tp)) -		unregister_kretprobe(&tp->rp); -	else -		unregister_kprobe(&tp->rp.kp); -	list_del(&tp->list); -	unregister_probe_event(tp); -} +	int ret = 0; -/* Register a trace_probe and probe_event */ -static int register_trace_probe(struct trace_probe *tp) -{ -	struct trace_probe *old_tp; -	int ret; +	if (file) { +		struct event_file_link *link; -	mutex_lock(&probe_lock); +		link = kmalloc(sizeof(*link), GFP_KERNEL); +		if (!link) { +			ret = -ENOMEM; +			goto out; +		} -	/* register as an event */ -	old_tp = find_probe_event(tp->call.name, tp->call.class->system); -	if (old_tp) { -		/* delete old event */ -		unregister_trace_probe(old_tp); -		free_trace_probe(old_tp); -	} -	ret = register_probe_event(tp); -	if (ret) { -		pr_warning("Failed to register probe event(%d)\n", ret); -		goto end; +		link->file = file; +		list_add_tail_rcu(&link->list, &tk->tp.files); + +		tk->tp.flags |= TP_FLAG_TRACE; +	} else +		tk->tp.flags |= TP_FLAG_PROFILE; + +	if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { +		if (trace_kprobe_is_return(tk)) +			ret = enable_kretprobe(&tk->rp); +		else +			ret = enable_kprobe(&tk->rp.kp);  	} + out: +	return ret; +} -	tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; -	if (probe_is_return(tp)) -		ret = register_kretprobe(&tp->rp); -	else -		ret = register_kprobe(&tp->rp.kp); +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int +disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) +{ +	struct event_file_link *link = NULL; +	int wait = 0; +	int ret = 0; -	if (ret) { -		pr_warning("Could not insert probe(%d)\n", ret); -		if (ret == -EILSEQ) { -			pr_warning("Probing address(0x%p) is not an " -				   "instruction boundary.\n", -				   tp->rp.kp.addr); +	if (file) { +		link = find_event_file_link(&tk->tp, file); +		if (!link) {  			ret = -EINVAL; +			goto out;  		} -		unregister_probe_event(tp); + +		list_del_rcu(&link->list); +		wait = 1; +		if (!list_empty(&tk->tp.files)) +			goto out; + +		tk->tp.flags &= ~TP_FLAG_TRACE;  	} else -		list_add_tail(&tp->list, &probe_list); -end: -	mutex_unlock(&probe_lock); +		tk->tp.flags &= ~TP_FLAG_PROFILE; + +	if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { +		if (trace_kprobe_is_return(tk)) +			disable_kretprobe(&tk->rp); +		else +			disable_kprobe(&tk->rp.kp); +		wait = 1; +	} + out: +	if (wait) { +		/* +		 * Synchronize with kprobe_trace_func/kretprobe_trace_func +		 * to ensure disabled (all running handlers are finished). +		 * This is not only for kfree(), but also the caller, +		 * trace_remove_event_call() supposes it for releasing +		 * event_call related objects, which will be accessed in +		 * the kprobe_trace_func/kretprobe_trace_func. +		 */ +		synchronize_sched(); +		kfree(link);	/* Ignored if link == NULL */ +	} +  	return ret;  } -/* Split symbol and offset. 
*/ -static int split_symbol_offset(char *symbol, unsigned long *offset) +/* Internal register function - just handle k*probes and flags */ +static int __register_trace_kprobe(struct trace_kprobe *tk)  { -	char *tmp; -	int ret; +	int i, ret; -	if (!offset) +	if (trace_probe_is_registered(&tk->tp))  		return -EINVAL; -	tmp = strchr(symbol, '+'); -	if (tmp) { -		/* skip sign because strict_strtol doesn't accept '+' */ -		ret = strict_strtoul(tmp + 1, 0, offset); -		if (ret) -			return ret; -		*tmp = '\0'; -	} else -		*offset = 0; -	return 0; -} +	for (i = 0; i < tk->tp.nr_args; i++) +		traceprobe_update_arg(&tk->tp.args[i]); -#define PARAM_MAX_ARGS 16 -#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) +	/* Set/clear disabled flag according to tp->flag */ +	if (trace_probe_is_enabled(&tk->tp)) +		tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; +	else +		tk->rp.kp.flags |= KPROBE_FLAG_DISABLED; -static int parse_probe_vars(char *arg, const struct fetch_type *t, -			    struct fetch_param *f, int is_return) -{ -	int ret = 0; -	unsigned long param; +	if (trace_kprobe_is_return(tk)) +		ret = register_kretprobe(&tk->rp); +	else +		ret = register_kprobe(&tk->rp.kp); -	if (strcmp(arg, "retval") == 0) { -		if (is_return) -			f->fn = t->fetch[FETCH_MTD_retval]; -		else -			ret = -EINVAL; -	} else if (strncmp(arg, "stack", 5) == 0) { -		if (arg[5] == '\0') { -			if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) -				f->fn = fetch_stack_address; -			else -				ret = -EINVAL; -		} else if (isdigit(arg[5])) { -			ret = strict_strtoul(arg + 5, 10, ¶m); -			if (ret || param > PARAM_MAX_STACK) -				ret = -EINVAL; -			else { -				f->fn = t->fetch[FETCH_MTD_stack]; -				f->data = (void *)param; -			} -		} else +	if (ret == 0) +		tk->tp.flags |= TP_FLAG_REGISTERED; +	else { +		pr_warning("Could not insert probe at %s+%lu: %d\n", +			   trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); +		if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { +			pr_warning("This probe might be able to register after" +				   "target module is loaded. 
Continue.\n"); +			ret = 0; +		} else if (ret == -EILSEQ) { +			pr_warning("Probing address(0x%p) is not an " +				   "instruction boundary.\n", +				   tk->rp.kp.addr);  			ret = -EINVAL; -	} else -		ret = -EINVAL; +		} +	} +  	return ret;  } -/* Recursive argument parser */ -static int __parse_probe_arg(char *arg, const struct fetch_type *t, -			     struct fetch_param *f, int is_return) +/* Internal unregister function - just handle k*probes and flags */ +static void __unregister_trace_kprobe(struct trace_kprobe *tk)  { -	int ret = 0; -	unsigned long param; -	long offset; -	char *tmp; - -	switch (arg[0]) { -	case '$': -		ret = parse_probe_vars(arg + 1, t, f, is_return); -		break; -	case '%':	/* named register */ -		ret = regs_query_register_offset(arg + 1); -		if (ret >= 0) { -			f->fn = t->fetch[FETCH_MTD_reg]; -			f->data = (void *)(unsigned long)ret; -			ret = 0; -		} -		break; -	case '@':	/* memory or symbol */ -		if (isdigit(arg[1])) { -			ret = strict_strtoul(arg + 1, 0, ¶m); -			if (ret) -				break; -			f->fn = t->fetch[FETCH_MTD_memory]; -			f->data = (void *)param; -		} else { -			ret = split_symbol_offset(arg + 1, &offset); -			if (ret) -				break; -			f->data = alloc_symbol_cache(arg + 1, offset); -			if (f->data) -				f->fn = t->fetch[FETCH_MTD_symbol]; -		} -		break; -	case '+':	/* deref memory */ -	case '-': -		tmp = strchr(arg, '('); -		if (!tmp) -			break; -		*tmp = '\0'; -		ret = strict_strtol(arg + 1, 0, &offset); -		if (ret) -			break; -		if (arg[0] == '-') -			offset = -offset; -		arg = tmp + 1; -		tmp = strrchr(arg, ')'); -		if (tmp) { -			struct deref_fetch_param *dprm; -			const struct fetch_type *t2 = find_fetch_type(NULL); -			*tmp = '\0'; -			dprm = kzalloc(sizeof(struct deref_fetch_param), -				       GFP_KERNEL); -			if (!dprm) -				return -ENOMEM; -			dprm->offset = offset; -			ret = __parse_probe_arg(arg, t2, &dprm->orig, -						is_return); -			if (ret) -				kfree(dprm); -			else { -				f->fn = t->fetch[FETCH_MTD_deref]; -				f->data = (void *)dprm; -			} -		} -		break; -	} -	if (!ret && !f->fn) {	/* Parsed, but do not find fetch method */ -		pr_info("%s type has no corresponding fetch method.\n", -			t->name); -		ret = -EINVAL; +	if (trace_probe_is_registered(&tk->tp)) { +		if (trace_kprobe_is_return(tk)) +			unregister_kretprobe(&tk->rp); +		else +			unregister_kprobe(&tk->rp.kp); +		tk->tp.flags &= ~TP_FLAG_REGISTERED; +		/* Cleanup kprobe for reuse */ +		if (tk->rp.kp.symbol_name) +			tk->rp.kp.addr = NULL;  	} -	return ret;  } -/* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct trace_probe *tp, -			   struct probe_arg *parg, int is_return) +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static int unregister_trace_kprobe(struct trace_kprobe *tk)  { -	const char *t; +	/* Enabled event can not be unregistered */ +	if (trace_probe_is_enabled(&tk->tp)) +		return -EBUSY; + +	/* Will fail if probe is being used by ftrace or perf */ +	if (unregister_kprobe_event(tk)) +		return -EBUSY; + +	__unregister_trace_kprobe(tk); +	list_del(&tk->list); + +	return 0; +} + +/* Register a trace_probe and probe_event */ +static int register_trace_kprobe(struct trace_kprobe *tk) +{ +	struct trace_kprobe *old_tk;  	int ret; -	if (strlen(arg) > MAX_ARGSTR_LEN) { -		pr_info("Argument is too long.: %s\n",  arg); -		return -ENOSPC; -	} -	parg->comm = kstrdup(arg, GFP_KERNEL); -	if (!parg->comm) { -		pr_info("Failed to allocate memory for command '%s'.\n", arg); -		return -ENOMEM; -	} -	t = strchr(parg->comm, ':'); -	
if (t) { -		arg[t - parg->comm] = '\0'; -		t++; -	} -	parg->type = find_fetch_type(t); -	if (!parg->type) { -		pr_info("Unsupported type: %s\n", t); -		return -EINVAL; +	mutex_lock(&probe_lock); + +	/* Delete old (same name) event if exist */ +	old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call), +			tk->tp.call.class->system); +	if (old_tk) { +		ret = unregister_trace_kprobe(old_tk); +		if (ret < 0) +			goto end; +		free_trace_kprobe(old_tk);  	} -	parg->offset = tp->size; -	tp->size += parg->type->size; -	ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); -	if (ret >= 0) { -		parg->fetch_size.fn = get_fetch_size_function(parg->type, -							      parg->fetch.fn); -		parg->fetch_size.data = parg->fetch.data; + +	/* Register new event */ +	ret = register_kprobe_event(tk); +	if (ret) { +		pr_warning("Failed to register probe event(%d)\n", ret); +		goto end;  	} + +	/* Register k*probe */ +	ret = __register_trace_kprobe(tk); +	if (ret < 0) +		unregister_kprobe_event(tk); +	else +		list_add_tail(&tk->list, &probe_list); + +end: +	mutex_unlock(&probe_lock);  	return ret;  } -/* Return 1 if name is reserved or already used by another argument */ -static int conflict_field_name(const char *name, -			       struct probe_arg *args, int narg) +/* Module notifier call back, checking event on the module */ +static int trace_kprobe_module_callback(struct notifier_block *nb, +				       unsigned long val, void *data)  { -	int i; -	for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) -		if (strcmp(reserved_field_names[i], name) == 0) -			return 1; -	for (i = 0; i < narg; i++) -		if (strcmp(args[i].name, name) == 0) -			return 1; -	return 0; +	struct module *mod = data; +	struct trace_kprobe *tk; +	int ret; + +	if (val != MODULE_STATE_COMING) +		return NOTIFY_DONE; + +	/* Update probes on coming module */ +	mutex_lock(&probe_lock); +	list_for_each_entry(tk, &probe_list, list) { +		if (trace_kprobe_within_module(tk, mod)) { +			/* Don't need to check busy - this should have gone. */ +			__unregister_trace_kprobe(tk); +			ret = __register_trace_kprobe(tk); +			if (ret) +				pr_warning("Failed to re-register probe %s on" +					   "%s: %d\n", +					   ftrace_event_name(&tk->tp.call), +					   mod->name, ret); +		} +	} +	mutex_unlock(&probe_lock); + +	return NOTIFY_DONE;  } -static int create_trace_probe(int argc, char **argv) +static struct notifier_block trace_kprobe_module_nb = { +	.notifier_call = trace_kprobe_module_callback, +	.priority = 1	/* Invoked after kprobe module callback */ +}; + +static int create_trace_kprobe(int argc, char **argv)  {  	/*  	 * Argument syntax: -	 *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] -	 *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] +	 *  - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] +	 *  - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]  	 * Fetch args:  	 *  $retval	: fetch return value  	 *  $stack	: fetch stack address @@ -878,9 +606,9 @@ static int create_trace_probe(int argc, char **argv)  	 * Type of args:  	 *  FETCHARG:TYPE : use TYPE instead of unsigned long.  	 
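This syntax is what user space writes into the kprobe_events control file, one definition per line. A small writer sketch for context (the tracefs path, helper name and the probed symbol/event names are illustrative; opening the file with O_TRUNC instead clears every registered probe, which is what probes_open() further down implements):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* e.g. kprobe_events_add("r:myretprobe do_sys_open ret=$retval\n");
 *      kprobe_events_add("-:myretprobe\n");   removes it again      */
static int kprobe_events_add(const char *def)
{
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, def, strlen(def));
	close(fd);
	return n == (ssize_t)strlen(def) ? 0 : -1;
}
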
*/ -	struct trace_probe *tp; +	struct trace_kprobe *tk;  	int i, ret = 0; -	int is_return = 0, is_delete = 0; +	bool is_return = false, is_delete = false;  	char *symbol = NULL, *event = NULL, *group = NULL;  	char *arg;  	unsigned long offset = 0; @@ -889,11 +617,11 @@ static int create_trace_probe(int argc, char **argv)  	/* argc must be >= 1 */  	if (argv[0][0] == 'p') -		is_return = 0; +		is_return = false;  	else if (argv[0][0] == 'r') -		is_return = 1; +		is_return = true;  	else if (argv[0][0] == '-') -		is_delete = 1; +		is_delete = true;  	else {  		pr_info("Probe definition must be started with 'p', 'r' or"  			" '-'.\n"); @@ -925,17 +653,18 @@ static int create_trace_probe(int argc, char **argv)  			return -EINVAL;  		}  		mutex_lock(&probe_lock); -		tp = find_probe_event(event, group); -		if (!tp) { +		tk = find_trace_kprobe(event, group); +		if (!tk) {  			mutex_unlock(&probe_lock);  			pr_info("Event %s/%s doesn't exist.\n", group, event);  			return -ENOENT;  		}  		/* delete an event */ -		unregister_trace_probe(tp); -		free_trace_probe(tp); +		ret = unregister_trace_kprobe(tk); +		if (ret == 0) +			free_trace_kprobe(tk);  		mutex_unlock(&probe_lock); -		return 0; +		return ret;  	}  	if (argc < 2) { @@ -948,7 +677,7 @@ static int create_trace_probe(int argc, char **argv)  			return -EINVAL;  		}  		/* an address specified */ -		ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); +		ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);  		if (ret) {  			pr_info("Failed to parse address.\n");  			return ret; @@ -957,7 +686,7 @@ static int create_trace_probe(int argc, char **argv)  		/* a symbol specified */  		symbol = argv[1];  		/* TODO: support .init module functions */ -		ret = split_symbol_offset(symbol, &offset); +		ret = traceprobe_split_symbol_offset(symbol, &offset);  		if (ret) {  			pr_info("Failed to parse symbol.\n");  			return ret; @@ -980,46 +709,49 @@ static int create_trace_probe(int argc, char **argv)  				 is_return ? 
'r' : 'p', addr);  		event = buf;  	} -	tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, +	tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,  			       is_return); -	if (IS_ERR(tp)) { +	if (IS_ERR(tk)) {  		pr_info("Failed to allocate trace_probe.(%d)\n", -			(int)PTR_ERR(tp)); -		return PTR_ERR(tp); +			(int)PTR_ERR(tk)); +		return PTR_ERR(tk);  	}  	/* parse arguments */  	ret = 0;  	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { +		struct probe_arg *parg = &tk->tp.args[i]; +  		/* Increment count for freeing args in error case */ -		tp->nr_args++; +		tk->tp.nr_args++;  		/* Parse argument name */  		arg = strchr(argv[i], '=');  		if (arg) {  			*arg++ = '\0'; -			tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); +			parg->name = kstrdup(argv[i], GFP_KERNEL);  		} else {  			arg = argv[i];  			/* If argument name is omitted, set "argN" */  			snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); -			tp->args[i].name = kstrdup(buf, GFP_KERNEL); +			parg->name = kstrdup(buf, GFP_KERNEL);  		} -		if (!tp->args[i].name) { +		if (!parg->name) {  			pr_info("Failed to allocate argument[%d] name.\n", i);  			ret = -ENOMEM;  			goto error;  		} -		if (!is_good_name(tp->args[i].name)) { +		if (!is_good_name(parg->name)) {  			pr_info("Invalid argument[%d] name: %s\n", -				i, tp->args[i].name); +				i, parg->name);  			ret = -EINVAL;  			goto error;  		} -		if (conflict_field_name(tp->args[i].name, tp->args, i)) { +		if (traceprobe_conflict_field_name(parg->name, +							tk->tp.args, i)) {  			pr_info("Argument[%d] name '%s' conflicts with "  				"another field.\n", i, argv[i]);  			ret = -EINVAL; @@ -1027,37 +759,50 @@ static int create_trace_probe(int argc, char **argv)  		}  		/* Parse fetch argument */ -		ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); +		ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, +						is_return, true);  		if (ret) {  			pr_info("Parse error at argument[%d]. (%d)\n", i, ret);  			goto error;  		}  	} -	ret = register_trace_probe(tp); +	ret = register_trace_kprobe(tk);  	if (ret)  		goto error;  	return 0;  error: -	free_trace_probe(tp); +	free_trace_kprobe(tk);  	return ret;  } -static void cleanup_all_probes(void) +static int release_all_trace_kprobes(void)  { -	struct trace_probe *tp; +	struct trace_kprobe *tk; +	int ret = 0;  	mutex_lock(&probe_lock); +	/* Ensure no probe is in use. */ +	list_for_each_entry(tk, &probe_list, list) +		if (trace_probe_is_enabled(&tk->tp)) { +			ret = -EBUSY; +			goto end; +		}  	/* TODO: Use batch unregistration */  	while (!list_empty(&probe_list)) { -		tp = list_entry(probe_list.next, struct trace_probe, list); -		unregister_trace_probe(tp); -		free_trace_probe(tp); +		tk = list_entry(probe_list.next, struct trace_kprobe, list); +		ret = unregister_trace_kprobe(tk); +		if (ret) +			goto end; +		free_trace_kprobe(tk);  	} + +end:  	mutex_unlock(&probe_lock); -} +	return ret; +}  /* Probes listing interfaces */  static void *probes_seq_start(struct seq_file *m, loff_t *pos) @@ -1078,21 +823,23 @@ static void probes_seq_stop(struct seq_file *m, void *v)  static int probes_seq_show(struct seq_file *m, void *v)  { -	struct trace_probe *tp = v; +	struct trace_kprobe *tk = v;  	int i; -	seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); -	seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); +	seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 
'r' : 'p'); +	seq_printf(m, ":%s/%s", tk->tp.call.class->system, +			ftrace_event_name(&tk->tp.call)); -	if (!tp->symbol) -		seq_printf(m, " 0x%p", tp->rp.kp.addr); -	else if (tp->rp.kp.offset) -		seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); +	if (!tk->symbol) +		seq_printf(m, " 0x%p", tk->rp.kp.addr); +	else if (tk->rp.kp.offset) +		seq_printf(m, " %s+%u", trace_kprobe_symbol(tk), +			   tk->rp.kp.offset);  	else -		seq_printf(m, " %s", probe_symbol(tp)); +		seq_printf(m, " %s", trace_kprobe_symbol(tk)); -	for (i = 0; i < tp->nr_args; i++) -		seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); +	for (i = 0; i < tk->tp.nr_args; i++) +		seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);  	seq_printf(m, "\n");  	return 0; @@ -1107,77 +854,22 @@ static const struct seq_operations probes_seq_op = {  static int probes_open(struct inode *inode, struct file *file)  { -	if ((file->f_mode & FMODE_WRITE) && -	    (file->f_flags & O_TRUNC)) -		cleanup_all_probes(); - -	return seq_open(file, &probes_seq_op); -} - -static int command_trace_probe(const char *buf) -{ -	char **argv; -	int argc = 0, ret = 0; - -	argv = argv_split(GFP_KERNEL, buf, &argc); -	if (!argv) -		return -ENOMEM; +	int ret; -	if (argc) -		ret = create_trace_probe(argc, argv); +	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { +		ret = release_all_trace_kprobes(); +		if (ret < 0) +			return ret; +	} -	argv_free(argv); -	return ret; +	return seq_open(file, &probes_seq_op);  } -#define WRITE_BUFSIZE 128 -  static ssize_t probes_write(struct file *file, const char __user *buffer,  			    size_t count, loff_t *ppos)  { -	char *kbuf, *tmp; -	int ret; -	size_t done; -	size_t size; - -	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); -	if (!kbuf) -		return -ENOMEM; - -	ret = done = 0; -	while (done < count) { -		size = count - done; -		if (size >= WRITE_BUFSIZE) -			size = WRITE_BUFSIZE - 1; -		if (copy_from_user(kbuf, buffer + done, size)) { -			ret = -EFAULT; -			goto out; -		} -		kbuf[size] = '\0'; -		tmp = strchr(kbuf, '\n'); -		if (tmp) { -			*tmp = '\0'; -			size = tmp - kbuf + 1; -		} else if (done + size < count) { -			pr_warning("Line length is too long: " -				   "Should be less than %d.", WRITE_BUFSIZE); -			ret = -EINVAL; -			goto out; -		} -		done += size; -		/* Remove comments */ -		tmp = strchr(kbuf, '#'); -		if (tmp) -			*tmp = '\0'; - -		ret = command_trace_probe(kbuf); -		if (ret) -			goto out; -	} -	ret = done; -out: -	kfree(kbuf); -	return ret; +	return traceprobe_probes_write(file, buffer, count, ppos, +			create_trace_kprobe);  }  static const struct file_operations kprobe_events_ops = { @@ -1192,10 +884,11 @@ static const struct file_operations kprobe_events_ops = {  /* Probes profiling interfaces */  static int probes_profile_seq_show(struct seq_file *m, void *v)  { -	struct trace_probe *tp = v; +	struct trace_kprobe *tk = v; -	seq_printf(m, "  %-44s %15lu %15lu\n", tp->call.name, tp->nhit, -		   tp->rp.kp.nmissed); +	seq_printf(m, "  %-44s %15lu %15lu\n", +		   ftrace_event_name(&tk->tp.call), tk->nhit, +		   tk->rp.kp.nmissed);  	return 0;  } @@ -1220,120 +913,105 @@ static const struct file_operations kprobe_profile_ops = {  	.release        = seq_release,  }; -/* Sum up total data length for dynamic arraies (strings) */ -static __kprobes int __get_data_size(struct trace_probe *tp, -				     struct pt_regs *regs) -{ -	int i, ret = 0; -	u32 len; - -	for (i = 0; i < tp->nr_args; i++) -		if (unlikely(tp->args[i].fetch_size.fn)) { -			
call_fetch(&tp->args[i].fetch_size, regs, &len); -			ret += len; -		} - -	return ret; -} - -/* Store the value of each argument */ -static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, -				       struct pt_regs *regs, -				       u8 *data, int maxlen) -{ -	int i; -	u32 end = tp->size; -	u32 *dl;	/* Data (relative) location */ - -	for (i = 0; i < tp->nr_args; i++) { -		if (unlikely(tp->args[i].fetch_size.fn)) { -			/* -			 * First, we set the relative location and -			 * maximum data length to *dl -			 */ -			dl = (u32 *)(data + tp->args[i].offset); -			*dl = make_data_rloc(maxlen, end - tp->args[i].offset); -			/* Then try to fetch string or dynamic array data */ -			call_fetch(&tp->args[i].fetch, regs, dl); -			/* Reduce maximum length */ -			end += get_rloc_len(*dl); -			maxlen -= get_rloc_len(*dl); -			/* Trick here, convert data_rloc to data_loc */ -			*dl = convert_rloc_to_loc(*dl, -				 ent_size + tp->args[i].offset); -		} else -			/* Just fetching data normally */ -			call_fetch(&tp->args[i].fetch, regs, -				   data + tp->args[i].offset); -	} -} -  /* Kprobe handler */ -static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +static nokprobe_inline void +__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, +		    struct ftrace_event_file *ftrace_file)  { -	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);  	struct kprobe_trace_entry_head *entry;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	int size, dsize, pc;  	unsigned long irq_flags; -	struct ftrace_event_call *call = &tp->call; +	struct ftrace_event_call *call = &tk->tp.call; -	tp->nhit++; +	WARN_ON(call != ftrace_file->event_call); + +	if (ftrace_trigger_soft_disabled(ftrace_file)) +		return;  	local_save_flags(irq_flags);  	pc = preempt_count(); -	dsize = __get_data_size(tp, regs); -	size = sizeof(*entry) + tp->size + dsize; +	dsize = __get_data_size(&tk->tp, regs); +	size = sizeof(*entry) + tk->tp.size + dsize; -	event = trace_current_buffer_lock_reserve(&buffer, call->event.type, -						  size, irq_flags, pc); +	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, +						call->event.type, +						size, irq_flags, pc);  	if (!event)  		return;  	entry = ring_buffer_event_data(event); -	entry->ip = (unsigned long)kp->addr; -	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); +	entry->ip = (unsigned long)tk->rp.kp.addr; +	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + +	event_trigger_unlock_commit_regs(ftrace_file, buffer, event, +					 entry, irq_flags, pc, regs); +} -	if (!filter_current_check_discard(buffer, call, entry, event)) -		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); +static void +kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) +{ +	struct event_file_link *link; + +	list_for_each_entry_rcu(link, &tk->tp.files, list) +		__kprobe_trace_func(tk, regs, link->file);  } +NOKPROBE_SYMBOL(kprobe_trace_func);  /* Kretprobe handler */ -static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, -					  struct pt_regs *regs) +static nokprobe_inline void +__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, +		       struct pt_regs *regs, +		       struct ftrace_event_file *ftrace_file)  { -	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);  	struct kretprobe_trace_entry_head *entry;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	int size, pc, dsize;  	
unsigned long irq_flags; -	struct ftrace_event_call *call = &tp->call; +	struct ftrace_event_call *call = &tk->tp.call; + +	WARN_ON(call != ftrace_file->event_call); + +	if (ftrace_trigger_soft_disabled(ftrace_file)) +		return;  	local_save_flags(irq_flags);  	pc = preempt_count(); -	dsize = __get_data_size(tp, regs); -	size = sizeof(*entry) + tp->size + dsize; +	dsize = __get_data_size(&tk->tp, regs); +	size = sizeof(*entry) + tk->tp.size + dsize; -	event = trace_current_buffer_lock_reserve(&buffer, call->event.type, -						  size, irq_flags, pc); +	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, +						call->event.type, +						size, irq_flags, pc);  	if (!event)  		return;  	entry = ring_buffer_event_data(event); -	entry->func = (unsigned long)tp->rp.kp.addr; +	entry->func = (unsigned long)tk->rp.kp.addr;  	entry->ret_ip = (unsigned long)ri->ret_addr; -	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); +	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); -	if (!filter_current_check_discard(buffer, call, entry, event)) -		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); +	event_trigger_unlock_commit_regs(ftrace_file, buffer, event, +					 entry, irq_flags, pc, regs);  } +static void +kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, +		     struct pt_regs *regs) +{ +	struct event_file_link *link; + +	list_for_each_entry_rcu(link, &tk->tp.files, list) +		__kretprobe_trace_func(tk, ri, regs, link->file); +} +NOKPROBE_SYMBOL(kretprobe_trace_func); +  /* Event entry printers */ -enum print_line_t +static enum print_line_t  print_kprobe_event(struct trace_iterator *iter, int flags,  		   struct trace_event *event)  { @@ -1346,7 +1024,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,  	field = (struct kprobe_trace_entry_head *)iter->ent;  	tp = container_of(event, struct trace_probe, call.event); -	if (!trace_seq_printf(s, "%s: (", tp->call.name)) +	if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))  		goto partial;  	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1369,7 +1047,7 @@ partial:  	return TRACE_TYPE_PARTIAL_LINE;  } -enum print_line_t +static enum print_line_t  print_kretprobe_event(struct trace_iterator *iter, int flags,  		      struct trace_event *event)  { @@ -1382,7 +1060,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,  	field = (struct kretprobe_trace_entry_head *)iter->ent;  	tp = container_of(event, struct trace_probe, call.event); -	if (!trace_seq_printf(s, "%s: (", tp->call.name)) +	if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))  		goto partial;  	if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1411,55 +1089,23 @@ partial:  	return TRACE_TYPE_PARTIAL_LINE;  } -static int probe_event_enable(struct ftrace_event_call *call) -{ -	struct trace_probe *tp = (struct trace_probe *)call->data; - -	tp->flags |= TP_FLAG_TRACE; -	if (probe_is_return(tp)) -		return enable_kretprobe(&tp->rp); -	else -		return enable_kprobe(&tp->rp.kp); -} - -static void probe_event_disable(struct ftrace_event_call *call) -{ -	struct trace_probe *tp = (struct trace_probe *)call->data; - -	tp->flags &= ~TP_FLAG_TRACE; -	if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { -		if (probe_is_return(tp)) -			disable_kretprobe(&tp->rp); -		else -			disable_kprobe(&tp->rp.kp); -	} -} - -#undef DEFINE_FIELD -#define DEFINE_FIELD(type, item, name, is_signed)			\ -	do {								\ -		ret = 
trace_define_field(event_call, #type, name,	\ -					 offsetof(typeof(field), item),	\ -					 sizeof(field.item), is_signed, \ -					 FILTER_OTHER);			\ -		if (ret)						\ -			return ret;					\ -	} while (0)  static int kprobe_event_define_fields(struct ftrace_event_call *event_call)  {  	int ret, i;  	struct kprobe_trace_entry_head field; -	struct trace_probe *tp = (struct trace_probe *)event_call->data; +	struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;  	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);  	/* Set argument names as fields */ -	for (i = 0; i < tp->nr_args; i++) { -		ret = trace_define_field(event_call, tp->args[i].type->fmttype, -					 tp->args[i].name, -					 sizeof(field) + tp->args[i].offset, -					 tp->args[i].type->size, -					 tp->args[i].type->is_signed, +	for (i = 0; i < tk->tp.nr_args; i++) { +		struct probe_arg *parg = &tk->tp.args[i]; + +		ret = trace_define_field(event_call, parg->type->fmttype, +					 parg->name, +					 sizeof(field) + parg->offset, +					 parg->type->size, +					 parg->type->is_signed,  					 FILTER_OTHER);  		if (ret)  			return ret; @@ -1471,17 +1117,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)  {  	int ret, i;  	struct kretprobe_trace_entry_head field; -	struct trace_probe *tp = (struct trace_probe *)event_call->data; +	struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;  	DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);  	DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);  	/* Set argument names as fields */ -	for (i = 0; i < tp->nr_args; i++) { -		ret = trace_define_field(event_call, tp->args[i].type->fmttype, -					 tp->args[i].name, -					 sizeof(field) + tp->args[i].offset, -					 tp->args[i].type->size, -					 tp->args[i].type->is_signed, +	for (i = 0; i < tk->tp.nr_args; i++) { +		struct probe_arg *parg = &tk->tp.args[i]; + +		ret = trace_define_field(event_call, parg->type->fmttype, +					 parg->name, +					 sizeof(field) + parg->offset, +					 parg->type->size, +					 parg->type->is_signed,  					 FILTER_OTHER);  		if (ret)  			return ret; @@ -1489,206 +1137,135 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)  	return 0;  } -static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) -{ -	int i; -	int pos = 0; - -	const char *fmt, *arg; - -	if (!probe_is_return(tp)) { -		fmt = "(%lx)"; -		arg = "REC->" FIELD_STRING_IP; -	} else { -		fmt = "(%lx <- %lx)"; -		arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; -	} - -	/* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? 
len - pos : 0) - -	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - -	for (i = 0; i < tp->nr_args; i++) { -		pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", -				tp->args[i].name, tp->args[i].type->fmt); -	} - -	pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - -	for (i = 0; i < tp->nr_args; i++) { -		if (strcmp(tp->args[i].type->name, "string") == 0) -			pos += snprintf(buf + pos, LEN_OR_ZERO, -					", __get_str(%s)", -					tp->args[i].name); -		else -			pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", -					tp->args[i].name); -	} - -#undef LEN_OR_ZERO - -	/* return the length of print_fmt */ -	return pos; -} - -static int set_print_fmt(struct trace_probe *tp) -{ -	int len; -	char *print_fmt; - -	/* First: called with 0 length to calculate the needed length */ -	len = __set_print_fmt(tp, NULL, 0); -	print_fmt = kmalloc(len + 1, GFP_KERNEL); -	if (!print_fmt) -		return -ENOMEM; - -	/* Second: actually write the @print_fmt */ -	__set_print_fmt(tp, print_fmt, len + 1); -	tp->call.print_fmt = print_fmt; - -	return 0; -} -  #ifdef CONFIG_PERF_EVENTS  /* Kprobe profile handler */ -static __kprobes void kprobe_perf_func(struct kprobe *kp, -					 struct pt_regs *regs) +static void +kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)  { -	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); -	struct ftrace_event_call *call = &tp->call; +	struct ftrace_event_call *call = &tk->tp.call;  	struct kprobe_trace_entry_head *entry;  	struct hlist_head *head;  	int size, __size, dsize;  	int rctx; -	dsize = __get_data_size(tp, regs); -	__size = sizeof(*entry) + tp->size + dsize; +	head = this_cpu_ptr(call->perf_events); +	if (hlist_empty(head)) +		return; + +	dsize = __get_data_size(&tk->tp, regs); +	__size = sizeof(*entry) + tk->tp.size + dsize;  	size = ALIGN(__size + sizeof(u32), sizeof(u64));  	size -= sizeof(u32); -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, -		     "profile buffer not large enough")) -		return;  	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);  	if (!entry)  		return; -	entry->ip = (unsigned long)kp->addr; +	entry->ip = (unsigned long)tk->rp.kp.addr;  	memset(&entry[1], 0, dsize); -	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - -	head = this_cpu_ptr(call->perf_events); -	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); +	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); +	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);  } +NOKPROBE_SYMBOL(kprobe_perf_func);  /* Kretprobe profile handler */ -static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, -					    struct pt_regs *regs) +static void +kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, +		    struct pt_regs *regs)  { -	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); -	struct ftrace_event_call *call = &tp->call; +	struct ftrace_event_call *call = &tk->tp.call;  	struct kretprobe_trace_entry_head *entry;  	struct hlist_head *head;  	int size, __size, dsize;  	int rctx; -	dsize = __get_data_size(tp, regs); -	__size = sizeof(*entry) + tp->size + dsize; +	head = this_cpu_ptr(call->perf_events); +	if (hlist_empty(head)) +		return; + +	dsize = __get_data_size(&tk->tp, regs); +	__size = sizeof(*entry) + tk->tp.size + dsize;  	size = ALIGN(__size + sizeof(u32), sizeof(u64));  	size -= sizeof(u32); -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, -		     "profile buffer not large enough")) -		return;  	entry = 
perf_trace_buf_prepare(size, call->event.type, regs, &rctx);  	if (!entry)  		return; -	entry->func = (unsigned long)tp->rp.kp.addr; +	entry->func = (unsigned long)tk->rp.kp.addr;  	entry->ret_ip = (unsigned long)ri->ret_addr; -	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - -	head = this_cpu_ptr(call->perf_events); -	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); -} - -static int probe_perf_enable(struct ftrace_event_call *call) -{ -	struct trace_probe *tp = (struct trace_probe *)call->data; - -	tp->flags |= TP_FLAG_PROFILE; - -	if (probe_is_return(tp)) -		return enable_kretprobe(&tp->rp); -	else -		return enable_kprobe(&tp->rp.kp); -} - -static void probe_perf_disable(struct ftrace_event_call *call) -{ -	struct trace_probe *tp = (struct trace_probe *)call->data; - -	tp->flags &= ~TP_FLAG_PROFILE; - -	if (!(tp->flags & TP_FLAG_TRACE)) { -		if (probe_is_return(tp)) -			disable_kretprobe(&tp->rp); -		else -			disable_kprobe(&tp->rp.kp); -	} +	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); +	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);  } +NOKPROBE_SYMBOL(kretprobe_perf_func);  #endif	/* CONFIG_PERF_EVENTS */ -static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + * + * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe + * lockless, but we can't race with this __init function. + */ +static int kprobe_register(struct ftrace_event_call *event, +			   enum trace_reg type, void *data)  { +	struct trace_kprobe *tk = (struct trace_kprobe *)event->data; +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return probe_event_enable(event); +		return enable_trace_kprobe(tk, file);  	case TRACE_REG_UNREGISTER: -		probe_event_disable(event); -		return 0; +		return disable_trace_kprobe(tk, file);  #ifdef CONFIG_PERF_EVENTS  	case TRACE_REG_PERF_REGISTER: -		return probe_perf_enable(event); +		return enable_trace_kprobe(tk, NULL);  	case TRACE_REG_PERF_UNREGISTER: -		probe_perf_disable(event); +		return disable_trace_kprobe(tk, NULL); +	case TRACE_REG_PERF_OPEN: +	case TRACE_REG_PERF_CLOSE: +	case TRACE_REG_PERF_ADD: +	case TRACE_REG_PERF_DEL:  		return 0;  #endif  	}  	return 0;  } -static __kprobes -int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)  { -	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); +	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + +	tk->nhit++; -	if (tp->flags & TP_FLAG_TRACE) -		kprobe_trace_func(kp, regs); +	if (tk->tp.flags & TP_FLAG_TRACE) +		kprobe_trace_func(tk, regs);  #ifdef CONFIG_PERF_EVENTS -	if (tp->flags & TP_FLAG_PROFILE) -		kprobe_perf_func(kp, regs); +	if (tk->tp.flags & TP_FLAG_PROFILE) +		kprobe_perf_func(tk, regs);  #endif  	return 0;	/* We don't tweek kernel, so just return 0 */  } +NOKPROBE_SYMBOL(kprobe_dispatcher); -static __kprobes -int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +static int +kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)  { -	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); +	struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); -	if (tp->flags & TP_FLAG_TRACE) -		kretprobe_trace_func(ri, regs); +	tk->nhit++; + +	if (tk->tp.flags & 
TP_FLAG_TRACE) +		kretprobe_trace_func(tk, ri, regs);  #ifdef CONFIG_PERF_EVENTS -	if (tp->flags & TP_FLAG_PROFILE) -		kretprobe_perf_func(ri, regs); +	if (tk->tp.flags & TP_FLAG_PROFILE) +		kretprobe_perf_func(tk, ri, regs);  #endif  	return 0;	/* We don't tweek kernel, so just return 0 */  } +NOKPROBE_SYMBOL(kretprobe_dispatcher);  static struct trace_event_functions kretprobe_funcs = {  	.trace		= print_kretprobe_event @@ -1698,21 +1275,21 @@ static struct trace_event_functions kprobe_funcs = {  	.trace		= print_kprobe_event  }; -static int register_probe_event(struct trace_probe *tp) +static int register_kprobe_event(struct trace_kprobe *tk)  { -	struct ftrace_event_call *call = &tp->call; +	struct ftrace_event_call *call = &tk->tp.call;  	int ret;  	/* Initialize ftrace_event_call */  	INIT_LIST_HEAD(&call->class->fields); -	if (probe_is_return(tp)) { +	if (trace_kprobe_is_return(tk)) {  		call->event.funcs = &kretprobe_funcs;  		call->class->define_fields = kretprobe_event_define_fields;  	} else {  		call->event.funcs = &kprobe_funcs;  		call->class->define_fields = kprobe_event_define_fields;  	} -	if (set_print_fmt(tp) < 0) +	if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)  		return -ENOMEM;  	ret = register_ftrace_event(&call->event);  	if (!ret) { @@ -1721,29 +1298,37 @@ static int register_probe_event(struct trace_probe *tp)  	}  	call->flags = 0;  	call->class->reg = kprobe_register; -	call->data = tp; +	call->data = tk;  	ret = trace_add_event_call(call);  	if (ret) { -		pr_info("Failed to register kprobe event: %s\n", call->name); +		pr_info("Failed to register kprobe event: %s\n", +			ftrace_event_name(call));  		kfree(call->print_fmt);  		unregister_ftrace_event(&call->event);  	}  	return ret;  } -static void unregister_probe_event(struct trace_probe *tp) +static int unregister_kprobe_event(struct trace_kprobe *tk)  { +	int ret; +  	/* tp->event is unregistered in trace_remove_event_call() */ -	trace_remove_event_call(&tp->call); -	kfree(tp->call.print_fmt); +	ret = trace_remove_event_call(&tk->tp.call); +	if (!ret) +		kfree(tk->tp.call.print_fmt); +	return ret;  } -/* Make a debugfs interface for controling probe points */ +/* Make a debugfs interface for controlling probe points */  static __init int init_kprobe_trace(void)  {  	struct dentry *d_tracer;  	struct dentry *entry; +	if (register_module_notifier(&trace_kprobe_module_nb)) +		return -EINVAL; +  	d_tracer = tracing_init_dentry();  	if (!d_tracer)  		return 0; @@ -1770,50 +1355,87 @@ fs_initcall(init_kprobe_trace);  #ifdef CONFIG_FTRACE_STARTUP_TEST -static int kprobe_trace_selftest_target(int a1, int a2, int a3, -					int a4, int a5, int a6) +/* + * The "__used" keeps gcc from removing the function symbol + * from the kallsyms table. + */ +static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, +					       int a4, int a5, int a6)  {  	return a1 + a2 + a3 + a4 + a5 + a6;  } +static struct ftrace_event_file * +find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) +{ +	struct ftrace_event_file *file; + +	list_for_each_entry(file, &tr->events, list) +		if (file->event_call == &tk->tp.call) +			return file; + +	return NULL; +} + +/* + * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this + * stage, we can do this lockless. 
+ */  static __init int kprobe_trace_self_tests_init(void)  {  	int ret, warn = 0;  	int (*target)(int, int, int, int, int, int); -	struct trace_probe *tp; +	struct trace_kprobe *tk; +	struct ftrace_event_file *file; + +	if (tracing_is_disabled()) +		return -ENODEV;  	target = kprobe_trace_selftest_target;  	pr_info("Testing kprobe tracing: "); -	ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " -				  "$stack $stack0 +0($stack)"); +	ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " +				  "$stack $stack0 +0($stack)", +				  create_trace_kprobe);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error on probing function entry.\n"); +		pr_warn("error on probing function entry.\n");  		warn++;  	} else {  		/* Enable trace point */ -		tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); -		if (WARN_ON_ONCE(tp == NULL)) { -			pr_warning("error on getting new probe.\n"); +		tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); +		if (WARN_ON_ONCE(tk == NULL)) { +			pr_warn("error on getting new probe.\n");  			warn++; -		} else -			probe_event_enable(&tp->call); +		} else { +			file = find_trace_probe_file(tk, top_trace_array()); +			if (WARN_ON_ONCE(file == NULL)) { +				pr_warn("error on getting probe file.\n"); +				warn++; +			} else +				enable_trace_kprobe(tk, file); +		}  	} -	ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " -				  "$retval"); +	ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " +				  "$retval", create_trace_kprobe);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error on probing function return.\n"); +		pr_warn("error on probing function return.\n");  		warn++;  	} else {  		/* Enable trace point */ -		tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); -		if (WARN_ON_ONCE(tp == NULL)) { -			pr_warning("error on getting new probe.\n"); +		tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); +		if (WARN_ON_ONCE(tk == NULL)) { +			pr_warn("error on getting 2nd new probe.\n");  			warn++; -		} else -			probe_event_enable(&tp->call); +		} else { +			file = find_trace_probe_file(tk, top_trace_array()); +			if (WARN_ON_ONCE(file == NULL)) { +				pr_warn("error on getting probe file.\n"); +				warn++; +			} else +				enable_trace_kprobe(tk, file); +		}  	}  	if (warn) @@ -1821,20 +1443,47 @@ static __init int kprobe_trace_self_tests_init(void)  	ret = target(1, 2, 3, 4, 5, 6); -	ret = command_trace_probe("-:testprobe"); +	/* Disable trace points before removing it */ +	tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); +	if (WARN_ON_ONCE(tk == NULL)) { +		pr_warn("error on getting test probe.\n"); +		warn++; +	} else { +		file = find_trace_probe_file(tk, top_trace_array()); +		if (WARN_ON_ONCE(file == NULL)) { +			pr_warn("error on getting probe file.\n"); +			warn++; +		} else +			disable_trace_kprobe(tk, file); +	} + +	tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); +	if (WARN_ON_ONCE(tk == NULL)) { +		pr_warn("error on getting 2nd test probe.\n"); +		warn++; +	} else { +		file = find_trace_probe_file(tk, top_trace_array()); +		if (WARN_ON_ONCE(file == NULL)) { +			pr_warn("error on getting probe file.\n"); +			warn++; +		} else +			disable_trace_kprobe(tk, file); +	} + +	ret = traceprobe_command("-:testprobe", create_trace_kprobe);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error on deleting a probe.\n"); +		pr_warn("error on deleting a probe.\n");  		warn++;  	} -	ret = command_trace_probe("-:testprobe2"); +	ret = traceprobe_command("-:testprobe2", 
create_trace_kprobe);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error on deleting a probe.\n"); +		pr_warn("error on deleting a probe.\n");  		warn++;  	}  end: -	cleanup_all_probes(); +	release_all_trace_kprobes();  	if (warn)  		pr_cont("NG: Some tests are failed. Please check them.\n");  	else diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 017fa376505..0abd9b86347 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -12,7 +12,7 @@  #include <linux/slab.h>  #include <linux/time.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include "trace.h"  #include "trace_output.h" @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)  	overrun_detected = false;  	prev_overruns = 0; -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  }  static int mmio_trace_init(struct trace_array *tr) @@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)  	if (drv)  		ret += trace_seq_printf(s, " %s\n", drv->name);  	else -		ret += trace_seq_printf(s, " \n"); +		ret += trace_seq_puts(s, " \n");  	return ret;  } @@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter)  	struct header_iter *hiter;  	struct trace_seq *s = &iter->seq; -	trace_seq_printf(s, "VERSION 20070824\n"); +	trace_seq_puts(s, "VERSION 20070824\n");  	hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);  	if (!hiter) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)  static unsigned long count_overruns(struct trace_iterator *iter)  {  	unsigned long cnt = atomic_xchg(&dropped_count, 0); -	unsigned long over = ring_buffer_overruns(iter->tr->buffer); +	unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);  	if (over > prev_overruns)  		cnt += over - prev_overruns; @@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)  			(rw->value >> 0) & 0xff, rw->pc, 0);  		break;  	default: -		ret = trace_seq_printf(s, "rw what?\n"); +		ret = trace_seq_puts(s, "rw what?\n");  		break;  	}  	if (ret) @@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)  			secs, usec_rem, m->map_id, 0UL, 0);  		break;  	default: -		ret = trace_seq_printf(s, "map what?\n"); +		ret = trace_seq_puts(s, "map what?\n");  		break;  	}  	if (ret) @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  				struct mmiotrace_rw *rw)  {  	struct ftrace_event_call *call = &event_mmiotrace_rw; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct trace_mmiotrace_rw *entry;  	int pc = preempt_count(); @@ -323,14 +323,14 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  	entry	= ring_buffer_event_data(event);  	entry->rw			= *rw; -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		trace_buffer_unlock_commit(buffer, event, 0, pc);  }  void mmio_trace_rw(struct mmiotrace_rw *rw)  {  	struct trace_array *tr = mmio_trace_array; -	struct trace_array_cpu *data = tr->data[smp_processor_id()]; +	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());  	__trace_mmiotrace_rw(tr, data, rw);  } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,  				struct mmiotrace_map *map)  {  	struct ftrace_event_call *call = &event_mmiotrace_map; -	struct ring_buffer *buffer = tr->buffer; +	
struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct trace_mmiotrace_map *entry;  	int pc = preempt_count(); @@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,  	entry	= ring_buffer_event_data(event);  	entry->map			= *map; -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		trace_buffer_unlock_commit(buffer, event, 0, pc);  } @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)  	struct trace_array_cpu *data;  	preempt_disable(); -	data = tr->data[smp_processor_id()]; +	data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());  	__trace_mmiotrace_map(tr, data, map);  	preempt_enable();  } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 394f94417e2..fcf0a9e4891 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr)   * If you don't implement it, then the flag setting will be   * automatically accepted.   */ -static int nop_set_flag(u32 old_flags, u32 bit, int set) +static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  {  	/*  	 * Note that you don't need to update nop_flags.val yourself. @@ -91,11 +91,11 @@ struct tracer nop_trace __read_mostly =  	.name		= "nop",  	.init		= nop_trace_init,  	.reset		= nop_trace_reset, -	.wait_pipe	= poll_wait_pipe,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest	= trace_selftest_startup_nop,  #endif  	.flags		= &nop_flags, -	.set_flag	= nop_set_flag +	.set_flag	= nop_set_flag, +	.allow_instances = true,  }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa220..f3dad80c20b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@  /* must be a power of 2 */  #define EVENT_HASHSIZE	128 -DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_sem);  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)  	return ret;  } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ +	struct trace_seq *s = &iter->seq; +	struct trace_entry *entry = iter->ent; +	struct bputs_entry *field; +	int ret; + +	trace_assign_type(field, entry); + +	ret = trace_seq_puts(s, field->str); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return TRACE_TYPE_HANDLED; +} +  enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq; @@ -62,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)  	trace_assign_type(field, entry); -	ret = trace_seq_printf(s, "%s", field->buf); +	ret = trace_seq_puts(s, field->buf);  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -110,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)  EXPORT_SYMBOL_GPL(trace_seq_printf);  /** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s:		trace sequence descriptor + * @maskp:	points to an array of unsigned longs that represent a bitmask + * @nmaskbits:	The number of bits that are valid in @maskp + * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * + * Writes a ASCII representation of a bitmask string into @s. 
+ */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, +		  int nmaskbits) +{ +	int len = (PAGE_SIZE - 1) - s->len; +	int ret; + +	if (s->full || !len) +		return 0; + +	ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); +	s->len += ret; + +	return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/**   * trace_seq_vprintf - sequence printing of trace information   * @s: trace sequence descriptor   * @fmt: printf format string @@ -264,7 +308,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)  	return ret;  } -int trace_seq_path(struct trace_seq *s, struct path *path) +int trace_seq_path(struct trace_seq *s, const struct path *path)  {  	unsigned char *p; @@ -300,7 +344,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,  	unsigned long mask;  	const char *str;  	const char *ret = p->buffer + p->len; -	int i; +	int i, first = 1;  	for (i = 0;  flag_array[i].name && flags; i++) { @@ -310,14 +354,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,  		str = flag_array[i].name;  		flags &= ~mask; -		if (p->len && delim) +		if (!first && delim)  			trace_seq_puts(p, delim); +		else +			first = 0;  		trace_seq_puts(p, str);  	}  	/* check for left over flags */  	if (flags) { -		if (p->len && delim) +		if (!first && delim)  			trace_seq_puts(p, delim);  		trace_seq_printf(p, "0x%lx", flags);  	} @@ -344,7 +390,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,  		break;  	} -	if (!p->len) +	if (ret == (const char *)(p->buffer + p->len))  		trace_seq_printf(p, "0x%lx", val);  	trace_seq_putc(p, 0); @@ -353,6 +399,46 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,  }  EXPORT_SYMBOL(ftrace_print_symbols_seq); +#if BITS_PER_LONG == 32 +const char * +ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, +			 const struct trace_print_flags_u64 *symbol_array) +{ +	int i; +	const char *ret = p->buffer + p->len; + +	for (i = 0;  symbol_array[i].name; i++) { + +		if (val != symbol_array[i].mask) +			continue; + +		trace_seq_puts(p, symbol_array[i].name); +		break; +	} + +	if (ret == (const char *)(p->buffer + p->len)) +		trace_seq_printf(p, "0x%llx", val); + +	trace_seq_putc(p, 0); + +	return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); +#endif + +const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, +			 unsigned int bitmask_size) +{ +	const char *ret = p->buffer + p->len; + +	trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); +	trace_seq_putc(p, 0); + +	return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); +  const char *  ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)  { @@ -368,6 +454,63 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)  }  EXPORT_SYMBOL(ftrace_print_hex_seq); +int ftrace_raw_output_prep(struct trace_iterator *iter, +			   struct trace_event *trace_event) +{ +	struct ftrace_event_call *event; +	struct trace_seq *s = &iter->seq; +	struct trace_seq *p = &iter->tmp_seq; +	struct trace_entry *entry; +	int ret; + +	event = container_of(trace_event, struct ftrace_event_call, event); +	entry = iter->ent; + +	if (entry->type != event->event.type) { +		WARN_ON_ONCE(1); +		return TRACE_TYPE_UNHANDLED; +	} + +	trace_seq_init(p); +	ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return 0; +} +EXPORT_SYMBOL(ftrace_raw_output_prep); + +static int ftrace_output_raw(struct trace_iterator *iter, char 
*name, +			     char *fmt, va_list ap) +{ +	struct trace_seq *s = &iter->seq; +	int ret; + +	ret = trace_seq_printf(s, "%s: ", name); +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	ret = trace_seq_vprintf(s, fmt, ap); + +	if (!ret) +		return TRACE_TYPE_PARTIAL_LINE; + +	return TRACE_TYPE_HANDLED; +} + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) +{ +	va_list ap; +	int ret; + +	va_start(ap, fmt); +	ret = ftrace_output_raw(iter, name, fmt, ap); +	va_end(ap); + +	return ret; +} +EXPORT_SYMBOL_GPL(ftrace_output_call); +  #ifdef CONFIG_KRETPROBES  static inline const char *kretprobed(const char *name)  { @@ -487,14 +630,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,  			if (ret)  				ret = trace_seq_puts(s, "??");  			if (ret) -				ret = trace_seq_puts(s, "\n"); +				ret = trace_seq_putc(s, '\n');  			continue;  		}  		if (!ret)  			break;  		if (ret)  			ret = seq_print_user_ip(s, mm, ip, sym_flags); -		ret = trace_seq_puts(s, "\n"); +		ret = trace_seq_putc(s, '\n');  	}  	if (mm) @@ -508,7 +651,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)  	int ret;  	if (!ip) -		return trace_seq_printf(s, "0"); +		return trace_seq_putc(s, '0');  	if (sym_flags & TRACE_ITER_SYM_OFFSET)  		ret = seq_print_sym_offset(s, "%s", ip); @@ -529,24 +672,49 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)   * @entry: The trace entry field from the ring buffer   *   * Prints the generic fields of irqs off, in hard or softirq, preempt - * count and lock depth. + * count.   */  int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)  { -	int hardirq, softirq; +	char hardsoft_irq; +	char need_resched; +	char irqs_off; +	int hardirq; +	int softirq;  	int ret;  	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;  	softirq = entry->flags & TRACE_FLAG_SOFTIRQ; +	irqs_off = +		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : +		(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : +		'.'; + +	switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | +				TRACE_FLAG_PREEMPT_RESCHED)) { +	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: +		need_resched = 'N'; +		break; +	case TRACE_FLAG_NEED_RESCHED: +		need_resched = 'n'; +		break; +	case TRACE_FLAG_PREEMPT_RESCHED: +		need_resched = 'p'; +		break; +	default: +		need_resched = '.'; +		break; +	} + +	hardsoft_irq = +		(hardirq && softirq) ? 'H' : +		hardirq ? 'h' : +		softirq ? 's' : +		'.'; +  	if (!trace_seq_printf(s, "%c%c%c", -			      (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : -				(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? -				  'X' : '.', -			      (entry->flags & TRACE_FLAG_NEED_RESCHED) ? -				'N' : '.', -			      (hardirq && softirq) ? 'H' : -				hardirq ? 'h' : softirq ? 
's' : '.')) +			      irqs_off, need_resched, hardsoft_irq))  		return 0;  	if (entry->preempt_count) @@ -554,13 +722,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)  	else  		ret = trace_seq_putc(s, '.'); -	if (!ret) -		return 0; - -	if (entry->lock_depth < 0) -		return trace_seq_putc(s, '.'); - -	return trace_seq_printf(s, "%d", entry->lock_depth); +	return ret;  }  static int @@ -577,68 +739,113 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)  	return trace_print_lat_fmt(s, entry);  } -static unsigned long preempt_mark_thresh = 100; +static unsigned long preempt_mark_thresh_us = 100;  static int -lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, -		    unsigned long rel_usecs) +lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)  { -	return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, -				rel_usecs > preempt_mark_thresh ? '!' : -				  rel_usecs > 1 ? '+' : ' '); +	unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; +	unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; +	unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; +	unsigned long long rel_ts = next_ts - iter->ts; +	struct trace_seq *s = &iter->seq; + +	if (in_ns) { +		abs_ts = ns2usecs(abs_ts); +		rel_ts = ns2usecs(rel_ts); +	} + +	if (verbose && in_ns) { +		unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); +		unsigned long abs_msec = (unsigned long)abs_ts; +		unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); +		unsigned long rel_msec = (unsigned long)rel_ts; + +		return trace_seq_printf( +				s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", +				ns2usecs(iter->ts), +				abs_msec, abs_usec, +				rel_msec, rel_usec); +	} else if (verbose && !in_ns) { +		return trace_seq_printf( +				s, "[%016llx] %lld (+%lld): ", +				iter->ts, abs_ts, rel_ts); +	} else if (!verbose && in_ns) { +		return trace_seq_printf( +				s, " %4lldus%c: ", +				abs_ts, +				rel_ts > preempt_mark_thresh_us ? '!' : +				  rel_ts > 1 ? 
'+' : ' '); +	} else { /* !verbose && !in_ns */ +		return trace_seq_printf(s, " %4lld: ", abs_ts); +	}  }  int trace_print_context(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq;  	struct trace_entry *entry = iter->ent; -	unsigned long long t = ns2usecs(iter->ts); -	unsigned long usec_rem = do_div(t, USEC_PER_SEC); -	unsigned long secs = (unsigned long)t; +	unsigned long long t; +	unsigned long secs, usec_rem;  	char comm[TASK_COMM_LEN]; +	int ret;  	trace_find_cmdline(entry->pid, comm); -	return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", -				comm, entry->pid, iter->cpu, secs, usec_rem); +	ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", +			       comm, entry->pid, iter->cpu); +	if (!ret) +		return 0; + +	if (trace_flags & TRACE_ITER_IRQ_INFO) { +		ret = trace_print_lat_fmt(s, entry); +		if (!ret) +			return 0; +	} + +	if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { +		t = ns2usecs(iter->ts); +		usec_rem = do_div(t, USEC_PER_SEC); +		secs = (unsigned long)t; +		return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); +	} else +		return trace_seq_printf(s, " %12llu: ", iter->ts);  }  int trace_print_lat_context(struct trace_iterator *iter)  {  	u64 next_ts;  	int ret; +	/* trace_find_next_entry will reset ent_size */ +	int ent_size = iter->ent_size;  	struct trace_seq *s = &iter->seq;  	struct trace_entry *entry = iter->ent,  			   *next_entry = trace_find_next_entry(iter, NULL,  							       &next_ts);  	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); -	unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); -	unsigned long rel_usecs; + +	/* Restore the original ent_size */ +	iter->ent_size = ent_size;  	if (!next_entry)  		next_ts = iter->ts; -	rel_usecs = ns2usecs(next_ts - iter->ts);  	if (verbose) {  		char comm[TASK_COMM_LEN];  		trace_find_cmdline(entry->pid, comm); -		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" -				       " %ld.%03ldms (+%ld.%03ldms): ", comm, -				       entry->pid, iter->cpu, entry->flags, -				       entry->preempt_count, iter->idx, -				       ns2usecs(iter->ts), -				       abs_usecs / USEC_PER_MSEC, -				       abs_usecs % USEC_PER_MSEC, -				       rel_usecs / USEC_PER_MSEC, -				       rel_usecs % USEC_PER_MSEC); +		ret = trace_seq_printf( +				s, "%16s %5d %3d %d %08x %08lx ", +				comm, entry->pid, iter->cpu, entry->flags, +				entry->preempt_count, iter->idx);  	} else {  		ret = lat_print_generic(s, entry, iter->cpu); -		if (ret) -			ret = lat_print_timestamp(s, abs_usecs, rel_usecs);  	} +	if (ret) +		ret = lat_print_timestamp(iter, next_ts); +  	return ret;  } @@ -661,12 +868,11 @@ static int task_state_char(unsigned long state)  struct trace_event *ftrace_find_event(int type)  {  	struct trace_event *event; -	struct hlist_node *n;  	unsigned key;  	key = type & (EVENT_HASHSIZE - 1); -	hlist_for_each_entry(event, n, &event_hash[key], node) { +	hlist_for_each_entry(event, &event_hash[key], node) {  		if (event->type == type)  			return event;  	} @@ -706,12 +912,12 @@ static int trace_search_list(struct list_head **list)  void trace_event_read_lock(void)  { -	down_read(&trace_event_mutex); +	down_read(&trace_event_sem);  }  void trace_event_read_unlock(void)  { -	up_read(&trace_event_mutex); +	up_read(&trace_event_sem);  }  /** @@ -734,7 +940,7 @@ int register_ftrace_event(struct trace_event *event)  	unsigned key;  	int ret = 0; -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	if (WARN_ON(!event))  		goto out; @@ -789,14 +995,14 @@ int 
register_ftrace_event(struct trace_event *event)  	ret = event->type;   out: -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	return ret;  }  EXPORT_SYMBOL_GPL(register_ftrace_event);  /* - * Used by module code with the trace_event_mutex held for write. + * Used by module code with the trace_event_sem held for write.   */  int __unregister_ftrace_event(struct trace_event *event)  { @@ -811,9 +1017,9 @@ int __unregister_ftrace_event(struct trace_event *event)   */  int unregister_ftrace_event(struct trace_event *event)  { -	down_write(&trace_event_mutex); +	down_write(&trace_event_sem);  	__unregister_ftrace_event(event); -	up_write(&trace_event_mutex); +	up_write(&trace_event_sem);  	return 0;  } @@ -826,6 +1032,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);  enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,  				  struct trace_event *event)  { +	if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) +		return TRACE_TYPE_PARTIAL_LINE; +  	return TRACE_TYPE_HANDLED;  } @@ -842,14 +1051,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,  		goto partial;  	if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { -		if (!trace_seq_printf(s, " <-")) +		if (!trace_seq_puts(s, " <-"))  			goto partial;  		if (!seq_print_ip_sym(s,  				      field->parent_ip,  				      flags))  			goto partial;  	} -	if (!trace_seq_printf(s, "\n")) +	if (!trace_seq_putc(s, '\n'))  		goto partial;  	return TRACE_TYPE_HANDLED; @@ -1073,21 +1282,22 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,  {  	struct stack_entry *field;  	struct trace_seq *s = &iter->seq; -	int i; +	unsigned long *p; +	unsigned long *end;  	trace_assign_type(field, iter->ent); +	end = (unsigned long *)((long)iter->ent + iter->ent_size);  	if (!trace_seq_puts(s, "<stack trace>\n"))  		goto partial; -	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { -		if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) -			break; + +	for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {  		if (!trace_seq_puts(s, " => "))  			goto partial; -		if (!seq_print_ip_sym(s, field->caller[i], flags)) +		if (!seq_print_ip_sym(s, *p, flags))  			goto partial; -		if (!trace_seq_puts(s, "\n")) +		if (!trace_seq_putc(s, '\n'))  			goto partial;  	} @@ -1136,6 +1346,64 @@ static struct trace_event trace_user_stack_event = {  	.funcs		= &trace_user_stack_funcs,  }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, +		   struct trace_event *event) +{ +	struct trace_entry *entry = iter->ent; +	struct trace_seq *s = &iter->seq; +	struct bputs_entry *field; + +	trace_assign_type(field, entry); + +	if (!seq_print_ip_sym(s, field->ip, flags)) +		goto partial; + +	if (!trace_seq_puts(s, ": ")) +		goto partial; + +	if (!trace_seq_puts(s, field->str)) +		goto partial; + +	return TRACE_TYPE_HANDLED; + + partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, +		struct trace_event *event) +{ +	struct bputs_entry *field; +	struct trace_seq *s = &iter->seq; + +	trace_assign_type(field, iter->ent); + +	if (!trace_seq_printf(s, ": %lx : ", field->ip)) +		goto partial; + +	if (!trace_seq_puts(s, field->str)) +		goto partial; + +	return TRACE_TYPE_HANDLED; + + partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { +	.trace		= trace_bputs_print, +	.raw		= trace_bputs_raw, +}; + +static 
struct trace_event trace_bputs_event = { +	.type		= TRACE_BPUTS, +	.funcs		= &trace_bputs_funcs, +}; +  /* TRACE_BPRINT */  static enum print_line_t  trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1248,6 +1516,7 @@ static struct trace_event *events[] __initdata = {  	&trace_wake_event,  	&trace_stack_event,  	&trace_user_stack_event, +	&trace_bputs_event,  	&trace_bprint_event,  	&trace_print_event,  	NULL @@ -1271,4 +1540,4 @@ __init static int init_events(void)  	return 0;  } -device_initcall(init_events); +early_initcall(init_events); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492..127a9d8c835 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -5,6 +5,8 @@  #include "trace.h"  extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t  trace_print_bprintk_msg_only(struct trace_iterator *iter);  extern enum print_line_t  trace_print_printk_msg_only(struct trace_iterator *iter); @@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);  /* used by module unregistering */  extern int __unregister_ftrace_event(struct trace_event *event); -extern struct rw_semaphore trace_event_mutex; +extern struct rw_semaphore trace_event_sem;  #define MAX_MEMHEX_BYTES	8  #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf..2900817ba65 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);  struct trace_bprintk_fmt {  	struct list_head list; -	char fmt[0]; +	const char *fmt;  };  static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) @@ -49,6 +49,11 @@ static  void hold_module_trace_bprintk_format(const char **start, const char **end)  {  	const char **iter; +	char *fmt; + +	/* allocate the trace_printk per cpu buffers */ +	if (start != end) +		trace_printk_init_buffers();  	mutex_lock(&btrace_mutex);  	for (iter = start; iter < end; iter++) { @@ -58,14 +63,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)  			continue;  		} -		tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) -				+ strlen(*iter) + 1, GFP_KERNEL); +		fmt = NULL; +		tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);  		if (tb_fmt) { -			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); -			strcpy(tb_fmt->fmt, *iter); -			*iter = tb_fmt->fmt; -		} else -			*iter = NULL; +			fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); +			if (fmt) { +				list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); +				strcpy(fmt, *iter); +				tb_fmt->fmt = fmt; +			} else +				kfree(tb_fmt); +		} +		*iter = fmt; +  	}  	mutex_unlock(&btrace_mutex);  } @@ -84,6 +94,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,  	return 0;  } +/* + * The debugfs/tracing/printk_formats file maps the addresses with + * the ASCII formats that are used in the bprintk events in the + * buffer. For userspace tools to be able to decode the events from + * the buffer, they need to be able to map the address with the format. + * + * The addresses of the bprintk formats are in their own section + * __trace_printk_fmt. But for modules we copy them into a link list. + * The code to print the formats and their addresses passes around the + * address of the fmt string. 
If the fmt address passed into the seq + * functions is within the kernel core __trace_printk_fmt section, then + * it simply uses the next pointer in the list. + * + * When the fmt pointer is outside the kernel core __trace_printk_fmt + * section, then we need to read the link list pointers. The trick is + * we pass the address of the string to the seq function just like + * we do for the kernel core formats. To get back the structure that + * holds the format, we simply use containerof() and then go to the + * next format in the list. + */ +static const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ +	struct trace_bprintk_fmt *mod_fmt; + +	if (list_empty(&trace_bprintk_fmt_list)) +		return NULL; + +	/* +	 * v will point to the address of the fmt record from t_next +	 * v will be NULL from t_start. +	 * If this is the first pointer or called from start +	 * then we need to walk the list. +	 */ +	if (!v || start_index == *pos) { +		struct trace_bprintk_fmt *p; + +		/* search the module list */ +		list_for_each_entry(p, &trace_bprintk_fmt_list, list) { +			if (start_index == *pos) +				return &p->fmt; +			start_index++; +		} +		/* pos > index */ +		return NULL; +	} + +	/* +	 * v points to the address of the fmt field in the mod list +	 * structure that holds the module print format. +	 */ +	mod_fmt = container_of(v, typeof(*mod_fmt), fmt); +	if (mod_fmt->list.next == &trace_bprintk_fmt_list) +		return NULL; + +	mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); + +	return &mod_fmt->fmt; +} + +static void format_mod_start(void) +{ +	mutex_lock(&btrace_mutex); +} + +static void format_mod_stop(void) +{ +	mutex_unlock(&btrace_mutex); +} +  #else /* !CONFIG_MODULES */  __init static int  module_trace_bprintk_format_notify(struct notifier_block *self, @@ -91,6 +171,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,  {  	return 0;  } +static inline const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ +	return NULL; +} +static inline void format_mod_start(void) { } +static inline void format_mod_stop(void) { }  #endif /* CONFIG_MODULES */ @@ -153,20 +240,49 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)  }  EXPORT_SYMBOL_GPL(__ftrace_vprintk); +static const char **find_next(void *v, loff_t *pos) +{ +	const char **fmt = v; +	int start_index; +	int last_index; + +	start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; + +	if (*pos < start_index) +		return __start___trace_bprintk_fmt + *pos; + +	/* +	 * The __tracepoint_str section is treated the same as the +	 * __trace_printk_fmt section. The difference is that the +	 * __trace_printk_fmt section should only be used by trace_printk() +	 * in a debugging environment, as if anything exists in that section +	 * the trace_prink() helper buffers are allocated, which would just +	 * waste space in a production environment. +	 * +	 * The __tracepoint_str sections on the other hand are used by +	 * tracepoints which need to map pointers to their strings to +	 * the ASCII text for userspace. 
+	 */ +	last_index = start_index; +	start_index = __stop___tracepoint_str - __start___tracepoint_str; + +	if (*pos < last_index + start_index) +		return __start___tracepoint_str + (*pos - last_index); + +	return find_next_mod_format(start_index, v, fmt, pos); +} +  static void *  t_start(struct seq_file *m, loff_t *pos)  { -	const char **fmt = __start___trace_bprintk_fmt + *pos; - -	if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) -		return NULL; -	return fmt; +	format_mod_start(); +	return find_next(NULL, pos);  }  static void *t_next(struct seq_file *m, void * v, loff_t *pos)  {  	(*pos)++; -	return t_start(m, pos); +	return find_next(v, pos);  }  static int t_show(struct seq_file *m, void *v) @@ -205,6 +321,7 @@ static int t_show(struct seq_file *m, void *v)  static void t_stop(struct seq_file *m, void *p)  { +	format_mod_stop();  }  static const struct seq_operations show_format_seq_ops = { diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c new file mode 100644 index 00000000000..d4b9fc22cd2 --- /dev/null +++ b/kernel/trace/trace_probe.c @@ -0,0 +1,726 @@ +/* + * Common code for probe-based Dynamic events. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + * + * This code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author:     Srikar Dronamraju + */ + +#include "trace_probe.h" + +const char *reserved_field_names[] = { +	"common_type", +	"common_flags", +	"common_preempt_count", +	"common_pid", +	"common_tgid", +	FIELD_STRING_IP, +	FIELD_STRING_RETIP, +	FIELD_STRING_FUNC, +}; + +/* Printing  in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt)				\ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name,	\ +				void *data, void *ent)			\ +{									\ +	return trace_seq_printf(s, " %s=" fmt, name, *(type *)data);	\ +}									\ +const char PRINT_TYPE_FMT_NAME(type)[] = fmt;				\ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8,  "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") + +/* Print type function for string type */ +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, +				 void *data, void *ent) +{ +	int len = *(u32 *)data >> 16; + +	if (!len) +		return trace_seq_printf(s, " %s=(fault)", name); +	else +		return trace_seq_printf(s, " %s=\"%s\"", name, +					(const char *)get_loc_data(data, ent)); +} +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); + +const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + +#define CHECK_FETCH_FUNCS(method, fn)			
\ +	(((FETCH_FUNC_NAME(method, u8) == fn) ||	\ +	  (FETCH_FUNC_NAME(method, u16) == fn) ||	\ +	  (FETCH_FUNC_NAME(method, u32) == fn) ||	\ +	  (FETCH_FUNC_NAME(method, u64) == fn) ||	\ +	  (FETCH_FUNC_NAME(method, string) == fn) ||	\ +	  (FETCH_FUNC_NAME(method, string_size) == fn)) \ +	 && (fn != NULL)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type)						\ +void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest)	\ +{									\ +	*(type *)dest = (type)regs_get_register(regs,			\ +				(unsigned int)((unsigned long)offset));	\ +}									\ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); +DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string	NULL +#define fetch_reg_string_size	NULL + +#define DEFINE_FETCH_retval(type)					\ +void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,		\ +				   void *dummy, void *dest)		\ +{									\ +	*(type *)dest = (type)regs_return_value(regs);			\ +}									\ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); +DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string		NULL +#define fetch_retval_string_size	NULL + +/* Dereference memory access function */ +struct deref_fetch_param { +	struct fetch_param	orig; +	long			offset; +	fetch_func_t		fetch; +	fetch_func_t		fetch_size; +}; + +#define DEFINE_FETCH_deref(type)					\ +void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,			\ +				  void *data, void *dest)		\ +{									\ +	struct deref_fetch_param *dprm = data;				\ +	unsigned long addr;						\ +	call_fetch(&dprm->orig, regs, &addr);				\ +	if (addr) {							\ +		addr += dprm->offset;					\ +		dprm->fetch(regs, (void *)addr, dest);			\ +	} else								\ +		*(type *)dest = 0;					\ +}									\ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); +DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) + +void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, +					 void *data, void *dest) +{ +	struct deref_fetch_param *dprm = data; +	unsigned long addr; + +	call_fetch(&dprm->orig, regs, &addr); +	if (addr && dprm->fetch_size) { +		addr += dprm->offset; +		dprm->fetch_size(regs, (void *)addr, dest); +	} else +		*(string_size *)dest = 0; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); + +static void update_deref_fetch_param(struct deref_fetch_param *data) +{ +	if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) +		update_deref_fetch_param(data->orig.data); +	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) +		update_symbol_cache(data->orig.data); +} +NOKPROBE_SYMBOL(update_deref_fetch_param); + +static void free_deref_fetch_param(struct deref_fetch_param *data) +{ +	if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) +		free_deref_fetch_param(data->orig.data); +	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) +		free_symbol_cache(data->orig.data); +	kfree(data); +} +NOKPROBE_SYMBOL(free_deref_fetch_param); + +/* Bitfield fetch function */ +struct bitfield_fetch_param { +	struct fetch_param	orig; +	unsigned char		hi_shift; +	unsigned char		low_shift; +}; + +#define DEFINE_FETCH_bitfield(type)					\ +void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,		\ +				     void *data, void *dest)		\ +{									\ +	struct bitfield_fetch_param *bprm = data;			\ +	type buf = 0;							\ +	call_fetch(&bprm->orig, regs, &buf);				\ +	if (buf) {							\ +		buf <<= bprm->hi_shift;					\ +		buf >>= bprm->low_shift;				\ +	}								\ +	*(type *)dest = buf;						\ +}									\ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); 
+DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string		NULL +#define fetch_bitfield_string_size	NULL + +static void +update_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ +	/* +	 * Don't check the bitfield itself, because this must be the +	 * last fetch function. +	 */ +	if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) +		update_deref_fetch_param(data->orig.data); +	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) +		update_symbol_cache(data->orig.data); +} + +static void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ +	/* +	 * Don't check the bitfield itself, because this must be the +	 * last fetch function. +	 */ +	if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) +		free_deref_fetch_param(data->orig.data); +	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) +		free_symbol_cache(data->orig.data); + +	kfree(data); +} + +static const struct fetch_type *find_fetch_type(const char *type, +						const struct fetch_type *ftbl) +{ +	int i; + +	if (!type) +		type = DEFAULT_FETCH_TYPE_STR; + +	/* Special case: bitfield */ +	if (*type == 'b') { +		unsigned long bs; + +		type = strchr(type, '/'); +		if (!type) +			goto fail; + +		type++; +		if (kstrtoul(type, 0, &bs)) +			goto fail; + +		switch (bs) { +		case 8: +			return find_fetch_type("u8", ftbl); +		case 16: +			return find_fetch_type("u16", ftbl); +		case 32: +			return find_fetch_type("u32", ftbl); +		case 64: +			return find_fetch_type("u64", ftbl); +		default: +			goto fail; +		} +	} + +	for (i = 0; ftbl[i].name; i++) { +		if (strcmp(type, ftbl[i].name) == 0) +			return &ftbl[i]; +	} + +fail: +	return NULL; +} + +/* Special function : only accept unsigned long */ +static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) +{ +	*(unsigned long *)dest = kernel_stack_pointer(regs); +} +NOKPROBE_SYMBOL(fetch_kernel_stack_address); + +static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) +{ +	*(unsigned long *)dest = user_stack_pointer(regs); +} +NOKPROBE_SYMBOL(fetch_user_stack_address); + +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, +					    fetch_func_t orig_fn, +					    const struct fetch_type *ftbl) +{ +	int i; + +	if (type != &ftbl[FETCH_TYPE_STRING]) +		return NULL;	/* Only string type needs size function */ + +	for (i = 0; i < FETCH_MTD_END; i++) +		if (type->fetch[i] == orig_fn) +			return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; + +	WARN_ON(1);	/* This should not happen */ + +	return NULL; +} + +/* Split symbol and offset. 
*/ +int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) +{ +	char *tmp; +	int ret; + +	if (!offset) +		return -EINVAL; + +	tmp = strchr(symbol, '+'); +	if (tmp) { +		/* skip sign because kstrtoul doesn't accept '+' */ +		ret = kstrtoul(tmp + 1, 0, offset); +		if (ret) +			return ret; + +		*tmp = '\0'; +	} else +		*offset = 0; + +	return 0; +} + +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, const struct fetch_type *t, +			    struct fetch_param *f, bool is_return, +			    bool is_kprobe) +{ +	int ret = 0; +	unsigned long param; + +	if (strcmp(arg, "retval") == 0) { +		if (is_return) +			f->fn = t->fetch[FETCH_MTD_retval]; +		else +			ret = -EINVAL; +	} else if (strncmp(arg, "stack", 5) == 0) { +		if (arg[5] == '\0') { +			if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) +				return -EINVAL; + +			if (is_kprobe) +				f->fn = fetch_kernel_stack_address; +			else +				f->fn = fetch_user_stack_address; +		} else if (isdigit(arg[5])) { +			ret = kstrtoul(arg + 5, 10, &param); +			if (ret || (is_kprobe && param > PARAM_MAX_STACK)) +				ret = -EINVAL; +			else { +				f->fn = t->fetch[FETCH_MTD_stack]; +				f->data = (void *)param; +			} +		} else +			ret = -EINVAL; +	} else +		ret = -EINVAL; + +	return ret; +} + +/* Recursive argument parser */ +static int parse_probe_arg(char *arg, const struct fetch_type *t, +		     struct fetch_param *f, bool is_return, bool is_kprobe) +{ +	const struct fetch_type *ftbl; +	unsigned long param; +	long offset; +	char *tmp; +	int ret = 0; + +	ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; +	BUG_ON(ftbl == NULL); + +	switch (arg[0]) { +	case '$': +		ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); +		break; + +	case '%':	/* named register */ +		ret = regs_query_register_offset(arg + 1); +		if (ret >= 0) { +			f->fn = t->fetch[FETCH_MTD_reg]; +			f->data = (void *)(unsigned long)ret; +			ret = 0; +		} +		break; + +	case '@':	/* memory, file-offset or symbol */ +		if (isdigit(arg[1])) { +			ret = kstrtoul(arg + 1, 0, &param); +			if (ret) +				break; + +			f->fn = t->fetch[FETCH_MTD_memory]; +			f->data = (void *)param; +		} else if (arg[1] == '+') { +			/* kprobes don't support file offsets */ +			if (is_kprobe) +				return -EINVAL; + +			ret = kstrtol(arg + 2, 0, &offset); +			if (ret) +				break; + +			f->fn = t->fetch[FETCH_MTD_file_offset]; +			f->data = (void *)offset; +		} else { +			/* uprobes don't support symbols */ +			if (!is_kprobe) +				return -EINVAL; + +			ret = traceprobe_split_symbol_offset(arg + 1, &offset); +			if (ret) +				break; + +			f->data = alloc_symbol_cache(arg + 1, offset); +			if (f->data) +				f->fn = t->fetch[FETCH_MTD_symbol]; +		} +		break; + +	case '+':	/* deref memory */ +		arg++;	/* Skip '+', because kstrtol() rejects it. 
*/ +	case '-': +		tmp = strchr(arg, '('); +		if (!tmp) +			break; + +		*tmp = '\0'; +		ret = kstrtol(arg, 0, &offset); + +		if (ret) +			break; + +		arg = tmp + 1; +		tmp = strrchr(arg, ')'); + +		if (tmp) { +			struct deref_fetch_param	*dprm; +			const struct fetch_type		*t2; + +			t2 = find_fetch_type(NULL, ftbl); +			*tmp = '\0'; +			dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); + +			if (!dprm) +				return -ENOMEM; + +			dprm->offset = offset; +			dprm->fetch = t->fetch[FETCH_MTD_memory]; +			dprm->fetch_size = get_fetch_size_function(t, +							dprm->fetch, ftbl); +			ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, +							is_kprobe); +			if (ret) +				kfree(dprm); +			else { +				f->fn = t->fetch[FETCH_MTD_deref]; +				f->data = (void *)dprm; +			} +		} +		break; +	} +	if (!ret && !f->fn) {	/* Parsed, but do not find fetch method */ +		pr_info("%s type has no corresponding fetch method.\n", t->name); +		ret = -EINVAL; +	} + +	return ret; +} + +#define BYTES_TO_BITS(nb)	((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, +				      const struct fetch_type *t, +				      struct fetch_param *f) +{ +	struct bitfield_fetch_param *bprm; +	unsigned long bw, bo; +	char *tail; + +	if (*bf != 'b') +		return 0; + +	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); +	if (!bprm) +		return -ENOMEM; + +	bprm->orig = *f; +	f->fn = t->fetch[FETCH_MTD_bitfield]; +	f->data = (void *)bprm; +	bw = simple_strtoul(bf + 1, &tail, 0);	/* Use simple one */ + +	if (bw == 0 || *tail != '@') +		return -EINVAL; + +	bf = tail + 1; +	bo = simple_strtoul(bf, &tail, 0); + +	if (tail == bf || *tail != '/') +		return -EINVAL; + +	bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); +	bprm->low_shift = bprm->hi_shift + bo; + +	return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; +} + +/* String length checking wrapper */ +int traceprobe_parse_probe_arg(char *arg, ssize_t *size, +		struct probe_arg *parg, bool is_return, bool is_kprobe) +{ +	const struct fetch_type *ftbl; +	const char *t; +	int ret; + +	ftbl = is_kprobe ? 
kprobes_fetch_type_table : uprobes_fetch_type_table; +	BUG_ON(ftbl == NULL); + +	if (strlen(arg) > MAX_ARGSTR_LEN) { +		pr_info("Argument is too long.: %s\n",  arg); +		return -ENOSPC; +	} +	parg->comm = kstrdup(arg, GFP_KERNEL); +	if (!parg->comm) { +		pr_info("Failed to allocate memory for command '%s'.\n", arg); +		return -ENOMEM; +	} +	t = strchr(parg->comm, ':'); +	if (t) { +		arg[t - parg->comm] = '\0'; +		t++; +	} +	parg->type = find_fetch_type(t, ftbl); +	if (!parg->type) { +		pr_info("Unsupported type: %s\n", t); +		return -EINVAL; +	} +	parg->offset = *size; +	*size += parg->type->size; +	ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); + +	if (ret >= 0 && t != NULL) +		ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); + +	if (ret >= 0) { +		parg->fetch_size.fn = get_fetch_size_function(parg->type, +							      parg->fetch.fn, +							      ftbl); +		parg->fetch_size.data = parg->fetch.data; +	} + +	return ret; +} + +/* Return 1 if name is reserved or already used by another argument */ +int traceprobe_conflict_field_name(const char *name, +			       struct probe_arg *args, int narg) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) +		if (strcmp(reserved_field_names[i], name) == 0) +			return 1; + +	for (i = 0; i < narg; i++) +		if (strcmp(args[i].name, name) == 0) +			return 1; + +	return 0; +} + +void traceprobe_update_arg(struct probe_arg *arg) +{ +	if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) +		update_bitfield_fetch_param(arg->fetch.data); +	else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) +		update_deref_fetch_param(arg->fetch.data); +	else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) +		update_symbol_cache(arg->fetch.data); +} + +void traceprobe_free_probe_arg(struct probe_arg *arg) +{ +	if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) +		free_bitfield_fetch_param(arg->fetch.data); +	else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) +		free_deref_fetch_param(arg->fetch.data); +	else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) +		free_symbol_cache(arg->fetch.data); + +	kfree(arg->name); +	kfree(arg->comm); +} + +int traceprobe_command(const char *buf, int (*createfn)(int, char **)) +{ +	char **argv; +	int argc, ret; + +	argc = 0; +	ret = 0; +	argv = argv_split(GFP_KERNEL, buf, &argc); +	if (!argv) +		return -ENOMEM; + +	if (argc) +		ret = createfn(argc, argv); + +	argv_free(argv); + +	return ret; +} + +#define WRITE_BUFSIZE  4096 + +ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, +				size_t count, loff_t *ppos, +				int (*createfn)(int, char **)) +{ +	char *kbuf, *tmp; +	int ret = 0; +	size_t done = 0; +	size_t size; + +	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); +	if (!kbuf) +		return -ENOMEM; + +	while (done < count) { +		size = count - done; + +		if (size >= WRITE_BUFSIZE) +			size = WRITE_BUFSIZE - 1; + +		if (copy_from_user(kbuf, buffer + done, size)) { +			ret = -EFAULT; +			goto out; +		} +		kbuf[size] = '\0'; +		tmp = strchr(kbuf, '\n'); + +		if (tmp) { +			*tmp = '\0'; +			size = tmp - kbuf + 1; +		} else if (done + size < count) { +			pr_warning("Line length is too long: " +				   "Should be less than %d.", WRITE_BUFSIZE); +			ret = -EINVAL; +			goto out; +		} +		done += size; +		/* Remove comments */ +		tmp = strchr(kbuf, '#'); + +		if (tmp) +			*tmp = '\0'; + +		ret = traceprobe_command(kbuf, createfn); +		if (ret) +			goto out; +	} +	ret = done; + +out: +	kfree(kbuf); + +	return ret; +} + +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, +			
   bool is_return) +{ +	int i; +	int pos = 0; + +	const char *fmt, *arg; + +	if (!is_return) { +		fmt = "(%lx)"; +		arg = "REC->" FIELD_STRING_IP; +	} else { +		fmt = "(%lx <- %lx)"; +		arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; +	} + +	/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + +	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + +	for (i = 0; i < tp->nr_args; i++) { +		pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", +				tp->args[i].name, tp->args[i].type->fmt); +	} + +	pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + +	for (i = 0; i < tp->nr_args; i++) { +		if (strcmp(tp->args[i].type->name, "string") == 0) +			pos += snprintf(buf + pos, LEN_OR_ZERO, +					", __get_str(%s)", +					tp->args[i].name); +		else +			pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", +					tp->args[i].name); +	} + +#undef LEN_OR_ZERO + +	/* return the length of print_fmt */ +	return pos; +} + +int set_print_fmt(struct trace_probe *tp, bool is_return) +{ +	int len; +	char *print_fmt; + +	/* First: called with 0 length to calculate the needed length */ +	len = __set_print_fmt(tp, NULL, 0, is_return); +	print_fmt = kmalloc(len + 1, GFP_KERNEL); +	if (!print_fmt) +		return -ENOMEM; + +	/* Second: actually write the @print_fmt */ +	__set_print_fmt(tp, print_fmt, len + 1, is_return); +	tp->call.print_fmt = print_fmt; + +	return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h new file mode 100644 index 00000000000..4f815fbce16 --- /dev/null +++ b/kernel/trace/trace_probe.h @@ -0,0 +1,400 @@ +/* + * Common header file for probe-based Dynamic events. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + * + * This code was copied from kernel/trace/trace_kprobe.h written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author:     Srikar Dronamraju + */ + +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> +#include <linux/kprobes.h> +#include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> +#include <asm/bitsperlong.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS		128 +#define MAX_ARGSTR_LEN		63 +#define MAX_EVENT_NAME_LEN	64 +#define MAX_STRING_SIZE		PATH_MAX + +/* Reserved field names */ +#define FIELD_STRING_IP		"__probe_ip" +#define FIELD_STRING_RETIP	"__probe_ret_ip" +#define FIELD_STRING_FUNC	"__probe_func" + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed)			\ +	do {								\ +		ret = trace_define_field(event_call, #type, name,	\ +					 offsetof(typeof(field), item),	\ +					 sizeof(field.item), is_signed, \ +					 FILTER_OTHER);			\ +		if (ret)						\ +			return ret;					\ +	} while (0) + + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE		1 +#define TP_FLAG_PROFILE		2 +#define TP_FLAG_REGISTERED	4 + + +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs)	\ +	(((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl)		((u32)(dl) >> 16) +#define get_rloc_offs(dl)		((u32)(dl) & 0xffff) + +/* + * Convert data_rloc to data_loc: + *  data_rloc stores the offset from data_rloc itself, but data_loc + *  stores the offset from event entry. 
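+ * + *  e.g. make_data_rloc(5, 12) == 0x0005000c: get_rloc_len() recovers the + *  length 5 from the high 16 bits and get_rloc_offs() recovers the + *  offset 12 from the low 16 bits.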
+ */ +#define convert_rloc_to_loc(dl, offs)	((u32)(dl) + (offs)) + +static nokprobe_inline void *get_rloc_data(u32 *dl) +{ +	return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) +{ +	return (u8 *)ent + get_rloc_offs(*dl); +} + +/* Data fetch function type */ +typedef	void (*fetch_func_t)(struct pt_regs *, void *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); + +/* Fetch types */ +enum { +	FETCH_MTD_reg = 0, +	FETCH_MTD_stack, +	FETCH_MTD_retval, +	FETCH_MTD_memory, +	FETCH_MTD_symbol, +	FETCH_MTD_deref, +	FETCH_MTD_bitfield, +	FETCH_MTD_file_offset, +	FETCH_MTD_END, +}; + +/* Fetch type information table */ +struct fetch_type { +	const char		*name;		/* Name of type */ +	size_t			size;		/* Byte size of type */ +	int			is_signed;	/* Signed flag */ +	print_type_func_t	print;		/* Print functions */ +	const char		*fmt;		/* Fromat string */ +	const char		*fmttype;	/* Name in format file */ +	/* Fetch functions */ +	fetch_func_t		fetch[FETCH_MTD_END]; +}; + +struct fetch_param { +	fetch_func_t		fn; +	void 			*data; +}; + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +#define PRINT_TYPE_FUNC_NAME(type)	print_type_##type +#define PRINT_TYPE_FMT_NAME(type)	print_type_format_##type + +/* Printing  in basic type function template */ +#define DECLARE_BASIC_PRINT_TYPE_FUNC(type)				\ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name,	\ +				void *data, void *ent);			\ +extern const char PRINT_TYPE_FMT_NAME(type)[] + +DECLARE_BASIC_PRINT_TYPE_FUNC(u8); +DECLARE_BASIC_PRINT_TYPE_FUNC(u16); +DECLARE_BASIC_PRINT_TYPE_FUNC(u32); +DECLARE_BASIC_PRINT_TYPE_FUNC(u64); +DECLARE_BASIC_PRINT_TYPE_FUNC(s8); +DECLARE_BASIC_PRINT_TYPE_FUNC(s16); +DECLARE_BASIC_PRINT_TYPE_FUNC(s32); +DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(string); + +#define FETCH_FUNC_NAME(method, type)	fetch_##method##_##type + +/* Declare macro for basic types */ +#define DECLARE_FETCH_FUNC(method, type)				\ +extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, 	\ +					  void *data, void *dest) + +#define DECLARE_BASIC_FETCH_FUNCS(method) 	\ +DECLARE_FETCH_FUNC(method, u8);			\ +DECLARE_FETCH_FUNC(method, u16);		\ +DECLARE_FETCH_FUNC(method, u32);		\ +DECLARE_FETCH_FUNC(method, u64) + +DECLARE_BASIC_FETCH_FUNCS(reg); +#define fetch_reg_string			NULL +#define fetch_reg_string_size			NULL + +DECLARE_BASIC_FETCH_FUNCS(retval); +#define fetch_retval_string			NULL +#define fetch_retval_string_size		NULL + +DECLARE_BASIC_FETCH_FUNCS(symbol); +DECLARE_FETCH_FUNC(symbol, string); +DECLARE_FETCH_FUNC(symbol, string_size); + +DECLARE_BASIC_FETCH_FUNCS(deref); +DECLARE_FETCH_FUNC(deref, string); +DECLARE_FETCH_FUNC(deref, string_size); + +DECLARE_BASIC_FETCH_FUNCS(bitfield); +#define fetch_bitfield_string			NULL +#define fetch_bitfield_string_size		NULL + +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. 
+ */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8)		\ +DEFINE_FETCH_##method(u16)		\ +DEFINE_FETCH_##method(u32)		\ +DEFINE_FETCH_##method(u64) + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(method, type)	\ +	[FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype)	\ +	{.name = _name,				\ +	 .size = _size,					\ +	 .is_signed = sign,				\ +	 .print = PRINT_TYPE_FUNC_NAME(ptype),		\ +	 .fmt = PRINT_TYPE_FMT_NAME(ptype),		\ +	 .fmttype = _fmttype,				\ +	 .fetch = {					\ +ASSIGN_FETCH_FUNC(reg, ftype),				\ +ASSIGN_FETCH_FUNC(stack, ftype),			\ +ASSIGN_FETCH_FUNC(retval, ftype),			\ +ASSIGN_FETCH_FUNC(memory, ftype),			\ +ASSIGN_FETCH_FUNC(symbol, ftype),			\ +ASSIGN_FETCH_FUNC(deref, ftype),			\ +ASSIGN_FETCH_FUNC(bitfield, ftype),			\ +ASSIGN_FETCH_FUNC(file_offset, ftype),			\ +	  }						\ +	} + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign)			\ +	__ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define ASSIGN_FETCH_TYPE_END {} + +#define FETCH_TYPE_STRING	0 +#define FETCH_TYPE_STRSIZE	1 + +/* + * Fetch type information table. + * It's declared as a weak symbol due to conditional compilation. + */ +extern __weak const struct fetch_type kprobes_fetch_type_table[]; +extern __weak const struct fetch_type uprobes_fetch_type_table[]; + +#ifdef CONFIG_KPROBE_EVENT +struct symbol_cache; +unsigned long update_symbol_cache(struct symbol_cache *sc); +void free_symbol_cache(struct symbol_cache *sc); +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +#else +/* uprobes do not support symbol fetch methods */ +#define fetch_symbol_u8			NULL +#define fetch_symbol_u16		NULL +#define fetch_symbol_u32		NULL +#define fetch_symbol_u64		NULL +#define fetch_symbol_string		NULL +#define fetch_symbol_string_size	NULL + +struct symbol_cache { +}; +static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) +{ +	return 0; +} + +static inline void __used free_symbol_cache(struct symbol_cache *sc) +{ +} + +static inline struct symbol_cache * __used +alloc_symbol_cache(const char *sym, long offset) +{ +	return NULL; +} +#endif /* CONFIG_KPROBE_EVENT */ + +struct probe_arg { +	struct fetch_param	fetch; +	struct fetch_param	fetch_size; +	unsigned int		offset;	/* Offset from argument entry */ +	const char		*name;	/* Name of this argument */ +	const char		*comm;	/* Command of this argument */ +	const struct fetch_type	*type;	/* Type of this argument */ +}; + +struct trace_probe { +	unsigned int			flags;	/* For TP_FLAG_* */ +	struct ftrace_event_class	class; +	struct ftrace_event_call	call; +	struct list_head 		files; +	ssize_t				size;	/* trace entry size */ +	unsigned int			nr_args; +	struct probe_arg		args[]; +}; + +struct event_file_link { +	struct ftrace_event_file	*file; +	struct list_head		list; +}; + +static inline bool trace_probe_is_enabled(struct trace_probe *tp) +{ +	return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static inline bool trace_probe_is_registered(struct trace_probe *tp) +{ +	return !!(tp->flags & TP_FLAG_REGISTERED); +} + +static nokprobe_inline void call_fetch(struct fetch_param *fprm, +				 struct pt_regs *regs, void *dest) +{ +	return fprm->fn(regs, fprm->data, dest); +} + +/* 
Check the name is good for event/group/fields */ +static inline int is_good_name(const char *name) +{ +	if (!isalpha(*name) && *name != '_') +		return 0; +	while (*++name != '\0') { +		if (!isalpha(*name) && !isdigit(*name) && *name != '_') +			return 0; +	} +	return 1; +} + +static inline struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +{ +	struct event_file_link *link; + +	list_for_each_entry(link, &tp->files, list) +		if (link->file == file) +			return link; + +	return NULL; +} + +extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, +		   struct probe_arg *parg, bool is_return, bool is_kprobe); + +extern int traceprobe_conflict_field_name(const char *name, +			       struct probe_arg *args, int narg); + +extern void traceprobe_update_arg(struct probe_arg *arg); +extern void traceprobe_free_probe_arg(struct probe_arg *arg); + +extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); + +extern ssize_t traceprobe_probes_write(struct file *file, +		const char __user *buffer, size_t count, loff_t *ppos, +		int (*createfn)(int, char**)); + +extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); + +/* Sum up total data length for dynamic arraies (strings) */ +static nokprobe_inline int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ +	int i, ret = 0; +	u32 len; + +	for (i = 0; i < tp->nr_args; i++) +		if (unlikely(tp->args[i].fetch_size.fn)) { +			call_fetch(&tp->args[i].fetch_size, regs, &len); +			ret += len; +		} + +	return ret; +} + +/* Store the value of each argument */ +static nokprobe_inline void +store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, +		 u8 *data, int maxlen) +{ +	int i; +	u32 end = tp->size; +	u32 *dl;	/* Data (relative) location */ + +	for (i = 0; i < tp->nr_args; i++) { +		if (unlikely(tp->args[i].fetch_size.fn)) { +			/* +			 * First, we set the relative location and +			 * maximum data length to *dl +			 */ +			dl = (u32 *)(data + tp->args[i].offset); +			*dl = make_data_rloc(maxlen, end - tp->args[i].offset); +			/* Then try to fetch string or dynamic array data */ +			call_fetch(&tp->args[i].fetch, regs, dl); +			/* Reduce maximum length */ +			end += get_rloc_len(*dl); +			maxlen -= get_rloc_len(*dl); +			/* Trick here, convert data_rloc to data_loc */ +			*dl = convert_rloc_to_loc(*dl, +				 ent_size + tp->args[i].offset); +		} else +			/* Just fetching data normally */ +			call_fetch(&tp->args[i].fetch, regs, +				   data + tp->args[i].offset); +	} +} + +extern int set_print_fmt(struct trace_probe *tp, bool is_return); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c4..3f34dc9b40f 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,  			   unsigned long flags, int pc)  {  	struct ftrace_event_call *call = &event_context_switch; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; @@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr,  	entry->next_state		= next->state;  	entry->next_cpu	= task_cpu(next); -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		trace_buffer_unlock_commit(buffer, event, flags, pc);  } @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, 
struct task_struct *prev, struct task_struct *n  	pc = preempt_count();  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = ctx_trace->data[cpu]; +	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);  	if (likely(!atomic_read(&data->disabled)))  		tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	struct ftrace_event_call *call = &event_wakeup;  	struct ring_buffer_event *event;  	struct ctx_switch_entry *entry; -	struct ring_buffer *buffer = tr->buffer; +	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,  					  sizeof(*entry), flags, pc); @@ -101,10 +101,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	entry->next_state		= wakee->state;  	entry->next_cpu			= task_cpu(wakee); -	if (!filter_check_discard(call, entry, buffer, event)) -		ring_buffer_unlock_commit(buffer, event); -	ftrace_trace_stack(tr->buffer, flags, 6, pc); -	ftrace_trace_userstack(tr->buffer, flags, pc); +	if (!call_filter_check_discard(call, entry, buffer, event)) +		trace_buffer_unlock_commit(buffer, event, flags, pc);  }  static void @@ -125,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)  	pc = preempt_count();  	local_irq_save(flags);  	cpu = raw_smp_processor_id(); -	data = ctx_trace->data[cpu]; +	data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);  	if (likely(!atomic_read(&data->disabled)))  		tracing_sched_wakeup_trace(ctx_trace, wakee, current, @@ -247,51 +245,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)  	ctx_trace = tr;  } -static void stop_sched_trace(struct trace_array *tr) -{ -	tracing_stop_sched_switch_record(); -} - -static int sched_switch_trace_init(struct trace_array *tr) -{ -	ctx_trace = tr; -	tracing_reset_online_cpus(tr); -	tracing_start_sched_switch_record(); -	return 0; -} - -static void sched_switch_trace_reset(struct trace_array *tr) -{ -	if (sched_ref) -		stop_sched_trace(tr); -} - -static void sched_switch_trace_start(struct trace_array *tr) -{ -	sched_stopped = 0; -} - -static void sched_switch_trace_stop(struct trace_array *tr) -{ -	sched_stopped = 1; -} - -static struct tracer sched_switch_trace __read_mostly = -{ -	.name		= "sched_switch", -	.init		= sched_switch_trace_init, -	.reset		= sched_switch_trace_reset, -	.start		= sched_switch_trace_start, -	.stop		= sched_switch_trace_stop, -	.wait_pipe	= poll_wait_pipe, -#ifdef CONFIG_FTRACE_SELFTEST -	.selftest    = trace_selftest_startup_sched_switch, -#endif -}; - -__init static int init_sched_switch_trace(void) -{ -	return register_tracer(&sched_switch_trace); -} -device_initcall(init_sched_switch_trace); - diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7319559ed59..19bd8928ce9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -7,7 +7,7 @@   * Based on code from the latency_tracer, that is:   *   *  Copyright (C) 2004-2006 Ingo Molnar - *  Copyright (C) 2004 William Lee Irwin III + *  Copyright (C) 2004 Nadia Yvette Chambers   */  #include <linux/module.h>  #include <linux/fs.h> @@ -15,8 +15,9 @@  #include <linux/kallsyms.h>  #include <linux/uaccess.h>  #include <linux/ftrace.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h>  #include <trace/events/sched.h> -  #include "trace.h"  static struct trace_array	*wakeup_trace; @@ -27,6 +28,8 @@ static int			wakeup_cpu;  static int			wakeup_current_cpu;  static unsigned			
wakeup_prio = -1;  static int			wakeup_rt; +static int			wakeup_dl; +static int			tracing_dl = 0;  static arch_spinlock_t wakeup_lock =  	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -36,7 +39,8 @@ static void __wakeup_reset(struct trace_array *tr);  static int wakeup_graph_entry(struct ftrace_graph_ent *trace);  static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; +static bool function_enabled;  #define TRACE_DISPLAY_GRAPH     1 @@ -89,7 +93,7 @@ func_prolog_preempt_disable(struct trace_array *tr,  	if (cpu != wakeup_current_cpu)  		goto out_enable; -	*data = tr->data[cpu]; +	*data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	disabled = atomic_inc_return(&(*data)->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -108,7 +112,8 @@ out_enable:   * wakeup uses its own tracer function to keep the overhead down:   */  static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, +		   struct ftrace_ops *op, struct pt_regs *pt_regs)  {  	struct trace_array *tr = wakeup_trace;  	struct trace_array_cpu *data; @@ -125,22 +130,64 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)  	atomic_dec(&data->disabled);  	preempt_enable_notrace();  } - -static struct ftrace_ops trace_ops __read_mostly = -{ -	.func = wakeup_tracer_call, -};  #endif /* CONFIG_FUNCTION_TRACER */ -static int start_func_tracer(int graph) +static int register_wakeup_function(struct trace_array *tr, int graph, int set)  {  	int ret; -	if (!graph) -		ret = register_ftrace_function(&trace_ops); -	else +	/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ +	if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) +		return 0; + +	if (graph)  		ret = register_ftrace_graph(&wakeup_graph_return,  					    &wakeup_graph_entry); +	else +		ret = register_ftrace_function(tr->ops); + +	if (!ret) +		function_enabled = true; + +	return ret; +} + +static void unregister_wakeup_function(struct trace_array *tr, int graph) +{ +	if (!function_enabled) +		return; + +	if (graph) +		unregister_ftrace_graph(); +	else +		unregister_ftrace_function(tr->ops); + +	function_enabled = false; +} + +static void wakeup_function_set(struct trace_array *tr, int set) +{ +	if (set) +		register_wakeup_function(tr, is_graph(), 1); +	else +		unregister_wakeup_function(tr, is_graph()); +} + +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +{ +	struct tracer *tracer = tr->current_trace; + +	if (mask & TRACE_ITER_FUNCTION) +		wakeup_function_set(tr, set); + +	return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(struct trace_array *tr, int graph) +{ +	int ret; + +	ret = register_wakeup_function(tr, graph, 0);  	if (!ret && tracing_is_enabled())  		tracer_enabled = 1; @@ -150,18 +197,16 @@ static int start_func_tracer(int graph)  	return ret;  } -static void stop_func_tracer(int graph) +static void stop_func_tracer(struct trace_array *tr, int graph)  {  	tracer_enabled = 0; -	if (!graph) -		unregister_ftrace_function(&trace_ops); -	else -		unregister_ftrace_graph(); +	unregister_wakeup_function(tr, graph);  }  #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  {  	if (!(bit & TRACE_DISPLAY_GRAPH)) @@ -170,12 +215,12 @@ static int wakeup_set_flag(u32 old_flags, u32 bit, int set)  	if (!(is_graph() ^ set))  
		return 0; -	stop_func_tracer(!set); +	stop_func_tracer(tr, !set);  	wakeup_reset(wakeup_trace); -	tracing_max_latency = 0; +	tr->max_latency = 0; -	return start_func_tracer(set); +	return start_func_tracer(tr, set);  }  static int wakeup_graph_entry(struct ftrace_graph_ent *trace) @@ -226,7 +271,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)  		graph_trace_close(iter);  } -#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ +			    TRACE_GRAPH_PRINT_ABS_TIME | \ +			    TRACE_GRAPH_PRINT_DURATION)  static enum print_line_t wakeup_print_line(struct trace_iterator *iter)  { @@ -261,7 +308,8 @@ __trace_function(struct trace_array *tr,  #else  #define __trace_function trace_function -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)  {  	return -EINVAL;  } @@ -277,21 +325,32 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)  }  static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } -static void wakeup_print_header(struct seq_file *s) { }  static void wakeup_trace_open(struct trace_iterator *iter) { }  static void wakeup_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void wakeup_print_header(struct seq_file *s) +{ +	trace_default_header(s); +} +#else +static void wakeup_print_header(struct seq_file *s) +{ +	trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */  /*   * Should this new latency be reported/recorded?   */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta)  {  	if (tracing_thresh) {  		if (delta < tracing_thresh)  			return 0;  	} else { -		if (delta <= tracing_max_latency) +		if (delta <= tr->max_latency)  			return 0;  	}  	return 1; @@ -338,7 +397,7 @@ probe_wakeup_sched_switch(void *ignore,  	/* disable local data, not wakeup_cpu data */  	cpu = raw_smp_processor_id(); -	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); +	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  	if (likely(disabled != 1))  		goto out; @@ -350,7 +409,7 @@ probe_wakeup_sched_switch(void *ignore,  		goto out_unlock;  	/* The task we are waiting for is waking up */ -	data = wakeup_trace->data[wakeup_cpu]; +	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);  	__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);  	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -359,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,  	T1 = ftrace_now(cpu);  	delta = T1-T0; -	if (!report_latency(delta)) +	if (!report_latency(wakeup_trace, delta))  		goto out_unlock;  	if (likely(!is_tracing_stopped())) { -		tracing_max_latency = delta; +		wakeup_trace->max_latency = delta;  		update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);  	} @@ -372,13 +431,14 @@ out_unlock:  	arch_spin_unlock(&wakeup_lock);  	local_irq_restore(flags);  out: -	atomic_dec(&wakeup_trace->data[cpu]->disabled); +	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  }  static void __wakeup_reset(struct trace_array *tr)  {  	wakeup_cpu = -1;  	wakeup_prio = -1; +	tracing_dl = 0;  	if (wakeup_task)  		put_task_struct(wakeup_task); @@ -390,7 +450,7 @@ static void wakeup_reset(struct trace_array *tr)  {  	unsigned long flags; -	tracing_reset_online_cpus(tr); +	
tracing_reset_online_cpus(&tr->trace_buffer);  	local_irq_save(flags);  	arch_spin_lock(&wakeup_lock); @@ -414,13 +474,21 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  	tracing_record_cmdline(p);  	tracing_record_cmdline(current); -	if ((wakeup_rt && !rt_task(p)) || -			p->prio >= wakeup_prio || -			p->prio >= current->prio) +	/* +	 * Semantic is like this: +	 *  - wakeup tracer handles all tasks in the system, independently +	 *    from their scheduling class; +	 *  - wakeup_rt tracer handles tasks belonging to sched_dl and +	 *    sched_rt class; +	 *  - wakeup_dl handles tasks belonging to sched_dl class only. +	 */ +	if (tracing_dl || (wakeup_dl && !dl_task(p)) || +	    (wakeup_rt && !dl_task(p) && !rt_task(p)) || +	    (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))  		return;  	pc = preempt_count(); -	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); +	disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  	if (unlikely(disabled != 1))  		goto out; @@ -428,7 +496,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  	arch_spin_lock(&wakeup_lock);  	/* check for races. */ -	if (!tracer_enabled || p->prio >= wakeup_prio) +	if (!tracer_enabled || tracing_dl || +	    (!dl_task(p) && p->prio >= wakeup_prio))  		goto out_locked;  	/* reset the trace */ @@ -438,12 +507,21 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  	wakeup_current_cpu = wakeup_cpu;  	wakeup_prio = p->prio; +	/* +	 * Once you start tracing a -deadline task, don't bother tracing +	 * another task until the first one wakes up. +	 */ +	if (dl_task(p)) +		tracing_dl = 1; +	else +		tracing_dl = 0; +  	wakeup_task = p;  	get_task_struct(wakeup_task);  	local_save_flags(flags); -	data = wakeup_trace->data[wakeup_cpu]; +	data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);  	data->preempt_timestamp = ftrace_now(cpu);  	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -457,7 +535,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)  out_locked:  	arch_spin_unlock(&wakeup_lock);  out: -	atomic_dec(&wakeup_trace->data[cpu]->disabled); +	atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);  }  static void start_wakeup_tracer(struct trace_array *tr) @@ -503,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)  	 */  	smp_wmb(); -	if (start_func_tracer(is_graph())) +	if (start_func_tracer(tr, is_graph()))  		printk(KERN_ERR "failed to start wakeup tracer\n");  	return; @@ -516,44 +594,75 @@ fail_deprobe:  static void stop_wakeup_tracer(struct trace_array *tr)  {  	tracer_enabled = 0; -	stop_func_tracer(is_graph()); +	stop_func_tracer(tr, is_graph());  	unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);  	unregister_trace_sched_wakeup_new(probe_wakeup, NULL);  	unregister_trace_sched_wakeup(probe_wakeup, NULL);  	unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);  } +static bool wakeup_busy; +  static int __wakeup_tracer_init(struct trace_array *tr)  { -	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; -	trace_flags |= TRACE_ITER_LATENCY_FMT; +	save_flags = trace_flags; + +	/* non overwrite screws up the latency tracers */ +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); -	tracing_max_latency = 0; +	tr->max_latency = 0;  	wakeup_trace = tr; +	ftrace_init_array_ops(tr, wakeup_tracer_call);  	start_wakeup_tracer(tr); + +	wakeup_busy = true;  	
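+ +	/* +	 * wakeup_busy stays set until wakeup_tracer_reset() clears it, so +	 * the wakeup*_tracer_init() callbacks return -EBUSY while another +	 * wakeup tracer instance is active. +	 */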
return 0;  }  static int wakeup_tracer_init(struct trace_array *tr)  { +	if (wakeup_busy) +		return -EBUSY; + +	wakeup_dl = 0;  	wakeup_rt = 0;  	return __wakeup_tracer_init(tr);  }  static int wakeup_rt_tracer_init(struct trace_array *tr)  { +	if (wakeup_busy) +		return -EBUSY; + +	wakeup_dl = 0;  	wakeup_rt = 1;  	return __wakeup_tracer_init(tr);  } +static int wakeup_dl_tracer_init(struct trace_array *tr) +{ +	if (wakeup_busy) +		return -EBUSY; + +	wakeup_dl = 1; +	wakeup_rt = 0; +	return __wakeup_tracer_init(tr); +} +  static void wakeup_tracer_reset(struct trace_array *tr)  { +	int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; +	int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; +  	stop_wakeup_tracer(tr);  	/* make sure we put back any tasks we are tracing */  	wakeup_reset(tr); -	if (!save_lat_flag) -		trace_flags &= ~TRACE_ITER_LATENCY_FMT; +	set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); +	set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); +	ftrace_reset_array_ops(tr); +	wakeup_busy = false;  }  static void wakeup_tracer_start(struct trace_array *tr) @@ -574,17 +683,19 @@ static struct tracer wakeup_tracer __read_mostly =  	.reset		= wakeup_tracer_reset,  	.start		= wakeup_tracer_start,  	.stop		= wakeup_tracer_stop, -	.print_max	= 1, +	.print_max	= true,  	.print_header	= wakeup_print_header,  	.print_line	= wakeup_print_line,  	.flags		= &tracer_flags,  	.set_flag	= wakeup_set_flag, +	.flag_changed	= wakeup_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_wakeup,  #endif  	.open		= wakeup_trace_open,  	.close		= wakeup_trace_close, -	.use_max_tr	= 1, +	.allow_instances = true, +	.use_max_tr	= true,  };  static struct tracer wakeup_rt_tracer __read_mostly = @@ -594,18 +705,40 @@ static struct tracer wakeup_rt_tracer __read_mostly =  	.reset		= wakeup_tracer_reset,  	.start		= wakeup_tracer_start,  	.stop		= wakeup_tracer_stop, -	.wait_pipe	= poll_wait_pipe, -	.print_max	= 1, +	.print_max	= true, +	.print_header	= wakeup_print_header, +	.print_line	= wakeup_print_line, +	.flags		= &tracer_flags, +	.set_flag	= wakeup_set_flag, +	.flag_changed	= wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST +	.selftest    = trace_selftest_startup_wakeup, +#endif +	.open		= wakeup_trace_open, +	.close		= wakeup_trace_close, +	.allow_instances = true, +	.use_max_tr	= true, +}; + +static struct tracer wakeup_dl_tracer __read_mostly = +{ +	.name		= "wakeup_dl", +	.init		= wakeup_dl_tracer_init, +	.reset		= wakeup_tracer_reset, +	.start		= wakeup_tracer_start, +	.stop		= wakeup_tracer_stop, +	.print_max	= true,  	.print_header	= wakeup_print_header,  	.print_line	= wakeup_print_line,  	.flags		= &tracer_flags,  	.set_flag	= wakeup_set_flag, +	.flag_changed	= wakeup_flag_changed,  #ifdef CONFIG_FTRACE_SELFTEST  	.selftest    = trace_selftest_startup_wakeup,  #endif  	.open		= wakeup_trace_open,  	.close		= wakeup_trace_close, -	.use_max_tr	= 1, +	.use_max_tr	= true,  };  __init static int init_wakeup_tracer(void) @@ -620,6 +753,10 @@ __init static int init_wakeup_tracer(void)  	if (ret)  		return ret; +	ret = register_tracer(&wakeup_dl_tracer); +	if (ret) +		return ret; +  	return 0;  } -device_initcall(init_wakeup_tracer); +core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b320..5ef60499dc8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)  	return 0;  } 
-static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)  {  	struct ring_buffer_event *event;  	struct trace_entry *entry;  	unsigned int loops = 0; -	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { +	while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {  		entry = ring_buffer_event_data(event);  		/* @@ -58,16 +58,16 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)   * Test the trace buffer to see if all the elements   * are still sane.   */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)  {  	unsigned long flags, cnt = 0;  	int cpu, ret = 0;  	/* Don't allow flipping of max traces now */  	local_irq_save(flags); -	arch_spin_lock(&ftrace_max_lock); +	arch_spin_lock(&buf->tr->max_lock); -	cnt = ring_buffer_entries(tr->buffer); +	cnt = ring_buffer_entries(buf->buffer);  	/*  	 * The trace_test_buffer_cpu runs a while loop to consume all data. @@ -78,12 +78,12 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)  	 */  	tracing_off();  	for_each_possible_cpu(cpu) { -		ret = trace_test_buffer_cpu(tr, cpu); +		ret = trace_test_buffer_cpu(buf, cpu);  		if (ret)  			break;  	}  	tracing_on(); -	arch_spin_unlock(&ftrace_max_lock); +	arch_spin_unlock(&buf->tr->max_lock);  	local_irq_restore(flags);  	if (count) @@ -101,13 +101,230 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)  #ifdef CONFIG_DYNAMIC_FTRACE +static int trace_selftest_test_probe1_cnt; +static void trace_selftest_test_probe1_func(unsigned long ip, +					    unsigned long pip, +					    struct ftrace_ops *op, +					    struct pt_regs *pt_regs) +{ +	trace_selftest_test_probe1_cnt++; +} + +static int trace_selftest_test_probe2_cnt; +static void trace_selftest_test_probe2_func(unsigned long ip, +					    unsigned long pip, +					    struct ftrace_ops *op, +					    struct pt_regs *pt_regs) +{ +	trace_selftest_test_probe2_cnt++; +} + +static int trace_selftest_test_probe3_cnt; +static void trace_selftest_test_probe3_func(unsigned long ip, +					    unsigned long pip, +					    struct ftrace_ops *op, +					    struct pt_regs *pt_regs) +{ +	trace_selftest_test_probe3_cnt++; +} + +static int trace_selftest_test_global_cnt; +static void trace_selftest_test_global_func(unsigned long ip, +					    unsigned long pip, +					    struct ftrace_ops *op, +					    struct pt_regs *pt_regs) +{ +	trace_selftest_test_global_cnt++; +} + +static int trace_selftest_test_dyn_cnt; +static void trace_selftest_test_dyn_func(unsigned long ip, +					 unsigned long pip, +					 struct ftrace_ops *op, +					 struct pt_regs *pt_regs) +{ +	trace_selftest_test_dyn_cnt++; +} + +static struct ftrace_ops test_probe1 = { +	.func			= trace_selftest_test_probe1_func, +	.flags			= FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static struct ftrace_ops test_probe2 = { +	.func			= trace_selftest_test_probe2_func, +	.flags			= FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static struct ftrace_ops test_probe3 = { +	.func			= trace_selftest_test_probe3_func, +	.flags			= FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static void print_counts(void) +{ +	printk("(%d %d %d %d %d) ", +	       trace_selftest_test_probe1_cnt, +	       trace_selftest_test_probe2_cnt, +	       trace_selftest_test_probe3_cnt, +	       trace_selftest_test_global_cnt, +	       trace_selftest_test_dyn_cnt); +} + +static void 
reset_counts(void) +{ +	trace_selftest_test_probe1_cnt = 0; +	trace_selftest_test_probe2_cnt = 0; +	trace_selftest_test_probe3_cnt = 0; +	trace_selftest_test_global_cnt = 0; +	trace_selftest_test_dyn_cnt = 0; +} + +static int trace_selftest_ops(struct trace_array *tr, int cnt) +{ +	int save_ftrace_enabled = ftrace_enabled; +	struct ftrace_ops *dyn_ops; +	char *func1_name; +	char *func2_name; +	int len1; +	int len2; +	int ret = -1; + +	printk(KERN_CONT "PASSED\n"); +	pr_info("Testing dynamic ftrace ops #%d: ", cnt); + +	ftrace_enabled = 1; +	reset_counts(); + +	/* Handle PPC64 '.' name */ +	func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); +	func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); +	len1 = strlen(func1_name); +	len2 = strlen(func2_name); + +	/* +	 * Probe 1 will trace function 1. +	 * Probe 2 will trace function 2. +	 * Probe 3 will trace functions 1 and 2. +	 */ +	ftrace_set_filter(&test_probe1, func1_name, len1, 1); +	ftrace_set_filter(&test_probe2, func2_name, len2, 1); +	ftrace_set_filter(&test_probe3, func1_name, len1, 1); +	ftrace_set_filter(&test_probe3, func2_name, len2, 0); + +	register_ftrace_function(&test_probe1); +	register_ftrace_function(&test_probe2); +	register_ftrace_function(&test_probe3); +	/* First time we are running with main function */ +	if (cnt > 1) { +		ftrace_init_array_ops(tr, trace_selftest_test_global_func); +		register_ftrace_function(tr->ops); +	} + +	DYN_FTRACE_TEST_NAME(); + +	print_counts(); + +	if (trace_selftest_test_probe1_cnt != 1) +		goto out; +	if (trace_selftest_test_probe2_cnt != 0) +		goto out; +	if (trace_selftest_test_probe3_cnt != 1) +		goto out; +	if (cnt > 1) { +		if (trace_selftest_test_global_cnt == 0) +			goto out; +	} + +	DYN_FTRACE_TEST_NAME2(); + +	print_counts(); + +	if (trace_selftest_test_probe1_cnt != 1) +		goto out; +	if (trace_selftest_test_probe2_cnt != 1) +		goto out; +	if (trace_selftest_test_probe3_cnt != 2) +		goto out; + +	/* Add a dynamic probe */ +	dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); +	if (!dyn_ops) { +		printk("MEMORY ERROR "); +		goto out; +	} + +	dyn_ops->func = trace_selftest_test_dyn_func; + +	register_ftrace_function(dyn_ops); + +	trace_selftest_test_global_cnt = 0; + +	DYN_FTRACE_TEST_NAME(); + +	print_counts(); + +	if (trace_selftest_test_probe1_cnt != 2) +		goto out_free; +	if (trace_selftest_test_probe2_cnt != 1) +		goto out_free; +	if (trace_selftest_test_probe3_cnt != 3) +		goto out_free; +	if (cnt > 1) { +		if (trace_selftest_test_global_cnt == 0) +			goto out; +	} +	if (trace_selftest_test_dyn_cnt == 0) +		goto out_free; + +	DYN_FTRACE_TEST_NAME2(); + +	print_counts(); + +	if (trace_selftest_test_probe1_cnt != 2) +		goto out_free; +	if (trace_selftest_test_probe2_cnt != 2) +		goto out_free; +	if (trace_selftest_test_probe3_cnt != 4) +		goto out_free; + +	ret = 0; + out_free: +	unregister_ftrace_function(dyn_ops); +	kfree(dyn_ops); + + out: +	/* Purposely unregister in the same order */ +	unregister_ftrace_function(&test_probe1); +	unregister_ftrace_function(&test_probe2); +	unregister_ftrace_function(&test_probe3); +	if (cnt > 1) +		unregister_ftrace_function(tr->ops); +	ftrace_reset_array_ops(tr); + +	/* Make sure everything is off */ +	reset_counts(); +	DYN_FTRACE_TEST_NAME(); +	DYN_FTRACE_TEST_NAME(); + +	if (trace_selftest_test_probe1_cnt || +	    trace_selftest_test_probe2_cnt || +	    trace_selftest_test_probe3_cnt || +	    trace_selftest_test_global_cnt || +	    trace_selftest_test_dyn_cnt) +		ret = -1; + +	ftrace_enabled = save_ftrace_enabled; + +	return ret; +} +  /* Test 
dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, -					   struct trace_array *tr, -					   int (*func)(void)) +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, +						  struct trace_array *tr, +						  int (*func)(void))  {  	int save_ftrace_enabled = ftrace_enabled; -	int save_tracer_enabled = tracer_enabled;  	unsigned long count;  	char *func_name;  	int ret; @@ -118,7 +335,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	/* enable tracing, and record the filter function */  	ftrace_enabled = 1; -	tracer_enabled = 1;  	/* passed in by parameter to fool gcc from optimizing */  	func(); @@ -131,7 +347,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);  	/* filter only on our function */ -	ftrace_set_filter(func_name, strlen(func_name), 1); +	ftrace_set_global_filter(func_name, strlen(func_name), 1);  	/* enable tracing */  	ret = tracer_init(trace, tr); @@ -144,7 +360,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	msleep(100);  	/* we should have nothing in the buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	if (ret)  		goto out; @@ -165,49 +381,289 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,  	ftrace_enabled = 0;  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); -	trace->reset(tr); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	tracing_start();  	/* we should only have one item */  	if (!ret && count != 1) { +		trace->reset(tr);  		printk(KERN_CONT ".. filter failed count=%ld ..", count);  		ret = -1;  		goto out;  	} +	/* Test the ops with global tracing running */ +	ret = trace_selftest_ops(tr, 1); +	trace->reset(tr); +   out:  	ftrace_enabled = save_ftrace_enabled; -	tracer_enabled = save_tracer_enabled;  	/* Enable tracing on all functions again */ -	ftrace_set_filter(NULL, 0, 1); +	ftrace_set_global_filter(NULL, 0, 1); + +	/* Test the ops with global tracing off */ +	if (!ret) +		ret = trace_selftest_ops(tr, 2); + +	return ret; +} + +static int trace_selftest_recursion_cnt; +static void trace_selftest_test_recursion_func(unsigned long ip, +					       unsigned long pip, +					       struct ftrace_ops *op, +					       struct pt_regs *pt_regs) +{ +	/* +	 * This function is registered without the recursion safe flag. +	 * The ftrace infrastructure should provide the recursion +	 * protection. If not, this will crash the kernel! +	 */ +	if (trace_selftest_recursion_cnt++ > 10) +		return; +	DYN_FTRACE_TEST_NAME(); +} + +static void trace_selftest_test_recursion_safe_func(unsigned long ip, +						    unsigned long pip, +						    struct ftrace_ops *op, +						    struct pt_regs *pt_regs) +{ +	/* +	 * We said we would provide our own recursion. By calling +	 * this function again, we should recurse back into this function +	 * and count again. But this only happens if the arch supports +	 * all of ftrace features and nothing else is using the function +	 * tracing utility. 
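+	 * With FTRACE_OPS_FL_RECURSION_SAFE set on this ops, ftrace skips +	 * its own recursion guard, so the callback should be re-entered +	 * once more, giving the final count of 2 that the selftest checks.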
+	 */ +	if (trace_selftest_recursion_cnt++) +		return; +	DYN_FTRACE_TEST_NAME(); +} + +static struct ftrace_ops test_rec_probe = { +	.func			= trace_selftest_test_recursion_func, +}; + +static struct ftrace_ops test_recsafe_probe = { +	.func			= trace_selftest_test_recursion_safe_func, +	.flags			= FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static int +trace_selftest_function_recursion(void) +{ +	int save_ftrace_enabled = ftrace_enabled; +	char *func_name; +	int len; +	int ret; + +	/* The previous test PASSED */ +	pr_cont("PASSED\n"); +	pr_info("Testing ftrace recursion: "); + + +	/* enable tracing, and record the filter function */ +	ftrace_enabled = 1; + +	/* Handle PPC64 '.' name */ +	func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); +	len = strlen(func_name); + +	ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); +	if (ret) { +		pr_cont("*Could not set filter* "); +		goto out; +	} + +	ret = register_ftrace_function(&test_rec_probe); +	if (ret) { +		pr_cont("*could not register callback* "); +		goto out; +	} + +	DYN_FTRACE_TEST_NAME(); + +	unregister_ftrace_function(&test_rec_probe); + +	ret = -1; +	if (trace_selftest_recursion_cnt != 1) { +		pr_cont("*callback not called once (%d)* ", +			trace_selftest_recursion_cnt); +		goto out; +	} + +	trace_selftest_recursion_cnt = 1; + +	pr_cont("PASSED\n"); +	pr_info("Testing ftrace recursion safe: "); + +	ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); +	if (ret) { +		pr_cont("*Could not set filter* "); +		goto out; +	} + +	ret = register_ftrace_function(&test_recsafe_probe); +	if (ret) { +		pr_cont("*could not register callback* "); +		goto out; +	} + +	DYN_FTRACE_TEST_NAME(); + +	unregister_ftrace_function(&test_recsafe_probe); + +	ret = -1; +	if (trace_selftest_recursion_cnt != 2) { +		pr_cont("*callback not called expected 2 times (%d)* ", +			trace_selftest_recursion_cnt); +		goto out; +	} + +	ret = 0; +out: +	ftrace_enabled = save_ftrace_enabled;  	return ret;  }  #else  # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +# define trace_selftest_function_recursion() ({ 0; })  #endif /* CONFIG_DYNAMIC_FTRACE */ +static enum { +	TRACE_SELFTEST_REGS_START, +	TRACE_SELFTEST_REGS_FOUND, +	TRACE_SELFTEST_REGS_NOT_FOUND, +} trace_selftest_regs_stat; + +static void trace_selftest_test_regs_func(unsigned long ip, +					  unsigned long pip, +					  struct ftrace_ops *op, +					  struct pt_regs *pt_regs) +{ +	if (pt_regs) +		trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; +	else +		trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; +} + +static struct ftrace_ops test_regs_probe = { +	.func		= trace_selftest_test_regs_func, +	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, +}; + +static int +trace_selftest_function_regs(void) +{ +	int save_ftrace_enabled = ftrace_enabled; +	char *func_name; +	int len; +	int ret; +	int supported = 0; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +	supported = 1; +#endif + +	/* The previous test PASSED */ +	pr_cont("PASSED\n"); +	pr_info("Testing ftrace regs%s: ", +		!supported ? "(no arch support)" : ""); + +	/* enable tracing, and record the filter function */ +	ftrace_enabled = 1; + +	/* Handle PPC64 '.' name */ +	func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); +	len = strlen(func_name); + +	ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); +	/* +	 * If DYNAMIC_FTRACE is not set, then we just trace all functions. +	 * This test really doesn't care. 
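+	 * (Without DYNAMIC_FTRACE, ftrace_set_filter() returns -ENODEV, +	 * which the check below tolerates.)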
+	 */ +	if (ret && ret != -ENODEV) { +		pr_cont("*Could not set filter* "); +		goto out; +	} + +	ret = register_ftrace_function(&test_regs_probe); +	/* +	 * Now if the arch does not support passing regs, then this should +	 * have failed. +	 */ +	if (!supported) { +		if (!ret) { +			pr_cont("*registered save-regs without arch support* "); +			goto out; +		} +		test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; +		ret = register_ftrace_function(&test_regs_probe); +	} +	if (ret) { +		pr_cont("*could not register callback* "); +		goto out; +	} + + +	DYN_FTRACE_TEST_NAME(); + +	unregister_ftrace_function(&test_regs_probe); + +	ret = -1; + +	switch (trace_selftest_regs_stat) { +	case TRACE_SELFTEST_REGS_START: +		pr_cont("*callback never called* "); +		goto out; + +	case TRACE_SELFTEST_REGS_FOUND: +		if (supported) +			break; +		pr_cont("*callback received regs without arch support* "); +		goto out; + +	case TRACE_SELFTEST_REGS_NOT_FOUND: +		if (!supported) +			break; +		pr_cont("*callback received NULL regs* "); +		goto out; +	} + +	ret = 0; +out: +	ftrace_enabled = save_ftrace_enabled; + +	return ret; +} +  /*   * Simple verification test of ftrace function tracer.   * Enable ftrace, sleep 1/10 second, and then read the trace   * buffer to see if all is in order.   */ -int +__init int  trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  {  	int save_ftrace_enabled = ftrace_enabled; -	int save_tracer_enabled = tracer_enabled;  	unsigned long count;  	int ret; +#ifdef CONFIG_DYNAMIC_FTRACE +	if (ftrace_filter_param) { +		printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); +		return 0; +	} +#endif +  	/* make sure msleep has been recorded */  	msleep(1);  	/* start the tracing */  	ftrace_enabled = 1; -	tracer_enabled = 1;  	ret = tracer_init(trace, tr);  	if (ret) { @@ -222,7 +678,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  	ftrace_enabled = 0;  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -234,10 +690,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  	ret = trace_selftest_startup_dynamic_tracing(trace, tr,  						     DYN_FTRACE_TEST_NAME); +	if (ret) +		goto out; +	ret = trace_selftest_function_recursion(); +	if (ret) +		goto out; + +	ret = trace_selftest_function_regs();   out:  	ftrace_enabled = save_ftrace_enabled; -	tracer_enabled = save_tracer_enabled;  	/* kill ftrace totally if we failed */  	if (ret) @@ -253,8 +715,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  /* Maximum number of functions to trace before diagnosing a hang */  #define GRAPH_MAX_FUNC_TEST	100000000 -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);  static unsigned int graph_hang_thresh;  /* Wrap the real function entry probe to avoid possible hanging */ @@ -264,8 +724,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)  	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {  		ftrace_graph_stop();  		printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); -		if (ftrace_dump_on_oops) -			__ftrace_dump(false, DUMP_ALL); +		if (ftrace_dump_on_oops) { +			ftrace_dump(DUMP_ALL); +			/* ftrace_dump() disables tracing */ +			tracing_on(); +		}  		return 0;  	} @@ -276,18 +739,25 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)   * Pretty much 
the same than for the function tracer from which the selftest   * has been borrowed.   */ -int +__init int  trace_selftest_startup_function_graph(struct tracer *trace,  					struct trace_array *tr)  {  	int ret;  	unsigned long count; +#ifdef CONFIG_DYNAMIC_FTRACE +	if (ftrace_filter_param) { +		printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); +		return 0; +	} +#endif +  	/*  	 * Simulate the init() callback but we attach a watchdog callback  	 * to detect and recover from possible hangs  	 */ -	tracing_reset_online_cpus(tr); +	tracing_reset_online_cpus(&tr->trace_buffer);  	set_graph_array(tr);  	ret = register_ftrace_graph(&trace_graph_return,  				    &trace_graph_entry_watchdog); @@ -310,7 +780,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -337,7 +807,7 @@ out:  int  trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  { -	unsigned long save_max = tracing_max_latency; +	unsigned long save_max = tr->max_latency;  	unsigned long count;  	int ret; @@ -349,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  	}  	/* reset the max latency */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	/* disable interrupts for a bit */  	local_irq_disable();  	udelay(100); @@ -365,9 +835,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -376,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  		ret = -1;  	} -	tracing_max_latency = save_max; +	tr->max_latency = save_max;  	return ret;  } @@ -386,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)  int  trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  { -	unsigned long save_max = tracing_max_latency; +	unsigned long save_max = tr->max_latency;  	unsigned long count;  	int ret; @@ -411,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  	}  	/* reset the max latency */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	/* disable preemption for a bit */  	preempt_disable();  	udelay(100); @@ -427,9 +897,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. 
*/  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -438,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  		ret = -1;  	} -	tracing_max_latency = save_max; +	tr->max_latency = save_max;  	return ret;  } @@ -448,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)  int  trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)  { -	unsigned long save_max = tracing_max_latency; +	unsigned long save_max = tr->max_latency;  	unsigned long count;  	int ret; @@ -473,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	}  	/* reset the max latency */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	/* disable preemption and interrupts for a bit */  	preempt_disable(); @@ -493,11 +963,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (ret)  		goto out; -	ret = trace_test_buffer(&max_tr, &count); +	ret = trace_test_buffer(&tr->max_buffer, &count);  	if (ret)  		goto out; @@ -508,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	}  	/* do the test by disabling interrupts first this time */ -	tracing_max_latency = 0; +	tr->max_latency = 0;  	tracing_start();  	trace->start(tr); @@ -523,11 +993,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL);  	if (ret)  		goto out; -	ret = trace_test_buffer(&max_tr, &count); +	ret = trace_test_buffer(&tr->max_buffer, &count);  	if (!ret && !count) {  		printk(KERN_CONT ".. no entries found .."); @@ -539,7 +1009,7 @@ out:  	tracing_start();  out_no_start:  	trace->reset(tr); -	tracing_max_latency = save_max; +	tr->max_latency = save_max;  	return ret;  } @@ -557,11 +1027,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)  #ifdef CONFIG_SCHED_TRACER  static int trace_wakeup_test_thread(void *data)  { -	/* Make this a RT thread, doesn't need to be too high */ -	struct sched_param param = { .sched_priority = 5 }; +	/* Make this a -deadline thread */ +	static const struct sched_attr attr = { +		.sched_policy = SCHED_DEADLINE, +		.sched_runtime = 100000ULL, +		.sched_deadline = 10000000ULL, +		.sched_period = 10000000ULL +	};  	struct completion *x = data; -	sched_setscheduler(current, SCHED_FIFO, ¶m); +	sched_setattr(current, &attr);  	/* Make it know we have a new prio */  	complete(x); @@ -570,11 +1045,13 @@ static int trace_wakeup_test_thread(void *data)  	set_current_state(TASK_INTERRUPTIBLE);  	schedule(); +	complete(x); +  	/* we are awake, now wait to disappear */  	while (!kthread_should_stop()) {  		/* -		 * This is an RT task, do short sleeps to let -		 * others run. +		 * This will likely be the system top priority +		 * task, do short sleeps to let others run.  		 
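The wakeup selftest thread above is switched from SCHED_FIFO to SCHED_DEADLINE: the sched_attr requests 100 us of runtime out of every 10 ms period, with the deadline equal to the period, i.e. roughly a 1% CPU reservation that nevertheless preempts any RT task and so still exercises the wakeup-latency path. A minimal sketch of the basic admission rule such an attribute must satisfy (the helper name is illustrative; the real validation is performed by the deadline scheduling class when sched_setattr() is called):

	#include <linux/sched.h>

	/* Sketch: a SCHED_DEADLINE reservation is only accepted when
	 * runtime <= deadline <= period (all values in nanoseconds). */
	static bool dl_attr_sane(const struct sched_attr *attr)
	{
		return attr->sched_runtime <= attr->sched_deadline &&
		       attr->sched_deadline <= attr->sched_period;
	}
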
*/  		msleep(100);  	} @@ -585,23 +1062,23 @@ static int trace_wakeup_test_thread(void *data)  int  trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  { -	unsigned long save_max = tracing_max_latency; +	unsigned long save_max = tr->max_latency;  	struct task_struct *p; -	struct completion isrt; +	struct completion is_ready;  	unsigned long count;  	int ret; -	init_completion(&isrt); +	init_completion(&is_ready); -	/* create a high prio thread */ -	p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); +	/* create a -deadline thread */ +	p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");  	if (IS_ERR(p)) {  		printk(KERN_CONT "Failed to create ftrace wakeup test thread ");  		return -1;  	} -	/* make sure the thread is running at an RT prio */ -	wait_for_completion(&isrt); +	/* make sure the thread is running at -deadline policy */ +	wait_for_completion(&is_ready);  	/* start the tracing */  	ret = tracer_init(trace, tr); @@ -611,39 +1088,37 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)  	}  	/* reset the max latency */ -	tracing_max_latency = 0; +	tr->max_latency = 0; -	/* sleep to let the RT thread sleep too */ -	msleep(100); +	while (p->on_rq) { +		/* +		 * Sleep to make sure the -deadline thread is asleep too. +		 * On virtual machines we can't rely on timings, +		 * but we want to make sure this test still works. +		 */ +		msleep(100); +	} -	/* -	 * Yes this is slightly racy. It is possible that for some -	 * strange reason that the RT thread we created, did not -	 * call schedule for 100ms after doing the completion, -	 * and we do a wakeup on a task that already is awake. -	 * But that is extremely unlikely, and the worst thing that -	 * happens in such a case, is that we disable tracing. -	 * Honestly, if this race does happen something is horrible -	 * wrong with the system. -	 */ +	init_completion(&is_ready);  	wake_up_process(p); -	/* give a little time to let the thread wake up */ -	msleep(100); +	/* Wait for the task to wake up */ +	wait_for_completion(&is_ready);  	/* stop the tracing. */  	tracing_stop();  	/* check both trace buffers */ -	ret = trace_test_buffer(tr, NULL); +	ret = trace_test_buffer(&tr->trace_buffer, NULL); +	printk("ret = %d\n", ret);  	if (!ret) -		ret = trace_test_buffer(&max_tr, &count); +		ret = trace_test_buffer(&tr->max_buffer, &count);  	trace->reset(tr);  	tracing_start(); -	tracing_max_latency = save_max; +	tr->max_latency = save_max;  	/* kill the thread */  	kthread_stop(p); @@ -676,7 +1151,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr  	/* stop the tracing. */  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); @@ -708,7 +1183,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)  	/* stop the tracing. 
*/  	tracing_stop();  	/* check the trace buffer */ -	ret = trace_test_buffer(tr, &count); +	ret = trace_test_buffer(&tr->trace_buffer, &count);  	trace->reset(tr);  	tracing_start(); diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 54dd77cce5b..b4c475a0a48 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)  	/* used to call mcount */  	return 0;  } + +int DYN_FTRACE_TEST_NAME2(void) +{ +	/* used to call mcount */ +	return 0; +} diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4c5dead0c23..8a4e5cb66a4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,65 +13,122 @@  #include <linux/sysctl.h>  #include <linux/init.h>  #include <linux/fs.h> +#include <linux/magic.h> + +#include <asm/setup.h> +  #include "trace.h"  #define STACK_TRACE_ENTRIES 500 +#ifdef CC_USING_FENTRY +# define fentry		1 +#else +# define fentry		0 +#endif +  static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =  	 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };  static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. + */  static struct stack_trace max_stack_trace = { -	.max_entries		= STACK_TRACE_ENTRIES, -	.entries		= stack_dump_trace, +	.max_entries		= STACK_TRACE_ENTRIES - 1, +	.entries		= &stack_dump_trace[1],  };  static unsigned long max_stack_size;  static arch_spinlock_t max_stack_lock =  	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static int stack_trace_disabled __read_mostly;  static DEFINE_PER_CPU(int, trace_active);  static DEFINE_MUTEX(stack_sysctl_mutex);  int stack_tracer_enabled;  static int last_stack_tracer_enabled; -static inline void check_stack(void) +static inline void print_max_stack(void) +{ +	long i; +	int size; + +	pr_emerg("        Depth    Size   Location    (%d entries)\n" +			   "        -----    ----   --------\n", +			   max_stack_trace.nr_entries - 1); + +	for (i = 0; i < max_stack_trace.nr_entries; i++) { +		if (stack_dump_trace[i] == ULONG_MAX) +			break; +		if (i+1 == max_stack_trace.nr_entries || +				stack_dump_trace[i+1] == ULONG_MAX) +			size = stack_dump_index[i]; +		else +			size = stack_dump_index[i] - stack_dump_index[i+1]; + +		pr_emerg("%3ld) %8d   %5d   %pS\n", i, stack_dump_index[i], +				size, (void *)stack_dump_trace[i]); +	} +} + +static inline void +check_stack(unsigned long ip, unsigned long *stack)  { -	unsigned long this_size, flags; -	unsigned long *p, *top, *start; +	unsigned long this_size, flags; unsigned long *p, *top, *start; +	static int tracer_frame; +	int frame_size = ACCESS_ONCE(tracer_frame);  	int i; -	this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); +	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);  	this_size = THREAD_SIZE - this_size; +	/* Remove the frame of the tracer */ +	this_size -= frame_size;  	if (this_size <= max_stack_size)  		return;  	/* we do not handle interrupt stacks yet */ -	if (!object_is_on_stack(&this_size)) +	if (!object_is_on_stack(stack))  		return;  	local_irq_save(flags);  	arch_spin_lock(&max_stack_lock); +	/* In case another CPU set the tracer_frame on us */ +	if (unlikely(!frame_size)) +		this_size -= tracer_frame; +  	/* a race could have already updated it */  	if (this_size <= max_stack_size)  		goto out;  	max_stack_size = this_size; -	
max_stack_trace.nr_entries	= 0; -	max_stack_trace.skip		= 3; +	max_stack_trace.nr_entries = 0; + +	if (using_ftrace_ops_list_func()) +		max_stack_trace.skip = 4; +	else +		max_stack_trace.skip = 3;  	save_stack_trace(&max_stack_trace);  	/* +	 * Add the passed in ip from the function tracer. +	 * Searching for this on the stack will skip over +	 * most of the overhead from the stack tracer itself. +	 */ +	stack_dump_trace[0] = ip; +	max_stack_trace.nr_entries++; + +	/*  	 * Now find where in the stack these are.  	 */  	i = 0; -	start = &this_size; +	start = stack;  	top = (unsigned long *)  		(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -95,6 +152,18 @@ static inline void check_stack(void)  				found = 1;  				/* Start the search from here */  				start = p + 1; +				/* +				 * We do not want to show the overhead +				 * of the stack tracer stack in the +				 * max stack. If we haven't figured +				 * out what that is, then figure it out +				 * now. +				 */ +				if (unlikely(!tracer_frame) && i == 1) { +					tracer_frame = (p - stack) * +						sizeof(unsigned long); +					max_stack_size -= tracer_frame; +				}  			}  		} @@ -102,19 +171,24 @@ static inline void check_stack(void)  			i++;  	} +	if ((current != &init_task && +		*(end_of_stack(current)) != STACK_END_MAGIC)) { +		print_max_stack(); +		BUG(); +	} +   out:  	arch_spin_unlock(&max_stack_lock);  	local_irq_restore(flags);  }  static void -stack_trace_call(unsigned long ip, unsigned long parent_ip) +stack_trace_call(unsigned long ip, unsigned long parent_ip, +		 struct ftrace_ops *op, struct pt_regs *pt_regs)  { +	unsigned long stack;  	int cpu; -	if (unlikely(!ftrace_enabled || stack_trace_disabled)) -		return; -  	preempt_disable_notrace();  	cpu = raw_smp_processor_id(); @@ -122,7 +196,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)  	if (per_cpu(trace_active, cpu)++ != 0)  		goto out; -	check_stack(); +	/* +	 * When fentry is used, the traced function does not get +	 * its stack frame set up, and we lose the parent. +	 * The ip is pretty useless because the function tracer +	 * was called before that function set up its stack frame. +	 * In this case, we use the parent ip. +	 * +	 * By adding the return address of either the parent ip +	 * or the current ip we can disregard most of the stack usage +	 * caused by the stack tracer itself. +	 * +	 * The function tracer always reports the address of where the +	 * mcount call was, but the stack will hold the return address. 
+	 */ +	if (fentry) +		ip = parent_ip; +	else +		ip += MCOUNT_INSN_SIZE; + +	check_stack(ip, &stack);   out:  	per_cpu(trace_active, cpu)--; @@ -133,6 +226,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)  static struct ftrace_ops trace_ops __read_mostly =  {  	.func = stack_trace_call, +	.flags = FTRACE_OPS_FL_RECURSION_SAFE,  };  static ssize_t @@ -155,20 +249,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,  {  	long *ptr = filp->private_data;  	unsigned long val, flags; -	char buf[64];  	int ret;  	int cpu; -	if (count >= sizeof(buf)) -		return -EINVAL; - -	if (copy_from_user(&buf, ubuf, count)) -		return -EFAULT; - -	buf[count] = 0; - -	ret = strict_strtoul(buf, 10, &val); -	if (ret < 0) +	ret = kstrtoul_from_user(ubuf, count, 10, &val); +	if (ret)  		return ret;  	local_irq_save(flags); @@ -319,6 +404,21 @@ static const struct file_operations stack_trace_fops = {  	.release	= seq_release,  }; +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ +	return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, +				 inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { +	.open = stack_trace_filter_open, +	.read = seq_read, +	.write = ftrace_filter_write, +	.llseek = tracing_lseek, +	.release = ftrace_regex_release, +}; +  int  stack_trace_sysctl(struct ctl_table *table, int write,  		   void __user *buffer, size_t *lenp, @@ -346,8 +446,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,  	return ret;  } +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; +  static __init int enable_stacktrace(char *str)  { +	if (strncmp(str, "_filter=", 8) == 0) +		strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); +  	stack_tracer_enabled = 1;  	last_stack_tracer_enabled = 1;  	return 1; @@ -359,6 +464,8 @@ static __init int stack_trace_init(void)  	struct dentry *d_tracer;  	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0;  	trace_create_file("stack_max_size", 0644, d_tracer,  			&max_stack_size, &stack_max_size_fops); @@ -366,6 +473,12 @@ static __init int stack_trace_init(void)  	trace_create_file("stack_trace", 0444, d_tracer,  			NULL, &stack_trace_fops); +	trace_create_file("stack_trace_filter", 0444, d_tracer, +			NULL, &stack_trace_filter_fops); + +	if (stack_trace_filter_buf[0]) +		ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); +  	if (stack_tracer_enabled)  		register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e7..7af67360b33 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);  /* The root directory for all stat files */  static struct dentry		*stat_dir; -/* - * Iterate through the rbtree using a post order traversal path - * to release the next node. - * It won't necessary release one at each iteration - * but it will at least advance closer to the next one - * to be released. 
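stack_max_size_write() above also shows the kstrtoul_from_user() helper, which replaces the old copy_from_user()/NUL-terminate/strict_strtoul() sequence with a single call that copies, terminates and parses in one step. A minimal sketch of a debugfs-style write handler using the same pattern (the file and variable names here are illustrative, not part of the patch):

	#include <linux/fs.h>
	#include <linux/kernel.h>
	#include <linux/uaccess.h>

	static unsigned long example_threshold;

	/* Sketch: parse an unsigned decimal value written by user space. */
	static ssize_t example_write(struct file *filp, const char __user *ubuf,
				     size_t count, loff_t *ppos)
	{
		unsigned long val;
		int ret;

		ret = kstrtoul_from_user(ubuf, count, 10, &val);
		if (ret)
			return ret;

		example_threshold = val;
		*ppos += count;
		return count;
	}
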
- */ -static struct rb_node *release_next(struct tracer_stat *ts, -				    struct rb_node *node) +static void __reset_stat_session(struct stat_session *session)  { -	struct stat_node *snode; -	struct rb_node *parent = rb_parent(node); - -	if (node->rb_left) -		return node->rb_left; -	else if (node->rb_right) -		return node->rb_right; -	else { -		if (!parent) -			; -		else if (parent->rb_left == node) -			parent->rb_left = NULL; -		else -			parent->rb_right = NULL; +	struct stat_node *snode, *n; -		snode = container_of(node, struct stat_node, node); -		if (ts->stat_release) -			ts->stat_release(snode->stat); +	rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { +		if (session->ts->stat_release) +			session->ts->stat_release(snode->stat);  		kfree(snode); - -		return parent;  	} -} - -static void __reset_stat_session(struct stat_session *session) -{ -	struct rb_node *node = session->stat_root.rb_node; - -	while (node) -		node = release_next(session->ts, node);  	session->stat_root = RB_ROOT;  } @@ -307,6 +276,8 @@ static int tracing_stat_init(void)  	struct dentry *d_tracing;  	d_tracing = tracing_init_dentry(); +	if (!d_tracing) +		return 0;  	stat_dir = debugfs_create_dir("trace_stat", d_tracing);  	if (!stat_dir) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f0cfb..759d5e00451 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,7 +1,9 @@  #include <trace/syscall.h>  #include <trace/events/syscalls.h> +#include <linux/syscalls.h>  #include <linux/slab.h>  #include <linux/kernel.h> +#include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */  #include <linux/ftrace.h>  #include <linux/perf_event.h>  #include <asm/syscall.h> @@ -10,21 +12,11 @@  #include "trace.h"  static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);  static int syscall_enter_register(struct ftrace_event_call *event, -				 enum trace_reg type); +				 enum trace_reg type, void *data);  static int syscall_exit_register(struct ftrace_event_call *event, -				 enum trace_reg type); - -static int syscall_enter_define_fields(struct ftrace_event_call *call); -static int syscall_exit_define_fields(struct ftrace_event_call *call); - -/* All syscall exit events have the same fields */ -static LIST_HEAD(syscall_exit_fields); +				 enum trace_reg type, void *data);  static struct list_head *  syscall_get_enter_fields(struct ftrace_event_call *call) @@ -34,61 +26,74 @@ syscall_get_enter_fields(struct ftrace_event_call *call)  	return &entry->enter_fields;  } -static struct list_head * -syscall_get_exit_fields(struct ftrace_event_call *call) -{ -	return &syscall_exit_fields; -} - -struct trace_event_functions enter_syscall_print_funcs = { -	.trace                  = print_syscall_enter, -}; +extern struct syscall_metadata *__start_syscalls_metadata[]; +extern struct syscall_metadata *__stop_syscalls_metadata[]; -struct trace_event_functions exit_syscall_print_funcs = { -	.trace                  = print_syscall_exit, -}; - -struct ftrace_event_class event_class_syscall_enter = { -	.system			= "syscalls", -	.reg			= syscall_enter_register, -	.define_fields		= syscall_enter_define_fields, -	.get_fields		= syscall_get_enter_fields, -	.raw_init		= init_syscall_trace, -}; +static struct syscall_metadata **syscalls_metadata; -struct ftrace_event_class 
event_class_syscall_exit = { -	.system			= "syscalls", -	.reg			= syscall_exit_register, -	.define_fields		= syscall_exit_define_fields, -	.get_fields		= syscall_get_exit_fields, -	.raw_init		= init_syscall_trace, -}; +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ +	/* +	 * Only compare after the "sys" prefix. Archs that use +	 * syscall wrappers may have syscalls symbols aliases prefixed +	 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted +	 * mismatch. +	 */ +	return !strcmp(sym + 3, name + 3); +} +#endif -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; +#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +/* + * Some architectures that allow for 32bit applications + * to run on a 64bit kernel, do not map the syscalls for + * the 32bit tasks the same as they do for 64bit tasks. + * + *     *cough*x86*cough* + * + * In such a case, instead of reporting the wrong syscalls, + * simply ignore them. + * + * For an arch to ignore the compat syscalls it needs to + * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * define the function arch_trace_is_compat_syscall() to let + * the tracing system know that it should ignore it. + */ +static int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ +	if (unlikely(arch_trace_is_compat_syscall(regs))) +		return -1; -static struct syscall_metadata **syscalls_metadata; +	return syscall_get_nr(task, regs); +} +#else +static inline int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ +	return syscall_get_nr(task, regs); +} +#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ -static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +static __init struct syscall_metadata * +find_syscall_meta(unsigned long syscall)  { -	struct syscall_metadata *start; -	struct syscall_metadata *stop; +	struct syscall_metadata **start; +	struct syscall_metadata **stop;  	char str[KSYM_SYMBOL_LEN]; -	start = (struct syscall_metadata *)__start_syscalls_metadata; -	stop = (struct syscall_metadata *)__stop_syscalls_metadata; +	start = __start_syscalls_metadata; +	stop = __stop_syscalls_metadata;  	kallsyms_lookup(syscall, NULL, NULL, NULL, str); +	if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) +		return NULL; +  	for ( ; start < stop; start++) { -		/* -		 * Only compare after the "sys" prefix. Archs that use -		 * syscall wrappers may have syscalls symbols aliases prefixed -		 * with "SyS" instead of "sys", leading to an unwanted -		 * mismatch. 
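The default arch_syscall_match_sym_name() above compares only the part past the three-character prefix, so wrapper aliases such as "SyS_openat" (emitted by the sign-extending syscall wrappers on some 64-bit architectures) still match "sys_openat"; architectures whose symbols carry an extra leading dot provide their own version of the hook instead. A small user-space sketch of the comparison rule:

	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	/* Sketch: compare only the part after the "sys"/"SyS" prefix. */
	static bool match_sym_name(const char *sym, const char *name)
	{
		return !strcmp(sym + 3, name + 3);
	}

	int main(void)
	{
		printf("%d\n", match_sym_name("SyS_openat", "sys_openat")); /* 1 */
		printf("%d\n", match_sym_name("sys_read", "sys_write"));    /* 0 */
		return 0;
	}
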
-		 */ -		if (start->name && !strcmp(start->name + 3, str + 3)) -			return start; +		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) +			return *start;  	}  	return NULL;  } @@ -101,7 +106,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)  	return syscalls_metadata[nr];  } -enum print_line_t +static enum print_line_t  print_syscall_enter(struct trace_iterator *iter, int flags,  		    struct trace_event *event)  { @@ -154,7 +159,7 @@ end:  	return TRACE_TYPE_HANDLED;  } -enum print_line_t +static enum print_line_t  print_syscall_exit(struct trace_iterator *iter, int flags,  		   struct trace_event *event)  { @@ -170,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,  	entry = syscall_nr_to_meta(syscall);  	if (!entry) { -		trace_seq_printf(s, "\n"); +		trace_seq_putc(s, '\n');  		return TRACE_TYPE_HANDLED;  	} @@ -195,8 +200,8 @@ extern char *__bad_type_size(void);  		#type, #name, offsetof(typeof(trace), name),		\  		sizeof(trace.name), is_signed_type(type) -static -int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) +static int __init +__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)  {  	int i;  	int pos = 0; @@ -223,7 +228,7 @@ int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)  	return pos;  } -static int set_syscall_print_fmt(struct ftrace_event_call *call) +static int __init set_syscall_print_fmt(struct ftrace_event_call *call)  {  	char *print_fmt;  	int len; @@ -248,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call)  	return 0;  } -static void free_syscall_print_fmt(struct ftrace_event_call *call) +static void __init free_syscall_print_fmt(struct ftrace_event_call *call)  {  	struct syscall_metadata *entry = call->data; @@ -256,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)  		kfree(call->print_fmt);  } -static int syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct ftrace_event_call *call)  {  	struct syscall_trace_enter trace;  	struct syscall_metadata *meta = call->data; @@ -279,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)  	return ret;  } -static int syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct ftrace_event_call *call)  {  	struct syscall_trace_exit trace;  	int ret; @@ -294,19 +299,29 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)  	return ret;  } -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  { +	struct trace_array *tr = data; +	struct ftrace_event_file *ftrace_file;  	struct syscall_trace_enter *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; -	int size; +	unsigned long irq_flags; +	int pc;  	int syscall_nr; +	int size; -	syscall_nr = syscall_get_nr(current, regs); +	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, enabled_enter_syscalls)) + +	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ +	ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); +	if (!ftrace_file) +		return; + +	if (ftrace_trigger_soft_disabled(ftrace_file))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr); @@ -315,8 +330,12 @@ void 
ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; -	event = trace_current_buffer_lock_reserve(&buffer, -			sys_data->enter_event->event.type, size, 0, 0); +	local_save_flags(irq_flags); +	pc = preempt_count(); + +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, +			sys_data->enter_event->event.type, size, irq_flags, pc);  	if (!event)  		return; @@ -324,31 +343,45 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	entry->nr = syscall_nr;  	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); -	if (!filter_current_check_discard(buffer, sys_data->enter_event, -					  entry, event)) -		trace_current_buffer_unlock_commit(buffer, event, 0, 0); +	event_trigger_unlock_commit(ftrace_file, buffer, event, entry, +				    irq_flags, pc);  } -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  { +	struct trace_array *tr = data; +	struct ftrace_event_file *ftrace_file;  	struct syscall_trace_exit *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; +	unsigned long irq_flags; +	int pc;  	int syscall_nr; -	syscall_nr = syscall_get_nr(current, regs); +	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, enabled_exit_syscalls)) + +	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ +	ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); +	if (!ftrace_file) +		return; + +	if (ftrace_trigger_soft_disabled(ftrace_file))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr);  	if (!sys_data)  		return; -	event = trace_current_buffer_lock_reserve(&buffer, -			sys_data->exit_event->event.type, sizeof(*entry), 0, 0); +	local_save_flags(irq_flags); +	pc = preempt_count(); + +	buffer = tr->trace_buffer.buffer; +	event = trace_buffer_lock_reserve(buffer, +			sys_data->exit_event->event.type, sizeof(*entry), +			irq_flags, pc);  	if (!event)  		return; @@ -356,82 +389,97 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	entry->nr = syscall_nr;  	entry->ret = syscall_get_return_value(current, regs); -	if (!filter_current_check_discard(buffer, sys_data->exit_event, -					  entry, event)) -		trace_current_buffer_unlock_commit(buffer, event, 0, 0); +	event_trigger_unlock_commit(ftrace_file, buffer, event, entry, +				    irq_flags, pc);  } -int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_file *file, +				   struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int ret = 0;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr; -	if (num < 0 || num >= NR_syscalls) +	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return -ENOSYS;  	mutex_lock(&syscall_trace_lock); -	if (!sys_refcount_enter) -		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); +	if (!tr->sys_refcount_enter) +		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);  	if (!ret) { -		set_bit(num, enabled_enter_syscalls); -		sys_refcount_enter++; +		rcu_assign_pointer(tr->enter_syscall_files[num], file); +		tr->sys_refcount_enter++;  	}  	mutex_unlock(&syscall_trace_lock);  	return ret;  } -void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct 
ftrace_event_file *file, +				      struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr; -	if (num < 0 || num >= NR_syscalls) +	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return;  	mutex_lock(&syscall_trace_lock); -	sys_refcount_enter--; -	clear_bit(num, enabled_enter_syscalls); -	if (!sys_refcount_enter) -		unregister_trace_sys_enter(ftrace_syscall_enter, NULL); +	tr->sys_refcount_enter--; +	rcu_assign_pointer(tr->enter_syscall_files[num], NULL); +	if (!tr->sys_refcount_enter) +		unregister_trace_sys_enter(ftrace_syscall_enter, tr);  	mutex_unlock(&syscall_trace_lock);  } -int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_file *file, +				  struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int ret = 0;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr; -	if (num < 0 || num >= NR_syscalls) +	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return -ENOSYS;  	mutex_lock(&syscall_trace_lock); -	if (!sys_refcount_exit) -		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); +	if (!tr->sys_refcount_exit) +		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);  	if (!ret) { -		set_bit(num, enabled_exit_syscalls); -		sys_refcount_exit++; +		rcu_assign_pointer(tr->exit_syscall_files[num], file); +		tr->sys_refcount_exit++;  	}  	mutex_unlock(&syscall_trace_lock);  	return ret;  } -void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_file *file, +				     struct ftrace_event_call *call)  { +	struct trace_array *tr = file->tr;  	int num;  	num = ((struct syscall_metadata *)call->data)->syscall_nr; -	if (num < 0 || num >= NR_syscalls) +	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))  		return;  	mutex_lock(&syscall_trace_lock); -	sys_refcount_exit--; -	clear_bit(num, enabled_exit_syscalls); -	if (!sys_refcount_exit) -		unregister_trace_sys_exit(ftrace_syscall_exit, NULL); +	tr->sys_refcount_exit--; +	rcu_assign_pointer(tr->exit_syscall_files[num], NULL); +	if (!tr->sys_refcount_exit) +		unregister_trace_sys_exit(ftrace_syscall_exit, tr);  	mutex_unlock(&syscall_trace_lock);  } -int init_syscall_trace(struct ftrace_event_call *call) +static int __init init_syscall_trace(struct ftrace_event_call *call)  {  	int id; +	int num; + +	num = ((struct syscall_metadata *)call->data)->syscall_nr; +	if (num < 0 || num >= NR_syscalls) { +		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", +				((struct syscall_metadata *)call->data)->name); +		return -ENOSYS; +	}  	if (set_syscall_print_fmt(call) < 0)  		return -ENOMEM; @@ -446,19 +494,43 @@ int init_syscall_trace(struct ftrace_event_call *call)  	return id;  } -unsigned long __init arch_syscall_addr(int nr) +struct trace_event_functions enter_syscall_print_funcs = { +	.trace		= print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { +	.trace		= print_syscall_exit, +}; + +struct ftrace_event_class __refdata event_class_syscall_enter = { +	.system		= "syscalls", +	.reg		= syscall_enter_register, +	.define_fields	= syscall_enter_define_fields, +	.get_fields	= syscall_get_enter_fields, +	.raw_init	= init_syscall_trace, +}; + +struct ftrace_event_class __refdata event_class_syscall_exit = { +	.system		= "syscalls", +	.reg		= syscall_exit_register, +	.define_fields	= syscall_exit_define_fields, +	.fields		= 
LIST_HEAD_INIT(event_class_syscall_exit.fields), +	.raw_init	= init_syscall_trace, +}; + +unsigned long __init __weak arch_syscall_addr(int nr)  {  	return (unsigned long)sys_call_table[nr];  } -int __init init_ftrace_syscalls(void) +static int __init init_ftrace_syscalls(void)  {  	struct syscall_metadata *meta;  	unsigned long addr;  	int i; -	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * -					NR_syscalls, GFP_KERNEL); +	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), +				    GFP_KERNEL);  	if (!syscalls_metadata) {  		WARN_ON(1);  		return -ENOMEM; @@ -476,7 +548,7 @@ int __init init_ftrace_syscalls(void)  	return 0;  } -core_initcall(init_ftrace_syscalls); +early_initcall(init_ftrace_syscalls);  #ifdef CONFIG_PERF_EVENTS @@ -494,7 +566,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	int rctx;  	int size; -	syscall_nr = syscall_get_nr(current, regs); +	syscall_nr = trace_get_syscall_nr(current, regs); +	if (syscall_nr < 0) +		return;  	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))  		return; @@ -502,15 +576,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	if (!sys_data)  		return; +	head = this_cpu_ptr(sys_data->enter_event->perf_events); +	if (hlist_empty(head)) +		return; +  	/* get the size after alignment with the u32 buffer size field */  	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);  	size = ALIGN(size + sizeof(u32), sizeof(u64));  	size -= sizeof(u32); -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, -		      "perf buffer not large enough")) -		return; -  	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,  				sys_data->enter_event->event.type, regs, &rctx);  	if (!rec) @@ -519,12 +593,10 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	rec->nr = syscall_nr;  	syscall_get_arguments(current, regs, 0, sys_data->nb_args,  			       (unsigned long *)&rec->args); - -	head = this_cpu_ptr(sys_data->enter_event->perf_events); -	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); +	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);  } -int perf_sysenter_enable(struct ftrace_event_call *call) +static int perf_sysenter_enable(struct ftrace_event_call *call)  {  	int ret = 0;  	int num; @@ -545,7 +617,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)  	return ret;  } -void perf_sysenter_disable(struct ftrace_event_call *call) +static void perf_sysenter_disable(struct ftrace_event_call *call)  {  	int num; @@ -568,7 +640,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	int rctx;  	int size; -	syscall_nr = syscall_get_nr(current, regs); +	syscall_nr = trace_get_syscall_nr(current, regs); +	if (syscall_nr < 0) +		return;  	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))  		return; @@ -576,18 +650,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	if (!sys_data)  		return; +	head = this_cpu_ptr(sys_data->exit_event->perf_events); +	if (hlist_empty(head)) +		return; +  	/* We can probably do that at build time */  	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));  	size -= sizeof(u32); -	/* -	 * Impossible, but be paranoid with the future -	 * How to put this check outside runtime? 
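init_ftrace_syscalls() above also moves from kzalloc(sizeof(*p) * NR_syscalls, ...) to kcalloc(NR_syscalls, sizeof(*p), ...); the two are equivalent for sane sizes, but kcalloc() additionally refuses allocations whose count * size product would overflow, which is the preferred pattern for array allocations. A minimal sketch (the struct name is illustrative):

	#include <linux/slab.h>

	struct meta;

	/* Sketch: zeroed array of n pointers; kcalloc() returns NULL if
	 * n * sizeof(struct meta *) would overflow instead of wrapping. */
	static struct meta **alloc_meta_table(int n)
	{
		return kcalloc(n, sizeof(struct meta *), GFP_KERNEL);
	}
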
-	 */ -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, -		"exit event has grown above perf buffer size")) -		return; -  	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,  				sys_data->exit_event->event.type, regs, &rctx);  	if (!rec) @@ -595,12 +665,10 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	rec->nr = syscall_nr;  	rec->ret = syscall_get_return_value(current, regs); - -	head = this_cpu_ptr(sys_data->exit_event->perf_events); -	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); +	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);  } -int perf_sysexit_enable(struct ftrace_event_call *call) +static int perf_sysexit_enable(struct ftrace_event_call *call)  {  	int ret = 0;  	int num; @@ -621,7 +689,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)  	return ret;  } -void perf_sysexit_disable(struct ftrace_event_call *call) +static void perf_sysexit_disable(struct ftrace_event_call *call)  {  	int num; @@ -638,13 +706,15 @@ void perf_sysexit_disable(struct ftrace_event_call *call)  #endif /* CONFIG_PERF_EVENTS */  static int syscall_enter_register(struct ftrace_event_call *event, -				 enum trace_reg type) +				 enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return reg_event_syscall_enter(event); +		return reg_event_syscall_enter(file, event);  	case TRACE_REG_UNREGISTER: -		unreg_event_syscall_enter(event); +		unreg_event_syscall_enter(file, event);  		return 0;  #ifdef CONFIG_PERF_EVENTS @@ -653,19 +723,26 @@ static int syscall_enter_register(struct ftrace_event_call *event,  	case TRACE_REG_PERF_UNREGISTER:  		perf_sysenter_disable(event);  		return 0; +	case TRACE_REG_PERF_OPEN: +	case TRACE_REG_PERF_CLOSE: +	case TRACE_REG_PERF_ADD: +	case TRACE_REG_PERF_DEL: +		return 0;  #endif  	}  	return 0;  }  static int syscall_exit_register(struct ftrace_event_call *event, -				 enum trace_reg type) +				 enum trace_reg type, void *data)  { +	struct ftrace_event_file *file = data; +  	switch (type) {  	case TRACE_REG_REGISTER: -		return reg_event_syscall_exit(event); +		return reg_event_syscall_exit(file, event);  	case TRACE_REG_UNREGISTER: -		unreg_event_syscall_exit(event); +		unreg_event_syscall_exit(file, event);  		return 0;  #ifdef CONFIG_PERF_EVENTS @@ -674,6 +751,11 @@ static int syscall_exit_register(struct ftrace_event_call *event,  	case TRACE_REG_PERF_UNREGISTER:  		perf_sysexit_disable(event);  		return 0; +	case TRACE_REG_PERF_OPEN: +	case TRACE_REG_PERF_CLOSE: +	case TRACE_REG_PERF_ADD: +	case TRACE_REG_PERF_DEL: +		return 0;  #endif  	}  	return 0; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c new file mode 100644 index 00000000000..3c9b97e6b1f --- /dev/null +++ b/kernel/trace/trace_uprobe.c @@ -0,0 +1,1340 @@ +/* + * uprobes-based tracing events + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
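The size arithmetic in the perf_syscall_* handlers above (size = ALIGN(payload + sizeof(u32), sizeof(u64)); size -= sizeof(u32)) computes how large the record must be so that, together with the u32 size field the perf output code prepends to raw samples, the payload stays u64-aligned. A worked sketch of the same calculation:

	#include <linux/kernel.h>

	/* Sketch: pick a record size such that (size + sizeof(u32)) is a
	 * multiple of sizeof(u64), matching the perf raw-sample layout.
	 * e.g. payload 18 -> ALIGN(22, 8) = 24 -> returns 20;
	 *      payload 20 -> ALIGN(24, 8) = 24 -> returns 20. */
	static int perf_aligned_size(int payload)
	{
		int size = ALIGN(payload + sizeof(u32), sizeof(u64));

		return size - sizeof(u32);
	}
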
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + * + * Copyright (C) IBM Corporation, 2010-2012 + * Author:	Srikar Dronamraju <srikar@linux.vnet.ibm.com> + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include <linux/namei.h> +#include <linux/string.h> + +#include "trace_probe.h" + +#define UPROBE_EVENT_SYSTEM	"uprobes" + +struct uprobe_trace_entry_head { +	struct trace_entry	ent; +	unsigned long		vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return)			\ +	(sizeof(struct uprobe_trace_entry_head) +	\ +	 sizeof(unsigned long) * (is_return ? 2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return)		\ +	((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) + +struct trace_uprobe_filter { +	rwlock_t		rwlock; +	int			nr_systemwide; +	struct list_head	perf_events; +}; + +/* + * uprobe event core functions + */ +struct trace_uprobe { +	struct list_head		list; +	struct trace_uprobe_filter	filter; +	struct uprobe_consumer		consumer; +	struct inode			*inode; +	char				*filename; +	unsigned long			offset; +	unsigned long			nhit; +	struct trace_probe		tp; +}; + +#define SIZEOF_TRACE_UPROBE(n)				\ +	(offsetof(struct trace_uprobe, tp.args) +	\ +	(sizeof(struct probe_arg) * (n))) + +static int register_uprobe_event(struct trace_uprobe *tu); +static int unregister_uprobe_event(struct trace_uprobe *tu); + +static DEFINE_MUTEX(uprobe_lock); +static LIST_HEAD(uprobe_list); + +struct uprobe_dispatch_data { +	struct trace_uprobe	*tu; +	unsigned long		bp_addr; +}; + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, +				unsigned long func, struct pt_regs *regs); + +#ifdef CONFIG_STACK_GROWSUP +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ +	return addr - (n * sizeof(long)); +} +#else +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ +	return addr + (n * sizeof(long)); +} +#endif + +static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) +{ +	unsigned long ret; +	unsigned long addr = user_stack_pointer(regs); + +	addr = adjust_stack_addr(addr, n); + +	if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) +		return 0; + +	return ret; +} + +/* + * Uprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type)					\ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,		\ +					 void *offset, void *dest)	\ +{									\ +	*(type *)dest = (type)get_user_stack_nth(regs,			\ +					      ((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string	NULL +#define fetch_stack_string_size	NULL + +#define DEFINE_FETCH_memory(type)					\ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,		\ +					  void *addr, void *dest)	\ +{									\ +	type retval;							\ +	void __user *vaddr = (void __force __user *) addr;		\ +									\ +	if (copy_from_user(&retval, vaddr, sizeof(type)))		\ +		*(type *)dest = 0;					\ +	else								\ +		*(type *) dest = retval;				\ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. 
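The DEFINE_FETCH_* macros above stamp out one small fetch routine per access method and type; seeing a single expansion makes the generated code easier to follow. Roughly what DEFINE_FETCH_memory(u32) expands to (modulo the exact FETCH_FUNC_NAME() name mangling from trace_probe.h):

	#include <linux/ptrace.h>
	#include <linux/uaccess.h>

	/* Approximate expansion: copy a u32 from a user-space address into
	 * dest, storing 0 if the access faults. */
	static void fetch_memory_u32(struct pt_regs *regs, void *addr, void *dest)
	{
		u32 retval;
		void __user *vaddr = (void __force __user *)addr;

		if (copy_from_user(&retval, vaddr, sizeof(u32)))
			*(u32 *)dest = 0;
		else
			*(u32 *)dest = retval;
	}
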
+ */ +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, +					    void *addr, void *dest) +{ +	long ret; +	u32 rloc = *(u32 *)dest; +	int maxlen  = get_rloc_len(rloc); +	u8 *dst = get_rloc_data(dest); +	void __user *src = (void __force __user *) addr; + +	if (!maxlen) +		return; + +	ret = strncpy_from_user(dst, src, maxlen); + +	if (ret < 0) {	/* Failed to fetch string */ +		((u8 *)get_rloc_data(dest))[0] = '\0'; +		*(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); +	} else { +		*(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); +	} +} + +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, +						 void *addr, void *dest) +{ +	int len; +	void __user *vaddr = (void __force __user *) addr; + +	len = strnlen_user(vaddr, MAX_STRING_SIZE); + +	if (len == 0 || len > MAX_STRING_SIZE)  /* Failed to check length */ +		*(u32 *)dest = 0; +	else +		*(u32 *)dest = len; +} + +static unsigned long translate_user_vaddr(void *file_offset) +{ +	unsigned long base_addr; +	struct uprobe_dispatch_data *udd; + +	udd = (void *) current->utask->vaddr; + +	base_addr = udd->bp_addr - udd->tu->offset; +	return base_addr + (unsigned long)file_offset; +} + +#define DEFINE_FETCH_file_offset(type)					\ +static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,	\ +					       void *offset, void *dest)\ +{									\ +	void *vaddr = (void *)translate_user_vaddr(offset);		\ +									\ +	FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest);		\ +} +DEFINE_BASIC_FETCH_FUNCS(file_offset) +DEFINE_FETCH_file_offset(string) +DEFINE_FETCH_file_offset(string_size) + +/* Fetch type information table */ +const struct fetch_type uprobes_fetch_type_table[] = { +	/* Special types */ +	[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, +					sizeof(u32), 1, "__data_loc char[]"), +	[FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, +					string_size, sizeof(u32), 0, "u32"), +	/* Basic types */ +	ASSIGN_FETCH_TYPE(u8,  u8,  0), +	ASSIGN_FETCH_TYPE(u16, u16, 0), +	ASSIGN_FETCH_TYPE(u32, u32, 0), +	ASSIGN_FETCH_TYPE(u64, u64, 0), +	ASSIGN_FETCH_TYPE(s8,  u8,  1), +	ASSIGN_FETCH_TYPE(s16, u16, 1), +	ASSIGN_FETCH_TYPE(s32, u32, 1), +	ASSIGN_FETCH_TYPE(s64, u64, 1), + +	ASSIGN_FETCH_TYPE_END +}; + +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ +	rwlock_init(&filter->rwlock); +	filter->nr_systemwide = 0; +	INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ +	return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ +	return tu->consumer.ret_handler != NULL; +} + +/* + * Allocate new trace_uprobe and initialize it (including uprobes). 
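translate_user_vaddr() above turns a file offset into a runtime address: the breakpoint just hit at bp_addr, which corresponds to file offset tu->offset, so the mapping base is bp_addr - tu->offset and any other file offset lives at base plus that offset. A worked sketch with made-up numbers:

	/* Sketch: a probe at file offset 0x4a0 that hits at 0x7f3c000004a0
	 * implies a base of 0x7f3c00000000, so file offset 0x1000 maps to
	 * 0x7f3c00001000 at run time. */
	static unsigned long vaddr_of(unsigned long bp_addr,
				      unsigned long probe_offset,
				      unsigned long file_offset)
	{
		unsigned long base = bp_addr - probe_offset;

		return base + file_offset;
	}
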
+ */ +static struct trace_uprobe * +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) +{ +	struct trace_uprobe *tu; + +	if (!event || !is_good_name(event)) +		return ERR_PTR(-EINVAL); + +	if (!group || !is_good_name(group)) +		return ERR_PTR(-EINVAL); + +	tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); +	if (!tu) +		return ERR_PTR(-ENOMEM); + +	tu->tp.call.class = &tu->tp.class; +	tu->tp.call.name = kstrdup(event, GFP_KERNEL); +	if (!tu->tp.call.name) +		goto error; + +	tu->tp.class.system = kstrdup(group, GFP_KERNEL); +	if (!tu->tp.class.system) +		goto error; + +	INIT_LIST_HEAD(&tu->list); +	INIT_LIST_HEAD(&tu->tp.files); +	tu->consumer.handler = uprobe_dispatcher; +	if (is_ret) +		tu->consumer.ret_handler = uretprobe_dispatcher; +	init_trace_uprobe_filter(&tu->filter); +	tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; +	return tu; + +error: +	kfree(tu->tp.call.name); +	kfree(tu); + +	return ERR_PTR(-ENOMEM); +} + +static void free_trace_uprobe(struct trace_uprobe *tu) +{ +	int i; + +	for (i = 0; i < tu->tp.nr_args; i++) +		traceprobe_free_probe_arg(&tu->tp.args[i]); + +	iput(tu->inode); +	kfree(tu->tp.call.class->system); +	kfree(tu->tp.call.name); +	kfree(tu->filename); +	kfree(tu); +} + +static struct trace_uprobe *find_probe_event(const char *event, const char *group) +{ +	struct trace_uprobe *tu; + +	list_for_each_entry(tu, &uprobe_list, list) +		if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 && +		    strcmp(tu->tp.call.class->system, group) == 0) +			return tu; + +	return NULL; +} + +/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ +static int unregister_trace_uprobe(struct trace_uprobe *tu) +{ +	int ret; + +	ret = unregister_uprobe_event(tu); +	if (ret) +		return ret; + +	list_del(&tu->list); +	free_trace_uprobe(tu); +	return 0; +} + +/* Register a trace_uprobe and probe_event */ +static int register_trace_uprobe(struct trace_uprobe *tu) +{ +	struct trace_uprobe *old_tu; +	int ret; + +	mutex_lock(&uprobe_lock); + +	/* register as an event */ +	old_tu = find_probe_event(ftrace_event_name(&tu->tp.call), +			tu->tp.call.class->system); +	if (old_tu) { +		/* delete old event */ +		ret = unregister_trace_uprobe(old_tu); +		if (ret) +			goto end; +	} + +	ret = register_uprobe_event(tu); +	if (ret) { +		pr_warning("Failed to register probe event(%d)\n", ret); +		goto end; +	} + +	list_add_tail(&tu->list, &uprobe_list); + +end: +	mutex_unlock(&uprobe_lock); + +	return ret; +} + +/* + * Argument syntax: + *  - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] + * + *  - Remove uprobe: -:[GRP/]EVENT + */ +static int create_trace_uprobe(int argc, char **argv) +{ +	struct trace_uprobe *tu; +	struct inode *inode; +	char *arg, *event, *group, *filename; +	char buf[MAX_EVENT_NAME_LEN]; +	struct path path; +	unsigned long offset; +	bool is_delete, is_return; +	int i, ret; + +	inode = NULL; +	ret = 0; +	is_delete = false; +	is_return = false; +	event = NULL; +	group = NULL; + +	/* argc must be >= 1 */ +	if (argv[0][0] == '-') +		is_delete = true; +	else if (argv[0][0] == 'r') +		is_return = true; +	else if (argv[0][0] != 'p') { +		pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); +		return -EINVAL; +	} + +	if (argv[0][1] == ':') { +		event = &argv[0][2]; +		arg = strchr(event, '/'); + +		if (arg) { +			group = event; +			event = arg + 1; +			event[-1] = '\0'; + +			if (strlen(group) == 0) { +				pr_info("Group name is not specified\n"); +				return -EINVAL; +			} +		} +		if (strlen(event) == 0) 
{ +			pr_info("Event name is not specified\n"); +			return -EINVAL; +		} +	} +	if (!group) +		group = UPROBE_EVENT_SYSTEM; + +	if (is_delete) { +		int ret; + +		if (!event) { +			pr_info("Delete command needs an event name.\n"); +			return -EINVAL; +		} +		mutex_lock(&uprobe_lock); +		tu = find_probe_event(event, group); + +		if (!tu) { +			mutex_unlock(&uprobe_lock); +			pr_info("Event %s/%s doesn't exist.\n", group, event); +			return -ENOENT; +		} +		/* delete an event */ +		ret = unregister_trace_uprobe(tu); +		mutex_unlock(&uprobe_lock); +		return ret; +	} + +	if (argc < 2) { +		pr_info("Probe point is not specified.\n"); +		return -EINVAL; +	} +	if (isdigit(argv[1][0])) { +		pr_info("probe point must be have a filename.\n"); +		return -EINVAL; +	} +	arg = strchr(argv[1], ':'); +	if (!arg) { +		ret = -EINVAL; +		goto fail_address_parse; +	} + +	*arg++ = '\0'; +	filename = argv[1]; +	ret = kern_path(filename, LOOKUP_FOLLOW, &path); +	if (ret) +		goto fail_address_parse; + +	inode = igrab(path.dentry->d_inode); +	path_put(&path); + +	if (!inode || !S_ISREG(inode->i_mode)) { +		ret = -EINVAL; +		goto fail_address_parse; +	} + +	ret = kstrtoul(arg, 0, &offset); +	if (ret) +		goto fail_address_parse; + +	argc -= 2; +	argv += 2; + +	/* setup a probe */ +	if (!event) { +		char *tail; +		char *ptr; + +		tail = kstrdup(kbasename(filename), GFP_KERNEL); +		if (!tail) { +			ret = -ENOMEM; +			goto fail_address_parse; +		} + +		ptr = strpbrk(tail, ".-_"); +		if (ptr) +			*ptr = '\0'; + +		snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); +		event = buf; +		kfree(tail); +	} + +	tu = alloc_trace_uprobe(group, event, argc, is_return); +	if (IS_ERR(tu)) { +		pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); +		ret = PTR_ERR(tu); +		goto fail_address_parse; +	} +	tu->offset = offset; +	tu->inode = inode; +	tu->filename = kstrdup(filename, GFP_KERNEL); + +	if (!tu->filename) { +		pr_info("Failed to allocate filename.\n"); +		ret = -ENOMEM; +		goto error; +	} + +	/* parse arguments */ +	ret = 0; +	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { +		struct probe_arg *parg = &tu->tp.args[i]; + +		/* Increment count for freeing args in error case */ +		tu->tp.nr_args++; + +		/* Parse argument name */ +		arg = strchr(argv[i], '='); +		if (arg) { +			*arg++ = '\0'; +			parg->name = kstrdup(argv[i], GFP_KERNEL); +		} else { +			arg = argv[i]; +			/* If argument name is omitted, set "argN" */ +			snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); +			parg->name = kstrdup(buf, GFP_KERNEL); +		} + +		if (!parg->name) { +			pr_info("Failed to allocate argument[%d] name.\n", i); +			ret = -ENOMEM; +			goto error; +		} + +		if (!is_good_name(parg->name)) { +			pr_info("Invalid argument[%d] name: %s\n", i, parg->name); +			ret = -EINVAL; +			goto error; +		} + +		if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { +			pr_info("Argument[%d] name '%s' conflicts with " +				"another field.\n", i, argv[i]); +			ret = -EINVAL; +			goto error; +		} + +		/* Parse fetch argument */ +		ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, +						 is_return, false); +		if (ret) { +			pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); +			goto error; +		} +	} + +	ret = register_trace_uprobe(tu); +	if (ret) +		goto error; +	return 0; + +error: +	free_trace_uprobe(tu); +	return ret; + +fail_address_parse: +	if (inode) +		iput(inode); + +	pr_info("Failed to parse address or file.\n"); + +	return ret; +} + +static int cleanup_all_probes(void) +{ +	struct trace_uprobe *tu; +	int ret = 0; + +	mutex_lock(&uprobe_lock); +	while (!list_empty(&uprobe_list)) { +		tu = list_entry(uprobe_list.next, struct trace_uprobe, list); +		ret = unregister_trace_uprobe(tu); +		if (ret) +			break; +	} +	mutex_unlock(&uprobe_lock); +	return ret; +} + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ +	mutex_lock(&uprobe_lock); +	return seq_list_start(&uprobe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ +	return seq_list_next(v, &uprobe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ +	mutex_unlock(&uprobe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ +	struct trace_uprobe *tu = v; +	char c = is_ret_probe(tu) ? 'r' : 'p'; +	int i; + +	seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, +			ftrace_event_name(&tu->tp.call)); +	seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + +	for (i = 0; i < tu->tp.nr_args; i++) +		seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); + +	seq_printf(m, "\n"); +	return 0; +} + +static const struct seq_operations probes_seq_op = { +	.start	= probes_seq_start, +	.next	= probes_seq_next, +	.stop	= probes_seq_stop, +	.show	= probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ +	int ret; + +	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { +		ret = cleanup_all_probes(); +		if (ret) +			return ret; +	} + +	return seq_open(file, &probes_seq_op); +} + +static ssize_t probes_write(struct file *file, const char __user *buffer, +			    size_t count, loff_t *ppos) +{ +	return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); +} + +static const struct file_operations uprobe_events_ops = { +	.owner		= THIS_MODULE, +	.open		= probes_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release, +	.write		= probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ +	struct trace_uprobe *tu = v; + +	seq_printf(m, "  %s %-44s %15lu\n", tu->filename, +			ftrace_event_name(&tu->tp.call), tu->nhit); +	return 0; +} + +static const struct seq_operations profile_seq_op = { +	.start	= probes_seq_start, +	.next	= probes_seq_next, +	.stop	= probes_seq_stop, +	.show	= probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ +	return seq_open(file, &profile_seq_op); +} + +static const struct file_operations uprobe_profile_ops = { +	.owner		= THIS_MODULE, +	.open		= profile_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release, +}; + +struct uprobe_cpu_buffer { +	struct mutex mutex; +	void *buf; +}; +static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; +static int uprobe_buffer_refcnt; + +static int uprobe_buffer_init(void) +{ +	int cpu, err_cpu; + +	uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); +	if (uprobe_cpu_buffer == NULL) +		return -ENOMEM; + +	for_each_possible_cpu(cpu) { +		struct page *p = alloc_pages_node(cpu_to_node(cpu), +						  GFP_KERNEL, 0); +		if (p == NULL) { +			err_cpu = cpu; +			goto err; 
+		} +		per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p); +		mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); +	} + +	return 0; + +err: +	for_each_possible_cpu(cpu) { +		if (cpu == err_cpu) +			break; +		free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); +	} + +	free_percpu(uprobe_cpu_buffer); +	return -ENOMEM; +} + +static int uprobe_buffer_enable(void) +{ +	int ret = 0; + +	BUG_ON(!mutex_is_locked(&event_mutex)); + +	if (uprobe_buffer_refcnt++ == 0) { +		ret = uprobe_buffer_init(); +		if (ret < 0) +			uprobe_buffer_refcnt--; +	} + +	return ret; +} + +static void uprobe_buffer_disable(void) +{ +	int cpu; + +	BUG_ON(!mutex_is_locked(&event_mutex)); + +	if (--uprobe_buffer_refcnt == 0) { +		for_each_possible_cpu(cpu) +			free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, +							     cpu)->buf); + +		free_percpu(uprobe_cpu_buffer); +		uprobe_cpu_buffer = NULL; +	} +} + +static struct uprobe_cpu_buffer *uprobe_buffer_get(void) +{ +	struct uprobe_cpu_buffer *ucb; +	int cpu; + +	cpu = raw_smp_processor_id(); +	ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); + +	/* +	 * Use per-cpu buffers for fastest access, but we might migrate +	 * so the mutex makes sure we have sole access to it. +	 */ +	mutex_lock(&ucb->mutex); + +	return ucb; +} + +static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) +{ +	mutex_unlock(&ucb->mutex); +} + +static void __uprobe_trace_func(struct trace_uprobe *tu, +				unsigned long func, struct pt_regs *regs, +				struct uprobe_cpu_buffer *ucb, int dsize, +				struct ftrace_event_file *ftrace_file) +{ +	struct uprobe_trace_entry_head *entry; +	struct ring_buffer_event *event; +	struct ring_buffer *buffer; +	void *data; +	int size, esize; +	struct ftrace_event_call *call = &tu->tp.call; + +	WARN_ON(call != ftrace_file->event_call); + +	if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) +		return; + +	if (ftrace_trigger_soft_disabled(ftrace_file)) +		return; + +	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); +	size = esize + tu->tp.size + dsize; +	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, +						call->event.type, size, 0, 0); +	if (!event) +		return; + +	entry = ring_buffer_event_data(event); +	if (is_ret_probe(tu)) { +		entry->vaddr[0] = func; +		entry->vaddr[1] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		entry->vaddr[0] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, false); +	} + +	memcpy(data, ucb->buf, tu->tp.size + dsize); + +	event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); +} + +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, +			     struct uprobe_cpu_buffer *ucb, int dsize) +{ +	struct event_file_link *link; + +	if (is_ret_probe(tu)) +		return 0; + +	rcu_read_lock(); +	list_for_each_entry_rcu(link, &tu->tp.files, list) +		__uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); +	rcu_read_unlock(); + +	return 0; +} + +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, +				 struct pt_regs *regs, +				 struct uprobe_cpu_buffer *ucb, int dsize) +{ +	struct event_file_link *link; + +	rcu_read_lock(); +	list_for_each_entry_rcu(link, &tu->tp.files, list) +		__uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); +	rcu_read_unlock(); +} + +/* Event entry printers */ +static enum print_line_t +print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) +{ +	struct uprobe_trace_entry_head *entry; +	struct trace_seq *s = 
&iter->seq; +	struct trace_uprobe *tu; +	u8 *data; +	int i; + +	entry = (struct uprobe_trace_entry_head *)iter->ent; +	tu = container_of(event, struct trace_uprobe, tp.call.event); + +	if (is_ret_probe(tu)) { +		if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", +					ftrace_event_name(&tu->tp.call), +					entry->vaddr[1], entry->vaddr[0])) +			goto partial; +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		if (!trace_seq_printf(s, "%s: (0x%lx)", +					ftrace_event_name(&tu->tp.call), +					entry->vaddr[0])) +			goto partial; +		data = DATAOF_TRACE_ENTRY(entry, false); +	} + +	for (i = 0; i < tu->tp.nr_args; i++) { +		struct probe_arg *parg = &tu->tp.args[i]; + +		if (!parg->type->print(s, parg->name, data + parg->offset, entry)) +			goto partial; +	} + +	if (trace_seq_puts(s, "\n")) +		return TRACE_TYPE_HANDLED; + +partial: +	return TRACE_TYPE_PARTIAL_LINE; +} + +typedef bool (*filter_func_t)(struct uprobe_consumer *self, +				enum uprobe_filter_ctx ctx, +				struct mm_struct *mm); + +static int +probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, +		   filter_func_t filter) +{ +	bool enabled = trace_probe_is_enabled(&tu->tp); +	struct event_file_link *link = NULL; +	int ret; + +	if (file) { +		if (tu->tp.flags & TP_FLAG_PROFILE) +			return -EINTR; + +		link = kmalloc(sizeof(*link), GFP_KERNEL); +		if (!link) +			return -ENOMEM; + +		link->file = file; +		list_add_tail_rcu(&link->list, &tu->tp.files); + +		tu->tp.flags |= TP_FLAG_TRACE; +	} else { +		if (tu->tp.flags & TP_FLAG_TRACE) +			return -EINTR; + +		tu->tp.flags |= TP_FLAG_PROFILE; +	} + +	WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + +	if (enabled) +		return 0; + +	ret = uprobe_buffer_enable(); +	if (ret) +		goto err_flags; + +	tu->consumer.filter = filter; +	ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); +	if (ret) +		goto err_buffer; + +	return 0; + + err_buffer: +	uprobe_buffer_disable(); + + err_flags: +	if (file) { +		list_del(&link->list); +		kfree(link); +		tu->tp.flags &= ~TP_FLAG_TRACE; +	} else { +		tu->tp.flags &= ~TP_FLAG_PROFILE; +	} +	return ret; +} + +static void +probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) +{ +	if (!trace_probe_is_enabled(&tu->tp)) +		return; + +	if (file) { +		struct event_file_link *link; + +		link = find_event_file_link(&tu->tp, file); +		if (!link) +			return; + +		list_del_rcu(&link->list); +		/* synchronize with u{,ret}probe_trace_func */ +		synchronize_sched(); +		kfree(link); + +		if (!list_empty(&tu->tp.files)) +			return; +	} + +	WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + +	uprobe_unregister(tu->inode, tu->offset, &tu->consumer); +	tu->tp.flags &= file ? 
~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; + +	uprobe_buffer_disable(); +} + +static int uprobe_event_define_fields(struct ftrace_event_call *event_call) +{ +	int ret, i, size; +	struct uprobe_trace_entry_head field; +	struct trace_uprobe *tu = event_call->data; + +	if (is_ret_probe(tu)) { +		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); +		DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); +		size = SIZEOF_TRACE_ENTRY(true); +	} else { +		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); +		size = SIZEOF_TRACE_ENTRY(false); +	} +	/* Set argument names as fields */ +	for (i = 0; i < tu->tp.nr_args; i++) { +		struct probe_arg *parg = &tu->tp.args[i]; + +		ret = trace_define_field(event_call, parg->type->fmttype, +					 parg->name, size + parg->offset, +					 parg->type->size, parg->type->is_signed, +					 FILTER_OTHER); + +		if (ret) +			return ret; +	} +	return 0; +} + +#ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ +	struct perf_event *event; + +	if (filter->nr_systemwide) +		return true; + +	list_for_each_entry(event, &filter->perf_events, hw.tp_list) { +		if (event->hw.tp_target->mm == mm) +			return true; +	} + +	return false; +} + +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ +	return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ +	bool done; + +	write_lock(&tu->filter.rwlock); +	if (event->hw.tp_target) { +		list_del(&event->hw.tp_list); +		done = tu->filter.nr_systemwide || +			(event->hw.tp_target->flags & PF_EXITING) || +			uprobe_filter_event(tu, event); +	} else { +		tu->filter.nr_systemwide--; +		done = tu->filter.nr_systemwide; +	} +	write_unlock(&tu->filter.rwlock); + +	if (!done) +		return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + +	return 0; +} + +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ +	bool done; +	int err; + +	write_lock(&tu->filter.rwlock); +	if (event->hw.tp_target) { +		/* +		 * event->parent != NULL means copy_process(), we can avoid +		 * uprobe_apply(). current->mm must be probed and we can rely +		 * on dup_mmap() which preserves the already installed bp's. +		 * +		 * attr.enable_on_exec means that exec/mmap will install the +		 * breakpoints we need. 
+		 */ +		done = tu->filter.nr_systemwide || +			event->parent || event->attr.enable_on_exec || +			uprobe_filter_event(tu, event); +		list_add(&event->hw.tp_list, &tu->filter.perf_events); +	} else { +		done = tu->filter.nr_systemwide; +		tu->filter.nr_systemwide++; +	} +	write_unlock(&tu->filter.rwlock); + +	err = 0; +	if (!done) { +		err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); +		if (err) +			uprobe_perf_close(tu, event); +	} +	return err; +} + +static bool uprobe_perf_filter(struct uprobe_consumer *uc, +				enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ +	struct trace_uprobe *tu; +	int ret; + +	tu = container_of(uc, struct trace_uprobe, consumer); +	read_lock(&tu->filter.rwlock); +	ret = __uprobe_perf_filter(&tu->filter, mm); +	read_unlock(&tu->filter.rwlock); + +	return ret; +} + +static void __uprobe_perf_func(struct trace_uprobe *tu, +			       unsigned long func, struct pt_regs *regs, +			       struct uprobe_cpu_buffer *ucb, int dsize) +{ +	struct ftrace_event_call *call = &tu->tp.call; +	struct uprobe_trace_entry_head *entry; +	struct hlist_head *head; +	void *data; +	int size, esize; +	int rctx; + +	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + +	size = esize + tu->tp.size + dsize; +	size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); +	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) +		return; + +	preempt_disable(); +	head = this_cpu_ptr(call->perf_events); +	if (hlist_empty(head)) +		goto out; + +	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); +	if (!entry) +		goto out; + +	if (is_ret_probe(tu)) { +		entry->vaddr[0] = func; +		entry->vaddr[1] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, true); +	} else { +		entry->vaddr[0] = instruction_pointer(regs); +		data = DATAOF_TRACE_ENTRY(entry, false); +	} + +	memcpy(data, ucb->buf, tu->tp.size + dsize); + +	if (size - esize > tu->tp.size + dsize) { +		int len = tu->tp.size + dsize; + +		memset(data + len, 0, size - esize - len); +	} + +	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + out: +	preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, +			    struct uprobe_cpu_buffer *ucb, int dsize) +{ +	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) +		return UPROBE_HANDLER_REMOVE; + +	if (!is_ret_probe(tu)) +		__uprobe_perf_func(tu, 0, regs, ucb, dsize); +	return 0; +} + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, +				struct pt_regs *regs, +				struct uprobe_cpu_buffer *ucb, int dsize) +{ +	__uprobe_perf_func(tu, func, regs, ucb, dsize); +} +#endif	/* CONFIG_PERF_EVENTS */ + +static int +trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, +		      void *data) +{ +	struct trace_uprobe *tu = event->data; +	struct ftrace_event_file *file = data; + +	switch (type) { +	case TRACE_REG_REGISTER: +		return probe_event_enable(tu, file, NULL); + +	case TRACE_REG_UNREGISTER: +		probe_event_disable(tu, file); +		return 0; + +#ifdef CONFIG_PERF_EVENTS +	case TRACE_REG_PERF_REGISTER: +		return probe_event_enable(tu, NULL, uprobe_perf_filter); + +	case TRACE_REG_PERF_UNREGISTER: +		probe_event_disable(tu, NULL); +		return 0; + +	case TRACE_REG_PERF_OPEN: +		return uprobe_perf_open(tu, data); + +	case TRACE_REG_PERF_CLOSE: +		return uprobe_perf_close(tu, data); + +#endif +	default: +		return 0; +	} +	return 0; +} + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs 
*regs) +{ +	struct trace_uprobe *tu; +	struct uprobe_dispatch_data udd; +	struct uprobe_cpu_buffer *ucb; +	int dsize, esize; +	int ret = 0; + + +	tu = container_of(con, struct trace_uprobe, consumer); +	tu->nhit++; + +	udd.tu = tu; +	udd.bp_addr = instruction_pointer(regs); + +	current->utask->vaddr = (unsigned long) &udd; + +	if (WARN_ON_ONCE(!uprobe_cpu_buffer)) +		return 0; + +	dsize = __get_data_size(&tu->tp, regs); +	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + +	ucb = uprobe_buffer_get(); +	store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + +	if (tu->tp.flags & TP_FLAG_TRACE) +		ret |= uprobe_trace_func(tu, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS +	if (tu->tp.flags & TP_FLAG_PROFILE) +		ret |= uprobe_perf_func(tu, regs, ucb, dsize); +#endif +	uprobe_buffer_put(ucb); +	return ret; +} + +static int uretprobe_dispatcher(struct uprobe_consumer *con, +				unsigned long func, struct pt_regs *regs) +{ +	struct trace_uprobe *tu; +	struct uprobe_dispatch_data udd; +	struct uprobe_cpu_buffer *ucb; +	int dsize, esize; + +	tu = container_of(con, struct trace_uprobe, consumer); + +	udd.tu = tu; +	udd.bp_addr = func; + +	current->utask->vaddr = (unsigned long) &udd; + +	if (WARN_ON_ONCE(!uprobe_cpu_buffer)) +		return 0; + +	dsize = __get_data_size(&tu->tp, regs); +	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + +	ucb = uprobe_buffer_get(); +	store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + +	if (tu->tp.flags & TP_FLAG_TRACE) +		uretprobe_trace_func(tu, func, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS +	if (tu->tp.flags & TP_FLAG_PROFILE) +		uretprobe_perf_func(tu, func, regs, ucb, dsize); +#endif +	uprobe_buffer_put(ucb); +	return 0; +} + +static struct trace_event_functions uprobe_funcs = { +	.trace		= print_uprobe_event +}; + +static int register_uprobe_event(struct trace_uprobe *tu) +{ +	struct ftrace_event_call *call = &tu->tp.call; +	int ret; + +	/* Initialize ftrace_event_call */ +	INIT_LIST_HEAD(&call->class->fields); +	call->event.funcs = &uprobe_funcs; +	call->class->define_fields = uprobe_event_define_fields; + +	if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) +		return -ENOMEM; + +	ret = register_ftrace_event(&call->event); +	if (!ret) { +		kfree(call->print_fmt); +		return -ENODEV; +	} +	call->flags = 0; +	call->class->reg = trace_uprobe_register; +	call->data = tu; +	ret = trace_add_event_call(call); + +	if (ret) { +		pr_info("Failed to register uprobe event: %s\n", +			ftrace_event_name(call)); +		kfree(call->print_fmt); +		unregister_ftrace_event(&call->event); +	} + +	return ret; +} + +static int unregister_uprobe_event(struct trace_uprobe *tu) +{ +	int ret; + +	/* tu->event is unregistered in trace_remove_event_call() */ +	ret = trace_remove_event_call(&tu->tp.call); +	if (ret) +		return ret; +	kfree(tu->tp.call.print_fmt); +	tu->tp.call.print_fmt = NULL; +	return 0; +} + +/* Make a trace interface for controling probe points */ +static __init int init_uprobe_trace(void) +{ +	struct dentry *d_tracer; + +	d_tracer = tracing_init_dentry(); +	if (!d_tracer) +		return 0; + +	trace_create_file("uprobe_events", 0644, d_tracer, +				    NULL, &uprobe_events_ops); +	/* Profile interface */ +	trace_create_file("uprobe_profile", 0444, d_tracer, +				    NULL, &uprobe_profile_ops); +	return 0; +} + +fs_initcall(init_uprobe_trace); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a472..00000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Workqueue 
statistical tracer. - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - - -#include <trace/events/workqueue.h> -#include <linux/list.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/kref.h> -#include "trace_stat.h" -#include "trace.h" - - -/* A cpu workqueue thread */ -struct cpu_workqueue_stats { -	struct list_head            list; -	struct kref                 kref; -	int		            cpu; -	pid_t			    pid; -/* Can be inserted from interrupt or user context, need to be atomic */ -	atomic_t	            inserted; -/* - *  Don't need to be atomic, works are serialized in a single workqueue thread - *  on a single CPU. - */ -	unsigned int		    executed; -}; - -/* List of workqueue threads on one cpu */ -struct workqueue_global_stats { -	struct list_head	list; -	spinlock_t		lock; -}; - -/* Don't need a global lock because allocated before the workqueues, and - * never freed. - */ -static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); -#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) - -static void cpu_workqueue_stat_free(struct kref *kref) -{ -	kfree(container_of(kref, struct cpu_workqueue_stats, kref)); -} - -/* Insertion of a work */ -static void -probe_workqueue_insertion(void *ignore, -			  struct task_struct *wq_thread, -			  struct work_struct *work) -{ -	int cpu = cpumask_first(&wq_thread->cpus_allowed); -	struct cpu_workqueue_stats *node; -	unsigned long flags; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { -		if (node->pid == wq_thread->pid) { -			atomic_inc(&node->inserted); -			goto found; -		} -	} -	pr_debug("trace_workqueue: entry not found\n"); -found: -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Execution of a work */ -static void -probe_workqueue_execution(void *ignore, -			  struct task_struct *wq_thread, -			  struct work_struct *work) -{ -	int cpu = cpumask_first(&wq_thread->cpus_allowed); -	struct cpu_workqueue_stats *node; -	unsigned long flags; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { -		if (node->pid == wq_thread->pid) { -			node->executed++; -			goto found; -		} -	} -	pr_debug("trace_workqueue: entry not found\n"); -found: -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(void *ignore, -				     struct task_struct *wq_thread, int cpu) -{ -	struct cpu_workqueue_stats *cws; -	unsigned long flags; - -	WARN_ON(cpu < 0); - -	/* Workqueues are sometimes created in atomic context */ -	cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); -	if (!cws) { -		pr_warning("trace_workqueue: not enough memory\n"); -		return; -	} -	INIT_LIST_HEAD(&cws->list); -	kref_init(&cws->kref); -	cws->cpu = cpu; -	cws->pid = wq_thread->pid; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Destruction of a cpu workqueue thread */ -static void -probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) -{ -	/* Workqueue only execute on one cpu */ -	int cpu = cpumask_first(&wq_thread->cpus_allowed); -	struct cpu_workqueue_stats *node, *next; -	unsigned long flags; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	
list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, -							list) { -		if (node->pid == wq_thread->pid) { -			list_del(&node->list); -			kref_put(&node->kref, cpu_workqueue_stat_free); -			goto found; -		} -	} - -	pr_debug("trace_workqueue: don't find workqueue to destroy\n"); -found: -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -} - -static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) -{ -	unsigned long flags; -	struct cpu_workqueue_stats *ret = NULL; - - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - -	if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { -		ret = list_entry(workqueue_cpu_stat(cpu)->list.next, -				 struct cpu_workqueue_stats, list); -		kref_get(&ret->kref); -	} - -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -	return ret; -} - -static void *workqueue_stat_start(struct tracer_stat *trace) -{ -	int cpu; -	void *ret = NULL; - -	for_each_possible_cpu(cpu) { -		ret = workqueue_stat_start_cpu(cpu); -		if (ret) -			return ret; -	} -	return NULL; -} - -static void *workqueue_stat_next(void *prev, int idx) -{ -	struct cpu_workqueue_stats *prev_cws = prev; -	struct cpu_workqueue_stats *ret; -	int cpu = prev_cws->cpu; -	unsigned long flags; - -	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); -	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { -		spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -		do { -			cpu = cpumask_next(cpu, cpu_possible_mask); -			if (cpu >= nr_cpu_ids) -				return NULL; -		} while (!(ret = workqueue_stat_start_cpu(cpu))); -		return ret; -	} else { -		ret = list_entry(prev_cws->list.next, -				 struct cpu_workqueue_stats, list); -		kref_get(&ret->kref); -	} -	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -	return ret; -} - -static int workqueue_stat_show(struct seq_file *s, void *p) -{ -	struct cpu_workqueue_stats *cws = p; -	struct pid *pid; -	struct task_struct *tsk; - -	pid = find_get_pid(cws->pid); -	if (pid) { -		tsk = get_pid_task(pid, PIDTYPE_PID); -		if (tsk) { -			seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu, -				   atomic_read(&cws->inserted), cws->executed, -				   tsk->comm); -			put_task_struct(tsk); -		} -		put_pid(pid); -	} - -	return 0; -} - -static void workqueue_stat_release(void *stat) -{ -	struct cpu_workqueue_stats *node = stat; - -	kref_put(&node->kref, cpu_workqueue_stat_free); -} - -static int workqueue_stat_headers(struct seq_file *s) -{ -	seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n"); -	seq_printf(s, "# |      |         |          |\n"); -	return 0; -} - -struct tracer_stat workqueue_stats __read_mostly = { -	.name = "workqueues", -	.stat_start = workqueue_stat_start, -	.stat_next = workqueue_stat_next, -	.stat_show = workqueue_stat_show, -	.stat_release = workqueue_stat_release, -	.stat_headers = workqueue_stat_headers -}; - - -int __init stat_workqueue_init(void) -{ -	if (register_stat_tracer(&workqueue_stats)) { -		pr_warning("Unable to register workqueue stat tracer\n"); -		return 1; -	} - -	return 0; -} -fs_initcall(stat_workqueue_init); - -/* - * Workqueues are created very early, just after pre-smp initcalls. - * So we must register our tracepoints at this stage. 
- */ -int __init trace_workqueue_early_init(void) -{ -	int ret, cpu; - -	for_each_possible_cpu(cpu) { -		spin_lock_init(&workqueue_cpu_stat(cpu)->lock); -		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); -	} - -	ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); -	if (ret) -		goto out; - -	ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); -	if (ret) -		goto no_insertion; - -	ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); -	if (ret) -		goto no_execution; - -	ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); -	if (ret) -		goto no_creation; - -	return 0; - -no_creation: -	unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); -no_execution: -	unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); -no_insertion: -	unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); -out: -	pr_warning("trace_workqueue: unable to trace workqueues\n"); - -	return 1; -} -early_initcall(trace_workqueue_early_init);  | 
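The uprobe_events file created by init_uprobe_trace() above is the control interface for these probes: writes go through probes_write() and create_trace_uprobe(), and probes_seq_show() lists each registered probe back as "p:GROUP/EVENT FILE:0xOFFSET [ARGS]" (or "r:" for a return probe). A minimal user-space sketch of adding one probe follows; the event name, binary path and offset are hypothetical placeholders, and the tracing directory is assumed to be the usual /sys/kernel/debug/tracing mount:

	/*
	 * Sketch only: append one hypothetical uprobe definition to the
	 * uprobe_events control file.  The /bin/bash path and 0x4245c0
	 * offset are made-up examples; a real offset is the file offset
	 * of the instruction to probe inside the named binary.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Same shape that probes_seq_show() prints back. */
		const char *def = "p:uprobes/my_probe /bin/bash:0x4245c0\n";
		int fd = open("/sys/kernel/debug/tracing/uprobe_events",
			      O_WRONLY | O_APPEND);

		if (fd < 0) {
			perror("open uprobe_events");
			return 1;
		}
		if (write(fd, def, strlen(def)) < 0)
			perror("write probe definition");
		close(fd);
		return 0;
	}

Note that opening uprobe_events for writing with O_TRUNC (a truncating shell redirection, for example) is routed by probes_open() through cleanup_all_probes(), which unregisters every existing uprobe event before any new definition is parsed.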

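The per-CPU argument buffers used by the dispatchers above (uprobe_buffer_init/enable/disable and uprobe_buffer_get/put) follow a simple pattern: one page per possible CPU, a reference count taken under event_mutex so the pages exist only while at least one event is enabled, and a per-buffer mutex instead of disabling preemption, because the handler filling the buffer may block and the task may migrate. The stand-alone sketch below models that scheme in plain pthreads C; it illustrates the refcount-plus-mutex lifetime only, is not kernel code, and NCPUS and BUF_SIZE are arbitrary stand-ins for num_possible_cpus() and PAGE_SIZE.

	/* Illustrative model of the uprobe_cpu_buffer lifetime, not kernel code. */
	#include <pthread.h>
	#include <stdlib.h>
	#include <string.h>

	#define NCPUS    4
	#define BUF_SIZE 4096

	struct cpu_buffer {
		pthread_mutex_t mutex;	/* serializes users that migrated onto this slot */
		void *buf;
	};

	static struct cpu_buffer *pool;
	static char *slab;
	static int pool_refcnt;
	/* plays the role the kernel code gives to event_mutex */
	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

	static int pool_enable(void)
	{
		int ret = 0;

		pthread_mutex_lock(&pool_lock);
		if (pool_refcnt++ == 0) {
			pool = calloc(NCPUS, sizeof(*pool));
			slab = malloc((size_t)NCPUS * BUF_SIZE);
			if (!pool || !slab) {
				free(pool);
				free(slab);
				pool = NULL;
				slab = NULL;
				pool_refcnt--;	/* undo, as uprobe_buffer_enable() does on error */
				ret = -1;
			} else {
				for (int i = 0; i < NCPUS; i++) {
					pool[i].buf = slab + (size_t)i * BUF_SIZE;
					pthread_mutex_init(&pool[i].mutex, NULL);
				}
			}
		}
		pthread_mutex_unlock(&pool_lock);
		return ret;
	}

	static void pool_disable(void)
	{
		pthread_mutex_lock(&pool_lock);
		if (--pool_refcnt == 0) {	/* last user frees the buffers */
			free(slab);
			free(pool);
			slab = NULL;
			pool = NULL;
		}
		pthread_mutex_unlock(&pool_lock);
	}

	/* get/put: hold the slot's mutex for the duration of one event */
	static struct cpu_buffer *pool_get(int cpu)
	{
		struct cpu_buffer *b = &pool[cpu % NCPUS];

		pthread_mutex_lock(&b->mutex);
		return b;
	}

	static void pool_put(struct cpu_buffer *b)
	{
		pthread_mutex_unlock(&b->mutex);
	}

	int main(void)
	{
		if (pool_enable())
			return 1;

		struct cpu_buffer *b = pool_get(0);
		memset(b->buf, 0, BUF_SIZE);	/* stands in for store_trace_args() */
		pool_put(b);

		pool_disable();
		return 0;
	}

Holding a mutex per slot, as uprobe_buffer_get() does, keeps the buffer usable even if the task migrates after picking its CPU, at the cost of occasional contention when two tasks land on the same slot.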