diff options
Diffstat (limited to 'kernel/trace')
39 files changed, 14911 insertions, 5009 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cd3134510f3..d4409356f40 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE help See Documentation/trace/ftrace-design.txt +config HAVE_DYNAMIC_FTRACE_WITH_REGS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -49,6 +52,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_FENTRY + bool + help + Arch supports the gcc options -pg with -mfentry + config HAVE_C_RECORDMCOUNT bool help @@ -57,8 +65,13 @@ config HAVE_C_RECORDMCOUNT config TRACER_MAX_TRACE bool +config TRACE_CLOCK + bool + config RING_BUFFER bool + select TRACE_CLOCK + select IRQ_WORK config FTRACE_NMI_ENTER bool @@ -69,21 +82,6 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool -config EVENT_POWER_TRACING_DEPRECATED - depends on EVENT_TRACING - bool "Deprecated power event trace API, to be removed" - default y - help - Provides old power event types: - C-state/idle accounting events: - power:power_start - power:power_end - and old cpufreq accounting event: - power:power_frequency - This is for userspace compatibility - and will vanish after 5 kernel iterations, - namely 3.1. - config CONTEXT_SWITCH_TRACER bool @@ -109,6 +107,7 @@ config TRACING select NOP_TRACER select BINARY_PRINTF select EVENT_TRACING + select TRACE_CLOCK config GENERIC_TRACER bool @@ -141,7 +140,6 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER @@ -178,6 +176,8 @@ config IRQSOFF_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in irqs-off critical sections, with microsecond accuracy. @@ -200,6 +200,8 @@ config PREEMPT_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in preemption-off critical sections, with microsecond accuracy. @@ -219,6 +221,7 @@ config SCHED_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE + select TRACER_SNAPSHOT help This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. @@ -240,6 +243,37 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. +config TRACER_SNAPSHOT + bool "Create a snapshot trace buffer" + select TRACER_MAX_TRACE + help + Allow tracing users to take snapshot of the current buffer using the + ftrace interface, e.g.: + + echo 1 > /sys/kernel/debug/tracing/snapshot + cat snapshot + +config TRACER_SNAPSHOT_PER_CPU_SWAP + bool "Allow snapshot to swap per CPU" + depends on TRACER_SNAPSHOT + select RING_BUFFER_ALLOW_SWAP + help + Allow doing a snapshot of a single CPU buffer instead of a + full swap (all buffers). If this is set, then the following is + allowed: + + echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + + After which, only the tracing buffer for CPU 2 was swapped with + the main tracing buffer, and the other CPU buffers remain the same. + + When this is enabled, this adds a little more overhead to the + trace recording, as it needs to add some checks to synchronize + recording with swaps. But this does not affect the performance + of the overall system. This is enabled by default when the preempt + or irq latency tracers are enabled, as those need to swap as well + and already adds the overhead (plus a lot more). + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -272,7 +306,7 @@ config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" select TRACE_BRANCH_PROFILING help - This tracer profiles all the the likely and unlikely macros + This tracer profiles all likely and unlikely macros in the kernel. It will display the results in: /sys/kernel/debug/tracing/trace_stat/branch_annotated @@ -373,6 +407,7 @@ config KPROBE_EVENT depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" select TRACING + select PROBE_EVENTS default y help This allows the user to add tracing events (similar to tracepoints) @@ -385,24 +420,53 @@ config KPROBE_EVENT This option is also required by perf-probe subcommand of perf tools. If you want to use perf tools, this option is strongly recommended. +config UPROBE_EVENT + bool "Enable uprobes-based dynamic events" + depends on ARCH_SUPPORTS_UPROBES + depends on MMU + depends on PERF_EVENTS + select UPROBES + select PROBE_EVENTS + select TRACING + default n + help + This allows the user to add tracing events on top of userspace + dynamic events (similar to tracepoints) on the fly via the trace + events interface. Those events can be inserted wherever uprobes + can probe, and record various registers. + This option is required if you plan to use perf-probe subcommand + of perf tools on user space applications. + +config PROBE_EVENTS + def_bool n + config DYNAMIC_FTRACE - bool "enable/disable ftrace tracepoints dynamically" + bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replace them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to function tracing + dynamically (will patch them out of the binary image and + replace them with a No-Op instruction) on boot up. During + compile time, a table is made of all the locations that ftrace + can function trace, and this table is linked into the kernel + image. When this is enabled, functions can be individually + enabled, and the functions not enabled will not affect + performance of the system. + + See the files in /sys/kernel/debug/tracing: + available_filter_functions + set_ftrace_filter + set_ftrace_notrace This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. +config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS config FUNCTION_PROFILER bool "Kernel function profiler" @@ -471,6 +535,36 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config TRACEPOINT_BENCHMARK + bool "Add tracepoint that benchmarks tracepoints" + help + This option creates the tracepoint "benchmark:benchmark_event". + When the tracepoint is enabled, it kicks off a kernel thread that + goes into an infinite loop (calling cond_sched() to let other tasks + run), and calls the tracepoint. Each iteration will record the time + it took to write to the tracepoint and the next iteration that + data will be passed to the tracepoint itself. That is, the tracepoint + will report the time it took to do the previous tracepoint. + The string written to the tracepoint is a static string of 128 bytes + to keep the time the same. The initial string is simply a write of + "START". The second string records the cold cache time of the first + write which is not added to the rest of the calculations. + + As it is a tight loop, it benchmarks as hot cache. That's fine because + we care most about hot paths that are probably in cache already. + + An example of the output: + + START + first=3672 [COLD CACHED] + last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 + last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 + last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 + last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 + last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 + last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + + config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER @@ -487,6 +581,29 @@ config RING_BUFFER_BENCHMARK If unsure, say N. +config RING_BUFFER_STARTUP_TEST + bool "Ring buffer startup self test" + depends on RING_BUFFER + help + Run a simple self test on the ring buffer on boot up. Late in the + kernel boot sequence, the test will start that kicks off + a thread per cpu. Each thread will write various size events + into the ring buffer. Another thread is created to send IPIs + to each of the threads, where the IPI handler will also write + to the ring buffer, to test/stress the nesting ability. + If any anomalies are discovered, a warning will be displayed + and all ring buffers will be disabled. + + The test runs for 10 seconds. This will slow your boot time + by at least 10 more seconds. + + At the end of the test, statics and more checks are done. + It will output the stats of each per cpu buffer. What + was written, the sizes, what was read, what was lost, and + other similar details. + + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5e..2611613f14f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -5,23 +5,22 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +endif # If unlikely tracing is enabled, do not trace these files ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) -# -# Make the trace clocks available generally: it's infrastructure -# relied on by ptrace for example: -# -obj-y += trace_clock.o +obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o @@ -41,7 +40,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o @@ -53,6 +51,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM_RUNTIME),y) @@ -61,5 +60,9 @@ endif ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif +obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o +obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o + +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cdea7b56b0c..c1bd4ada2a0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> +#include <linux/list.h> #include <trace/events/block.h> @@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); + /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -72,7 +76,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; if (blk_tracer) { - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len, @@ -107,10 +111,18 @@ record_it: * Send out a notify for this process, if we haven't done so since a trace * started */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +static void trace_note_tsk(struct task_struct *tsk) { + unsigned long flags; + struct blk_trace *bt; + tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); + spin_lock_irqsave(&running_trace_lock, flags); + list_for_each_entry(bt, &running_trace_list, running_list) { + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, + sizeof(tsk->comm)); + } + spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -147,7 +159,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) return; local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); @@ -218,7 +230,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len, @@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, goto record_it; } + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(tsk); + /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. */ local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -311,13 +322,6 @@ int blk_trace_remove(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_trace_remove); -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { @@ -331,18 +335,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, - .open = blk_dropped_open, + .open = simple_open, .read = blk_dropped_read, .llseek = default_llseek, }; -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, size_t count, loff_t *ppos) { @@ -371,7 +368,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, - .open = blk_msg_open, + .open = simple_open, .write = blk_msg_write, .llseek = noop_llseek, }; @@ -491,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); + INIT_LIST_HEAD(&bt->running_list); ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, @@ -581,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; - memcpy(&buts.name, &cbuts.name, 32); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; - if (copy_to_user(arg, &buts.name, 32)) { + if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); return -EFAULT; } @@ -615,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; + spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -622,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -699,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q) * blk_add_trace_rq - Add a trace for a request oriented action * @q: queue the io is for * @rq: the source request + * @nr_bytes: number of completed bytes * @what: the action * * Description: @@ -706,7 +710,7 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + unsigned int nr_bytes, u32 what) { struct blk_trace *bt = q->blk_trace; @@ -715,11 +719,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, + __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -727,33 +731,34 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, static void blk_add_trace_rq_abort(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ABORT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_INSERT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); } static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, - struct request *rq) + struct request *rq, + unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); + blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); } /** @@ -778,8 +783,8 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (!error && !bio_flagged(bio, BIO_UPTODATE)) error = EIO; - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - error, 0, NULL); + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -797,6 +802,7 @@ static void blk_add_trace_bio_complete(void *ignore, static void blk_add_trace_bio_backmerge(void *ignore, struct request_queue *q, + struct request *rq, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); @@ -804,6 +810,7 @@ static void blk_add_trace_bio_backmerge(void *ignore, static void blk_add_trace_bio_frontmerge(void *ignore, struct request_queue *q, + struct request *rq, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); @@ -880,8 +887,9 @@ static void blk_add_trace_split(void *ignore, if (bt) { __be64 rpdu = cpu_to_be64(pdu); - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + __blk_add_trace(bt, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, + !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); } } @@ -913,9 +921,9 @@ static void blk_add_trace_bio_remap(void *ignore, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), - sizeof(r), &r); + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); } /** @@ -1421,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } -static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +static int +blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { @@ -1484,6 +1493,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); + spin_lock_irq(&running_trace_lock); + list_del(&bt->running_list); + spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } @@ -1820,6 +1832,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i] = '\0'; } +EXPORT_SYMBOL_GPL(blk_fill_rwbs); #endif /* CONFIG_EVENT_TRACING */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 683d559a0ee..ac9d1dad630 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -10,7 +10,7 @@ * Based on code in the latency_tracer, that is: * * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/stop_machine.h> @@ -62,12 +62,31 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) + +#ifdef CONFIG_DYNAMIC_FTRACE +#define INIT_REGEX_LOCK(opsname) \ + .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), +#else +#define INIT_REGEX_LOCK(opsname) +#endif + +static struct ftrace_ops ftrace_list_end __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, +}; + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* Quick disabling of function tracer. */ -int function_trace_stop; +int function_trace_stop __read_mostly; + +/* Current function tracing op */ +struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; +/* What to set function_trace_op to */ +static struct ftrace_ops *set_function_trace_op; /* List for set_ftrace_pid's pids. */ LIST_HEAD(ftrace_pids); @@ -84,53 +103,80 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, -}; - -static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; -ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); +#else +/* See comment below, where ftrace_ops_list_func is defined */ +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); +#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +#endif /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list + * can use rcu_dereference_raw_notrace() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle + * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ -static void ftrace_global_list_func(unsigned long ip, - unsigned long parent_ip) +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw_notrace(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + +static inline void ftrace_ops_init(struct ftrace_ops *ops) { - struct ftrace_ops *op; +#ifdef CONFIG_DYNAMIC_FTRACE + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { + mutex_init(&ops->regex_lock); + ops->flags |= FTRACE_OPS_FL_INITIALIZED; + } +#endif +} - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) - return; +/** + * ftrace_nr_registered_ops - return number of ops registered + * + * Returns the number of ftrace_ops registered and tracing functions + */ +int ftrace_nr_registered_ops(void) +{ + struct ftrace_ops *ops; + int cnt = 0; + + mutex_lock(&ftrace_lock); - trace_recursion_set(TRACE_GLOBAL_BIT); - op = rcu_dereference_raw(ftrace_global_list); /*see above*/ - while (op != &ftrace_list_end) { - op->func(ip, parent_ip); - op = rcu_dereference_raw(op->next); /*see above*/ - }; - trace_recursion_clear(TRACE_GLOBAL_BIT); + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) + cnt++; + + mutex_unlock(&ftrace_lock); + + return cnt; } -static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) { if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip); + ftrace_pid_function(ip, parent_ip, op, regs); } static void set_ftrace_pid_function(ftrace_func_t func) @@ -149,78 +195,127 @@ static void set_ftrace_pid_function(ftrace_func_t func) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; - __ftrace_trace_function = ftrace_stub; - __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -/* - * For those archs that do not test ftrace_trace_stop in their - * mcount call site, we need to do it from C. - */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +static void control_ops_disable_all(struct ftrace_ops *ops) { - if (function_trace_stop) - return; + int cpu; - __ftrace_trace_function(ip, parent_ip); + for_each_possible_cpu(cpu) + *per_cpu_ptr(ops->disabled, cpu) = 1; } -#endif -static void update_global_ops(void) +static int control_ops_alloc(struct ftrace_ops *ops) { - ftrace_func_t func; + int __percpu *disabled; + disabled = alloc_percpu(int); + if (!disabled) + return -ENOMEM; + + ops->disabled = disabled; + control_ops_disable_all(ops); + return 0; +} + +static void ftrace_sync(struct work_struct *work) +{ /* - * If there's only one function registered, then call that - * function directly. Otherwise, we need to iterate over the - * registered callers. + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. */ - if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) - func = ftrace_global_list->func; - else - func = ftrace_global_list_func; - - /* If we filter on pids, update to use the pid function */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } +} - global_ops.func = func; +static void ftrace_sync_ipi(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void update_function_graph_func(void); +#else +static inline void update_function_graph_func(void) { } +#endif + static void update_ftrace_function(void) { ftrace_func_t func; - update_global_ops(); - /* * If we are at the end of the list and this ops is - * not dynamic, then have the mcount trampoline call - * the function directly + * recursion safe and not dynamic and the arch supports passing ops, + * then have the mcount trampoline call the function directly. */ if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && + (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && + !FTRACE_FORCE_LIST_FUNC)) { + /* Set the ftrace_ops that the arch callback uses */ + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; - else + } else { + /* Just use the default ftrace_ops */ + set_function_trace_op = &ftrace_list_end; func = ftrace_ops_list_func; + } + + update_function_graph_func(); + + /* If there's no change, then do nothing more here */ + if (ftrace_trace_function == func) + return; + + /* + * If we are using the list function, it doesn't care + * about the function_trace_ops. + */ + if (func == ftrace_ops_list_func) { + ftrace_trace_function = func; + /* + * Don't even bother setting function_trace_ops, + * it would be racy to do so anyway. + */ + return; + } + +#ifndef CONFIG_DYNAMIC_FTRACE + /* + * For static tracing, we need to be a bit more careful. + * The function change takes affect immediately. Thus, + * we need to coorditate the setting of the function_trace_ops + * with the setting of the ftrace_trace_function. + * + * Set the function to the list ops, which will call the + * function we want, albeit indirectly, but it handles the + * ftrace_ops and doesn't depend on function_trace_op. + */ + ftrace_trace_function = ftrace_ops_list_func; + /* + * Make sure all CPUs see this. Yes this is slow, but static + * tracing is slow and nasty to have enabled. + */ + schedule_on_each_cpu(ftrace_sync); + /* Now all cpus are using the list ops. */ + function_trace_op = set_function_trace_op; + /* Make sure the function_trace_op is visible on all CPUs */ + smp_wmb(); + /* Nasty way to force a rmb on all cpus */ + smp_call_function(ftrace_sync_ipi, NULL, 1); + /* OK, we are all set to update the ftrace_trace_function now! */ +#endif /* !CONFIG_DYNAMIC_FTRACE */ -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; -#else -#ifdef CONFIG_DYNAMIC_FTRACE - /* do not update till all functions have been modified */ - __ftrace_trace_function_delay = func; -#else - __ftrace_trace_function = func; -#endif - ftrace_trace_function = ftrace_test_stop_func; -#endif +} + +int using_ftrace_ops_list_func(void) +{ + return ftrace_trace_function == ftrace_ops_list_func; } static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) @@ -259,26 +354,55 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } -static int __register_ftrace_function(struct ftrace_ops *ops) +static void add_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) { - if (ftrace_disabled) - return -ENODEV; + int first = *list == &ftrace_list_end; + add_ftrace_ops(list, ops); + if (first) + add_ftrace_ops(&ftrace_ops_list, main_ops); +} + +static int remove_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int ret = remove_ftrace_ops(list, ops); + if (!ret && *list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); + return ret; +} - if (FTRACE_WARN_ON(ops == &global_ops)) +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_DELETED) return -EINVAL; if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * If the ftrace_ops specifies SAVE_REGS, then it only can be used + * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. + * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && + !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) + ops->flags |= FTRACE_OPS_FL_SAVE_REGS; +#endif + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - int first = ftrace_global_list == &ftrace_list_end; - add_ftrace_ops(&ftrace_global_list, ops); - ops->flags |= FTRACE_OPS_FL_ENABLED; - if (first) - add_ftrace_ops(&ftrace_ops_list, &global_ops); + if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (control_ops_alloc(ops)) + return -ENOMEM; + add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else add_ftrace_ops(&ftrace_ops_list, ops); @@ -292,21 +416,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; - if (ftrace_disabled) - return -ENODEV; - if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_ops(&ftrace_global_list, ops); - if (!ret && ftrace_global_list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); - if (!ret) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -316,13 +431,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); - /* - * Dynamic ops may be freed, we must make sure that all - * callers are done before leaving this function. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); - return 0; } @@ -366,7 +474,6 @@ struct ftrace_profile_stat { #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits __read_mostly; static int ftrace_profile_enabled __read_mostly; /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ @@ -374,7 +481,8 @@ static DEFINE_MUTEX(ftrace_profile_lock); static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS) static void * function_stat_next(void *v, int idx) @@ -485,12 +593,18 @@ static int function_stat_show(struct seq_file *m, void *v) if (rec->counter <= 1) stddev = 0; else { - stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Apply Welford's method: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = rec->counter * rec->time_squared - + rec->time * rec->time; + /* * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, (rec->counter - 1) * 1000); + do_div(stddev, rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); @@ -556,7 +670,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); - for (i = 0; i < pages; i++) { + for (i = 1; i < pages; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); if (!pg->next) goto out_free; @@ -574,7 +688,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) free_page(tmp); } - free_page((unsigned long)stat->pages); stat->pages = NULL; stat->start = NULL; @@ -605,13 +718,6 @@ static int ftrace_profile_init_cpu(int cpu) if (!stat->hash) return -ENOMEM; - if (!ftrace_profile_bits) { - size--; - - for (; size; size >>= 1) - ftrace_profile_bits++; - } - /* Preallocate the function profiling pages */ if (ftrace_profile_pages_init(stat) < 0) { kfree(stat->hash); @@ -627,7 +733,7 @@ static int ftrace_profile_init(void) int cpu; int ret = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { ret = ftrace_profile_init_cpu(cpu); if (ret) break; @@ -642,16 +748,15 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec; struct hlist_head *hhd; - struct hlist_node *n; unsigned long key; - key = hash_long(ip, ftrace_profile_bits); + key = hash_long(ip, FTRACE_PROFILE_HASH_BITS); hhd = &stat->hash[key]; if (hlist_empty(hhd)) return NULL; - hlist_for_each_entry_rcu(rec, n, hhd, node) { + hlist_for_each_entry_rcu_notrace(rec, hhd, node) { if (rec->ip == ip) return rec; } @@ -664,7 +769,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, { unsigned long key; - key = hash_long(rec->ip, ftrace_profile_bits); + key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS); hlist_add_head_rcu(&rec->node, &stat->hash[key]); } @@ -705,7 +810,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) } static void -function_profile_call(unsigned long ip, unsigned long parent_ip) +function_profile_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -716,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -735,7 +841,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - function_profile_call(trace->func, 0); + function_profile_call(trace->func, 0, NULL, NULL); return 1; } @@ -747,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -795,6 +901,8 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(ftrace_profile_ops) }; static int register_ftrace_profiler(void) @@ -946,7 +1054,7 @@ struct ftrace_func_probe { unsigned long flags; unsigned long ip; void *data; - struct rcu_head rcu; + struct list_head free_list; }; struct ftrace_func_entry { @@ -977,10 +1085,10 @@ static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(global_ops) }; -static DEFINE_MUTEX(ftrace_regex_lock); - struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; @@ -988,8 +1096,6 @@ struct ftrace_page { int size; }; -static struct ftrace_page *ftrace_new_pgs; - #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -999,7 +1105,7 @@ static struct ftrace_page *ftrace_new_pgs; static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool ftrace_hash_empty(struct ftrace_hash *hash) +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) { return !hash || !hash->count; } @@ -1010,7 +1116,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) unsigned long key; struct ftrace_func_entry *entry; struct hlist_head *hhd; - struct hlist_node *n; if (ftrace_hash_empty(hash)) return NULL; @@ -1022,7 +1127,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) hhd = &hash->buckets[key]; - hlist_for_each_entry_rcu(entry, n, hhd, hlist) { + hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { if (entry->ip == ip) return entry; } @@ -1079,7 +1184,7 @@ remove_hash_entry(struct ftrace_hash *hash, static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; - struct hlist_node *tp, *tn; + struct hlist_node *tn; struct ftrace_func_entry *entry; int size = 1 << hash->size_bits; int i; @@ -1089,7 +1194,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) for (i = 0; i < size; i++) { hhd = &hash->buckets[i]; - hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) + hlist_for_each_entry_safe(entry, tn, hhd, hlist) free_hash_entry(hash, entry); } FTRACE_WARN_ON(hash->count); @@ -1119,6 +1224,13 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); } +void ftrace_free_filter(struct ftrace_ops *ops) +{ + ftrace_ops_init(ops); + free_ftrace_hash(ops->filter_hash); + free_ftrace_hash(ops->notrace_hash); +} + static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; @@ -1129,7 +1241,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return NULL; size = 1 << size_bits; - hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); if (!hash->buckets) { kfree(hash); @@ -1146,7 +1258,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) { struct ftrace_func_entry *entry; struct ftrace_hash *new_hash; - struct hlist_node *tp; int size; int ret; int i; @@ -1161,7 +1272,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { - hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { ret = add_hash_entry(new_hash, entry->ip); if (ret < 0) goto free_hash; @@ -1187,11 +1298,10 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; - struct hlist_node *tp, *tn; + struct hlist_node *tn; struct hlist_head *hhd; struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; - unsigned long key; int size = src->count; int bits = 0; int ret; @@ -1233,11 +1343,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, size = 1 << src->size_bits; for (i = 0; i < size; i++) { hhd = &src->buckets[i]; - hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { - if (bits > 0) - key = hash_long(entry->ip, bits); - else - key = 0; + hlist_for_each_entry_safe(entry, tn, hhd, hlist) { remove_hash_entry(src, entry); __add_hash_entry(new_hash, entry); } @@ -1272,14 +1378,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, * the hashes are freed with call_rcu_sched(). */ static int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { struct ftrace_hash *filter_hash; struct ftrace_hash *notrace_hash; int ret; - filter_hash = rcu_dereference_raw(ops->filter_hash); - notrace_hash = rcu_dereference_raw(ops->notrace_hash); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * There's a small race when adding ops that the ftrace handler + * that wants regs, may be called without them. We can not + * allow that handler to be called if regs is NULL. + */ + if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) + return 0; +#endif + + filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -1309,44 +1425,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) static int ftrace_cmp_recs(const void *a, const void *b) { - const struct dyn_ftrace *reca = a; - const struct dyn_ftrace *recb = b; + const struct dyn_ftrace *key = a; + const struct dyn_ftrace *rec = b; - if (reca->ip > recb->ip) - return 1; - if (reca->ip < recb->ip) + if (key->flags < rec->ip) return -1; + if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) + return 1; return 0; } -/** - * ftrace_location - return true if the ip giving is a traced location - * @ip: the instruction pointer to check - * - * Returns 1 if @ip given is a pointer to a ftrace location. - * That is, the instruction that is either a NOP or call to - * the function tracer. It checks the ftrace internal tables to - * determine if the address belongs or not. - */ -int ftrace_location(unsigned long ip) +static unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; - key.ip = ip; + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; rec = bsearch(&key, pg->records, pg->index, sizeof(struct dyn_ftrace), ftrace_cmp_recs); if (rec) - return 1; + return rec->ip; } return 0; } +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns rec->ip if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +unsigned long ftrace_location(unsigned long ip) +{ + return ftrace_location_range(ip, ip); +} + +/** + * ftrace_text_reserved - return true if range contains an ftrace location + * @start: start of range to search + * @end: end of range to search (inclusive). @end points to the last byte to check. + * + * Returns 1 if @start and @end contains a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_text_reserved(const void *start, const void *end) +{ + unsigned long ret; + + ret = ftrace_location_range((unsigned long)start, + (unsigned long)end); + + return (int)!!ret; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1407,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* + * If filter_hash is set, we want to match all functions + * that are in the hash but not in the other hash. * + * If filter_hash is not set, then we are decrementing. + * That means we match anything that is in the hash + * and also in the other_hash. That is, we need to turn + * off functions in the other hash because they are disabled + * by this hash. */ if (filter_hash && in_hash && !in_other_hash) match = 1; @@ -1422,6 +1574,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags++; if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) return; + /* + * If any ops wants regs saved for this function + * then all ops will get saved regs. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) return; @@ -1446,35 +1604,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) -{ - if (ftrace_pages->index == ftrace_pages->size) { - /* We should have allocated enough */ - if (WARN_ON(!ftrace_pages->next)) - return NULL; - ftrace_pages = ftrace_pages->next; - } - - return &ftrace_pages->records[ftrace_pages->index++]; -} - -static struct dyn_ftrace * -ftrace_record_ip(unsigned long ip) -{ - struct dyn_ftrace *rec; - - if (ftrace_disabled) - return NULL; - - rec = ftrace_alloc_dyn_node(ip); - if (!rec) - return NULL; - - rec->ip = ip; - - return rec; -} - static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -1524,21 +1653,6 @@ void ftrace_bug(int failed, unsigned long ip) } } - -/* Return 1 if the address range is reserved for ftrace */ -int ftrace_text_reserved(void *start, void *end) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - - do_for_each_ftrace_rec(pg, rec) { - if (rec->ip <= (unsigned long)end && - rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) - return 1; - } while_for_each_ftrace_rec(); - return 0; -} - static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { unsigned long flag = 0UL; @@ -1557,18 +1671,55 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; + /* + * If enabling and the REGS flag does not match the REGS_EN, then + * do not ignore this record. Set flags to fail the compare against + * ENABLED. + */ + if (flag && + (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) + flag |= FTRACE_FL_REGS; + /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) return FTRACE_UPDATE_IGNORE; if (flag) { - if (update) + /* Save off if rec is being enabled (for return value) */ + flag ^= rec->flags & FTRACE_FL_ENABLED; + + if (update) { rec->flags |= FTRACE_FL_ENABLED; - return FTRACE_UPDATE_MAKE_CALL; + if (flag & FTRACE_FL_REGS) { + if (rec->flags & FTRACE_FL_REGS) + rec->flags |= FTRACE_FL_REGS_EN; + else + rec->flags &= ~FTRACE_FL_REGS_EN; + } + } + + /* + * If this record is being updated from a nop, then + * return UPDATE_MAKE_CALL. + * Otherwise, + * return UPDATE_MODIFY_CALL to tell the caller to convert + * from the save regs, to a non-save regs function or + * vice versa. + */ + if (flag & FTRACE_FL_ENABLED) + return FTRACE_UPDATE_MAKE_CALL; + + return FTRACE_UPDATE_MODIFY_CALL; } - if (update) - rec->flags &= ~FTRACE_FL_ENABLED; + if (update) { + /* If there's no more users, clear all flags */ + if (!(rec->flags & ~FTRACE_FL_MASK)) + rec->flags = 0; + else + /* Just disable the record (keep REGS state) */ + rec->flags &= ~FTRACE_FL_ENABLED; + } return FTRACE_UPDATE_MAKE_NOP; } @@ -1600,13 +1751,53 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec: The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec: The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { + unsigned long ftrace_old_addr; unsigned long ftrace_addr; int ret; - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = ftrace_get_addr_new(rec); + + /* This needs to be done before we call ftrace_update_record */ + ftrace_old_addr = ftrace_get_addr_curr(rec); ret = ftrace_update_record(rec, enable); @@ -1619,12 +1810,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); + + case FTRACE_UPDATE_MODIFY_CALL: + return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } return -1; /* unknow ftrace bug */ } -static void ftrace_replace_code(int update) +void __weak ftrace_replace_code(int enable) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1634,7 +1828,7 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - failed = __ftrace_replace_code(rec, update); + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1752,22 +1946,55 @@ int __weak ftrace_arch_code_modify_post_process(void) return 0; } -static int __ftrace_modify_code(void *data) +void ftrace_modify_all_code(int command) { - int *command = data; + int update = command & FTRACE_UPDATE_TRACE_FUNC; + int err = 0; - if (*command & FTRACE_UPDATE_CALLS) + /* + * If the ftrace_caller calls a ftrace_ops func directly, + * we need to make sure that it only traces functions it + * expects to trace. When doing the switch of functions, + * we need to update to the ftrace_ops_list_func first + * before the transition between old and new calls are set, + * as the ftrace_ops_list_func will check the ops hashes + * to make sure the ops are having the right functions + * traced. + */ + if (update) { + err = ftrace_update_ftrace_func(ftrace_ops_list_func); + if (FTRACE_WARN_ON(err)) + return; + } + + if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); - else if (*command & FTRACE_DISABLE_CALLS) + else if (command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - if (*command & FTRACE_UPDATE_TRACE_FUNC) - ftrace_update_ftrace_func(ftrace_trace_function); + if (update && ftrace_trace_function != ftrace_ops_list_func) { + function_trace_op = set_function_trace_op; + smp_wmb(); + /* If irqs are disabled, we are in stop machine */ + if (!irqs_disabled()) + smp_call_function(ftrace_sync_ipi, NULL, 1); + err = ftrace_update_ftrace_func(ftrace_trace_function); + if (FTRACE_WARN_ON(err)) + return; + } - if (*command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); - else if (*command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); + if (command & FTRACE_START_FUNC_RET) + err = ftrace_enable_ftrace_graph_caller(); + else if (command & FTRACE_STOP_FUNC_RET) + err = ftrace_disable_ftrace_graph_caller(); + FTRACE_WARN_ON(err); +} + +static int __ftrace_modify_code(void *data) +{ + int *command = data; + + ftrace_modify_all_code(*command); return 0; } @@ -1818,16 +2045,6 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); @@ -1838,6 +2055,11 @@ static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; static int global_start_up; +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -1853,38 +2075,37 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { - bool hash_enable = true; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; + ret = __register_ftrace_function(ops); + if (ret) + return ret; + ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - /* ops marked global share the filter hashes */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - /* Don't update hash if global is already set */ - if (global_start_up) - hash_enable = false; - global_start_up++; - } - ops->flags |= FTRACE_OPS_FL_ENABLED; - if (hash_enable) - ftrace_hash_rec_enable(ops, 1); + + ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); return 0; } -static void ftrace_shutdown(struct ftrace_ops *ops, int command) +static int ftrace_shutdown(struct ftrace_ops *ops, int command) { - bool hash_disable = true; + int ret; if (unlikely(ftrace_disabled)) - return; + return -ENODEV; + + ret = __unregister_ftrace_function(ops); + if (ret) + return ret; ftrace_start_up--; /* @@ -1894,21 +2115,9 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - global_start_up--; - WARN_ON_ONCE(global_start_up < 0); - /* Don't update hash if global still has users */ - if (global_start_up) { - WARN_ON_ONCE(!ftrace_start_up); - hash_disable = false; - } - } - - if (hash_disable) - ftrace_hash_rec_disable(ops, 1); + ftrace_hash_rec_disable(ops, 1); - if (ops != &global_ops || !global_start_up) + if (!global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -1918,10 +2127,42 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_TRACE_FUNC; } - if (!command || !ftrace_enabled) - return; + if (!command || !ftrace_enabled) { + /* + * If these are control ops, they still need their + * per_cpu field freed. Since, function tracing is + * not currently active, we can just free them + * without synchronizing all CPUs. + */ + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + return 0; + } ftrace_run_update_code(command); + + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + * The same goes for freeing the per_cpu data of the control + * ops. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + schedule_on_each_cpu(ftrace_sync); + + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + } + + return 0; } static void ftrace_startup_sysctl(void) @@ -1947,23 +2188,69 @@ static void ftrace_shutdown_sysctl(void) } static cycle_t ftrace_update_time; -static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ops_traces_mod(struct ftrace_ops *ops) +static inline int ops_traces_mod(struct ftrace_ops *ops) { - struct ftrace_hash *hash; + /* + * Filter_hash being empty will default to trace module. + * But notrace hash requires a test of individual module functions. + */ + return ftrace_hash_empty(ops->filter_hash) && + ftrace_hash_empty(ops->notrace_hash); +} + +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static inline bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + /* If ops traces all mods, we already accounted for it */ + if (ops_traces_mod(ops)) + return 0; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->filter_hash) && + !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + return 0; - hash = ops->filter_hash; - return ftrace_hash_empty(hash); + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + return 0; + + return 1; } -static int ftrace_update_code(struct module *mod) +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; +} + +static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; + unsigned long update_cnt = 0; unsigned long ref = 0; + bool test = false; int i; /* @@ -1977,24 +2264,30 @@ static int ftrace_update_code(struct module *mod) for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED && - ops_traces_mod(ops)) - ref++; + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + if (ops_traces_mod(ops)) + ref++; + else + test = true; + } } } start = ftrace_now(raw_smp_processor_id()); - ftrace_update_cnt = 0; - for (pg = ftrace_new_pgs; pg; pg = pg->next) { + for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { + int cnt = ref; + /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - p->flags = ref; + if (test) + cnt += referenced_filters(p); + p->flags = cnt; /* * Do the initial record conversion from mcount jump @@ -2003,7 +2296,7 @@ static int ftrace_update_code(struct module *mod) if (!ftrace_code_disable(mod, p)) break; - ftrace_update_cnt++; + update_cnt++; /* * If the tracing is enabled, go ahead and enable the record. @@ -2014,7 +2307,7 @@ static int ftrace_update_code(struct module *mod) * conversion puts the module to the correct state, thus * passing the ftrace_make_call check. */ - if (ftrace_start_up && ref) { + if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(p, 1); if (failed) ftrace_bug(failed, p->ip); @@ -2022,11 +2315,9 @@ static int ftrace_update_code(struct module *mod) } } - ftrace_new_pgs = NULL; - stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; - ftrace_update_tot_cnt += ftrace_update_cnt; + ftrace_update_tot_cnt += update_cnt; return 0; } @@ -2118,22 +2409,6 @@ ftrace_allocate_pages(unsigned long num_to_init) return NULL; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) -{ - int cnt; - - if (!num_to_init) { - pr_info("ftrace: No functions to be traced?\n"); - return -1; - } - - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); - - return 0; -} - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -2275,7 +2550,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && - !(rec->flags & ~FTRACE_FL_MASK))) { + !(rec->flags & FTRACE_FL_ENABLED))) { rec = NULL; goto retry; @@ -2294,7 +2569,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -2377,8 +2652,9 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) - seq_printf(m, " (%ld)", - rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, " (%ld)%s", + rec->flags & ~FTRACE_FL_MASK, + rec->flags & FTRACE_FL_REGS ? " R" : ""); seq_printf(m, "\n"); return 0; @@ -2395,57 +2671,35 @@ static int ftrace_avail_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - int ret; if (unlikely(ftrace_disabled)) return -ENODEV; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->ops = &global_ops; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (iter) { + iter->pg = ftrace_pages_start; + iter->ops = &global_ops; } - return ret; + return iter ? 0 : -ENOMEM; } static int ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - int ret; if (unlikely(ftrace_disabled)) return -ENODEV; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->flags = FTRACE_ITER_ENABLED; - iter->ops = &global_ops; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (iter) { + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; } - return ret; + return iter ? 0 : -ENOMEM; } static void ftrace_filter_reset(struct ftrace_hash *hash) @@ -2468,7 +2722,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) * routine, you can use ftrace_filter_write() for the write * routine if @flag has FTRACE_ITER_FILTER set, or * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and + * tracing_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). */ int @@ -2479,6 +2733,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, struct ftrace_hash *hash; int ret = 0; + ftrace_ops_init(ops); + if (unlikely(ftrace_disabled)) return -ENODEV; @@ -2491,28 +2747,26 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, return -ENOMEM; } + iter->ops = ops; + iter->flags = flag; + + mutex_lock(&ops->regex_lock); + if (flag & FTRACE_ITER_NOTRACE) hash = ops->notrace_hash; else hash = ops->filter_hash; - iter->ops = ops; - iter->flags = flag; - if (file->f_mode & FMODE_WRITE) { - mutex_lock(&ftrace_lock); iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); - mutex_unlock(&ftrace_lock); - if (!iter->hash) { trace_parser_put(&iter->parser); kfree(iter); - return -ENOMEM; + ret = -ENOMEM; + goto out_unlock; } } - mutex_lock(&ftrace_regex_lock); - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_filter_reset(iter->hash); @@ -2532,7 +2786,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } } else file->private_data = iter; - mutex_unlock(&ftrace_regex_lock); + + out_unlock: + mutex_unlock(&ops->regex_lock); return ret; } @@ -2540,7 +2796,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, inode, file); } @@ -2548,21 +2806,10 @@ ftrace_filter_open(struct inode *inode, struct file *file) static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, - inode, file); -} - -loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int origin) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, origin); - else - file->f_pos = ret = 1; + struct ftrace_ops *ops = inode->i_private; - return ret; + return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, + inode, file); } static int ftrace_match(char *str, char *regex, int len, int type) @@ -2746,14 +2993,13 @@ static int __init ftrace_mod_cmd_init(void) { return register_ftrace_command(&ftrace_mod_cmd); } -device_initcall(ftrace_mod_cmd_init); +core_initcall(ftrace_mod_cmd_init); -static void -function_trace_probe_call(unsigned long ip, unsigned long parent_ip) +static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ftrace_func_probe *entry; struct hlist_head *hhd; - struct hlist_node *n; unsigned long key; key = hash_long(ip, FTRACE_HASH_BITS); @@ -2769,7 +3015,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu(entry, n, hhd, node) { + hlist_for_each_entry_rcu_notrace(entry, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } @@ -2779,6 +3025,8 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_probe_ops __read_mostly = { .func = function_trace_probe_call, + .flags = FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(trace_probe_ops) }; static int ftrace_probe_registered; @@ -2788,8 +3036,12 @@ static void __enable_ftrace_function_probe(void) int ret; int i; - if (ftrace_probe_registered) + if (ftrace_probe_registered) { + /* still need to update the function call sites */ + if (ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); return; + } for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -2800,16 +3052,13 @@ static void __enable_ftrace_function_probe(void) if (i == FTRACE_FUNC_HASHSIZE) return; - ret = __register_ftrace_function(&trace_probe_ops); - if (!ret) - ret = ftrace_startup(&trace_probe_ops, 0); + ret = ftrace_startup(&trace_probe_ops, 0); ftrace_probe_registered = 1; } static void __disable_ftrace_function_probe(void) { - int ret; int i; if (!ftrace_probe_registered) @@ -2822,36 +3071,33 @@ static void __disable_ftrace_function_probe(void) } /* no more funcs left */ - ret = __unregister_ftrace_function(&trace_probe_ops); - if (!ret) - ftrace_shutdown(&trace_probe_ops, 0); + ftrace_shutdown(&trace_probe_ops, 0); ftrace_probe_registered = 0; } -static void ftrace_free_entry_rcu(struct rcu_head *rhp) +static void ftrace_free_entry(struct ftrace_func_probe *entry) { - struct ftrace_func_probe *entry = - container_of(rhp, struct ftrace_func_probe, rcu); - if (entry->ops->free) - entry->ops->free(&entry->data); + entry->ops->free(entry->ops, entry->ip, &entry->data); kfree(entry); } - int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_probe *entry; + struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; int type, len, not; unsigned long key; int count = 0; char *search; + int ret; type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); @@ -2860,10 +3106,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (WARN_ON(not)) return -EINVAL; - mutex_lock(&ftrace_lock); + mutex_lock(&trace_probe_ops.regex_lock); - if (unlikely(ftrace_disabled)) - goto out_unlock; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) { + count = -ENOMEM; + goto out; + } + + if (unlikely(ftrace_disabled)) { + count = -ENODEV; + goto out; + } + + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { @@ -2887,14 +3143,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * for each function we find. We call the callback * to give the caller an opportunity to do so. */ - if (ops->callback) { - if (ops->callback(rec->ip, &entry->data) < 0) { + if (ops->init) { + if (ops->init(ops, rec->ip, &entry->data) < 0) { /* caller does not like this func */ kfree(entry); continue; } } + ret = enter_record(hash, rec, 0); + if (ret < 0) { + kfree(entry); + count = ret; + goto out_unlock; + } + entry->ops = ops; entry->ip = rec->ip; @@ -2902,10 +3165,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); } while_for_each_ftrace_rec(); + + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + if (ret < 0) + count = ret; + __enable_ftrace_function_probe(); out_unlock: mutex_unlock(&ftrace_lock); + out: + mutex_unlock(&trace_probe_ops.regex_lock); + free_ftrace_hash(hash); return count; } @@ -2919,8 +3190,13 @@ static void __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { + struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; - struct hlist_node *n, *tmp; + struct ftrace_func_probe *p; + struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct list_head free_list; + struct ftrace_hash *hash; + struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; int i, len = 0; @@ -2939,11 +3215,19 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return; } - mutex_lock(&ftrace_lock); + mutex_lock(&trace_probe_ops.regex_lock); + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + /* Hmm, should report this somehow */ + goto out_unlock; + + INIT_LIST_HEAD(&free_list); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; - hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + hlist_for_each_entry_safe(entry, tmp, hhd, node) { /* break up if statements for readability */ if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) @@ -2960,12 +3244,32 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; } - hlist_del(&entry->node); - call_rcu(&entry->rcu, ftrace_free_entry_rcu); + rec_entry = ftrace_lookup_ip(hash, entry->ip); + /* It is possible more than one entry had this ip */ + if (rec_entry) + free_hash_entry(hash, rec_entry); + + hlist_del_rcu(&entry->node); + list_add(&entry->free_list, &free_list); } } + mutex_lock(&ftrace_lock); __disable_ftrace_function_probe(); + /* + * Remove after the disable is called. Otherwise, if the last + * probe is removed, a null hash means *all enabled*. + */ + ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + synchronize_sched(); + list_for_each_entry_safe(entry, p, &free_list, free_list) { + list_del(&entry->free_list); + ftrace_free_entry(entry); + } mutex_unlock(&ftrace_lock); + + out_unlock: + mutex_unlock(&trace_probe_ops.regex_lock); + free_ftrace_hash(hash); } void @@ -2990,7 +3294,11 @@ void unregister_ftrace_function_probe_all(char *glob) static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); -int register_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only register ftrace commands from __init, so mark this + * __init too. + */ +__init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; int ret = 0; @@ -3009,7 +3317,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd) return ret; } -int unregister_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only unregister ftrace commands from __init, so mark + * this __init too. + */ +__init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; int ret = -ENODEV; @@ -3074,18 +3386,17 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (!cnt) return 0; - mutex_lock(&ftrace_regex_lock); - - ret = -ENODEV; - if (unlikely(ftrace_disabled)) - goto out_unlock; - if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; iter = m->private; } else iter = file->private_data; + if (unlikely(ftrace_disabled)) + return -ENODEV; + + /* iter->hash is a local copy, so we don't need regex_lock */ + parser = &iter->parser; read = trace_get_user(parser, ubuf, cnt, ppos); @@ -3094,14 +3405,12 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, ret = ftrace_process_regex(iter->hash, parser->buffer, parser->idx, enable); trace_parser_clear(parser); - if (ret) - goto out_unlock; + if (ret < 0) + goto out; } ret = read; -out_unlock: - mutex_unlock(&ftrace_regex_lock); - + out: return ret; } @@ -3120,49 +3429,112 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable) +ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +{ + struct ftrace_func_entry *entry; + + if (!ftrace_location(ip)) + return -EINVAL; + + if (remove) { + entry = ftrace_lookup_ip(hash, ip); + if (!entry) + return -ENOENT; + free_hash_entry(hash, entry); + return 0; + } + + return add_hash_entry(hash, ip); +} + +static void ftrace_ops_update_code(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); +} + +static int +ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, + unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; int ret; - /* All global ops uses the global ops filters */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) - ops = &global_ops; - if (unlikely(ftrace_disabled)) return -ENODEV; + mutex_lock(&ops->regex_lock); + if (enable) orig_hash = &ops->filter_hash; else orig_hash = &ops->notrace_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); - if (!hash) - return -ENOMEM; + if (!hash) { + ret = -ENOMEM; + goto out_regex_unlock; + } - mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); - if (buf) - ftrace_match_records(hash, buf, len); + if (buf && !ftrace_match_records(hash, buf, len)) { + ret = -EINVAL; + goto out_regex_unlock; + } + if (ip) { + ret = ftrace_match_addr(hash, ip, remove); + if (ret < 0) + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (!ret) + ftrace_ops_update_code(ops); mutex_unlock(&ftrace_lock); - mutex_unlock(&ftrace_regex_lock); + out_regex_unlock: + mutex_unlock(&ops->regex_lock); free_ftrace_hash(hash); return ret; } +static int +ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, + int reset, int enable) +{ + return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); +} + +/** + * ftrace_set_filter_ip - set a function to filter on in ftrace by address + * @ops - the ops to set the filter with + * @ip - the address to add to or remove from the filter. + * @remove - non zero to remove the ip from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ip is NULL, it failes to update filter. + */ +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, + int remove, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_addr(ops, ip, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) +{ + return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @ops - the ops to set the filter with @@ -3173,10 +3545,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. */ -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 1); + ftrace_ops_init(ops); + return ftrace_set_regex(ops, buf, len, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -3191,15 +3564,15 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. */ -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 0); + ftrace_ops_init(ops); + return ftrace_set_regex(ops, buf, len, reset, 0); } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** - * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with + * ftrace_set_global_filter - set a function to filter on with global tracers * @buf - the string that holds the function filter text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -3214,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset) EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** - * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with + * ftrace_set_global_notrace - set a function to not trace with global tracers * @buf - the string that holds the function notrace text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -3237,23 +3609,28 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; + static int __init set_ftrace_notrace(char *str) { - strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + ftrace_filter_param = true; + strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { - strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + ftrace_filter_param = true; + strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); static int __init set_graph_function(char *str) { @@ -3271,7 +3648,7 @@ static void __init set_ftrace_early_graph(char *buf) func = strsep(&buf, ","); /* we allow only one expression at a time */ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - func); + FTRACE_GRAPH_MAX_FUNCS, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -3284,6 +3661,8 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; + ftrace_ops_init(ops); + while (buf) { func = strsep(&buf, ","); ftrace_set_regex(ops, func, strlen(func), 0, enable); @@ -3311,10 +3690,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) int filter_hash; int ret; - mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { iter = m->private; - seq_release(inode, file); } else iter = file->private_data; @@ -3327,6 +3704,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); + mutex_lock(&iter->ops->regex_lock); + if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); @@ -3338,16 +3717,16 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_lock(&ftrace_lock); ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); - if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (!ret) + ftrace_ops_update_code(iter->ops); mutex_unlock(&ftrace_lock); } + + mutex_unlock(&iter->ops->regex_lock); free_ftrace_hash(iter->hash); kfree(iter); - mutex_unlock(&ftrace_regex_lock); return 0; } @@ -3369,7 +3748,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -3377,7 +3756,7 @@ static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = seq_read, .write = ftrace_notrace_write, - .llseek = ftrace_regex_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -3386,15 +3765,25 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; -int ftrace_graph_filter_enabled; +int ftrace_graph_notrace_count; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +struct ftrace_graph_data { + unsigned long *table; + size_t size; + int *count; + const struct seq_operations *seq_ops; +}; static void * __g_next(struct seq_file *m, loff_t *pos) { - if (*pos >= ftrace_graph_count) + struct ftrace_graph_data *fgd = m->private; + + if (*pos >= *fgd->count) return NULL; - return &ftrace_graph_funcs[*pos]; + return &fgd->table[*pos]; } static void * @@ -3406,10 +3795,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos) static void *g_start(struct seq_file *m, loff_t *pos) { + struct ftrace_graph_data *fgd = m->private; + mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_filter_enabled && !*pos) + if (!*fgd->count && !*pos) return (void *)1; return __g_next(m, pos); @@ -3445,38 +3836,88 @@ static const struct seq_operations ftrace_graph_seq_ops = { }; static int -ftrace_graph_open(struct inode *inode, struct file *file) +__ftrace_graph_open(struct inode *inode, struct file *file, + struct ftrace_graph_data *fgd) { int ret = 0; - if (unlikely(ftrace_disabled)) - return -ENODEV; - mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ftrace_graph_filter_enabled = 0; - ftrace_graph_count = 0; - memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + *fgd->count = 0; + memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); } mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_graph_seq_ops); + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, fgd->seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = fgd; + } + } else + file->private_data = fgd; return ret; } static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int +ftrace_graph_notrace_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_notrace_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_notrace_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int ftrace_graph_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + + kfree(m->private); seq_release(inode, file); + } else { + kfree(file->private_data); + } + return 0; } static int -ftrace_set_func(unsigned long *array, int *idx, char *buffer) +ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -3489,7 +3930,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + if (!not && *idx >= size) return -EBUSY; search_len = strlen(search); @@ -3517,7 +3958,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) fail = 0; if (!exists) { array[(*idx)++] = rec->ip; - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + if (*idx >= size) goto out; } } else { @@ -3535,7 +3976,6 @@ out: if (fail) return -EINVAL; - ftrace_graph_filter_enabled = 1; return 0; } @@ -3544,36 +3984,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - ssize_t read, ret; + ssize_t read, ret = 0; + struct ftrace_graph_data *fgd = file->private_data; if (!cnt) return 0; - mutex_lock(&graph_lock); - - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { - ret = -ENOMEM; - goto out_unlock; - } + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) + return -ENOMEM; read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { parser.buffer[parser.idx] = 0; + mutex_lock(&graph_lock); + /* we allow only one expression at a time */ - ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - parser.buffer); - if (ret) - goto out_free; + ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, + parser.buffer); + + mutex_unlock(&graph_lock); } - ret = read; + if (!ret) + ret = read; -out_free: trace_parser_put(&parser); -out_unlock: - mutex_unlock(&graph_lock); return ret; } @@ -3582,11 +4019,49 @@ static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, .read = seq_read, .write = ftrace_graph_write, + .llseek = tracing_lseek, + .release = ftrace_graph_release, +}; + +static const struct file_operations ftrace_graph_notrace_fops = { + .open = ftrace_graph_notrace_open, + .read = seq_read, + .write = ftrace_graph_write, + .llseek = tracing_lseek, .release = ftrace_graph_release, - .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent) +{ + + trace_create_file("set_ftrace_filter", 0644, parent, + ops, &ftrace_filter_fops); + + trace_create_file("set_ftrace_notrace", 0644, parent, + ops, &ftrace_notrace_fops); +} + +/* + * The name "destroy_filter_files" is really a misnomer. Although + * in the future, it may actualy delete the files, but this is + * really intended to make sure the ops passed in are disabled + * and that when this function returns, the caller is free to + * free the ops. + * + * The "destroy" name is only to match the "create" name that this + * should be paired with. + */ +void ftrace_destroy_filter_files(struct ftrace_ops *ops) +{ + mutex_lock(&ftrace_lock); + if (ops->flags & FTRACE_OPS_FL_ENABLED) + ftrace_shutdown(ops, 0); + ops->flags |= FTRACE_OPS_FL_DELETED; + mutex_unlock(&ftrace_lock); +} + static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { @@ -3596,37 +4071,50 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("enabled_functions", 0444, d_tracer, NULL, &ftrace_enabled_fops); - trace_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - - trace_create_file("set_ftrace_notrace", 0644, d_tracer, - NULL, &ftrace_notrace_fops); + ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); + trace_create_file("set_graph_notrace", 0444, d_tracer, + NULL, + &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; } -static void ftrace_swap_recs(void *a, void *b, int size) +static int ftrace_cmp_ips(const void *a, const void *b) { - struct dyn_ftrace *reca = a; - struct dyn_ftrace *recb = b; - struct dyn_ftrace t; + const unsigned long *ipa = a; + const unsigned long *ipb = b; - t = *reca; - *reca = *recb; - *recb = t; + if (*ipa > *ipb) + return 1; + if (*ipa < *ipb) + return -1; + return 0; +} + +static void ftrace_swap_ips(void *a, void *b, int size) +{ + unsigned long *ipa = a; + unsigned long *ipb = b; + unsigned long t; + + t = *ipa; + *ipa = *ipb; + *ipb = t; } static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *start_pg; struct ftrace_page *pg; + struct dyn_ftrace *rec; unsigned long count; unsigned long *p; unsigned long addr; @@ -3638,8 +4126,11 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; - pg = ftrace_allocate_pages(count); - if (!pg) + sort(start, count, sizeof(*start), + ftrace_cmp_ips, ftrace_swap_ips); + + start_pg = ftrace_allocate_pages(count); + if (!start_pg) return -ENOMEM; mutex_lock(&ftrace_lock); @@ -3652,7 +4143,7 @@ static int ftrace_process_locs(struct module *mod, if (!mod) { WARN_ON(ftrace_pages || ftrace_pages_start); /* First initialization */ - ftrace_pages = ftrace_pages_start = pg; + ftrace_pages = ftrace_pages_start = start_pg; } else { if (!ftrace_pages) goto out; @@ -3663,11 +4154,11 @@ static int ftrace_process_locs(struct module *mod, ftrace_pages = ftrace_pages->next; } - ftrace_pages->next = pg; - ftrace_pages = pg; + ftrace_pages->next = start_pg; } p = start; + pg = start_pg; while (p < end) { addr = ftrace_call_adjust(*p++); /* @@ -3678,17 +4169,23 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - if (!ftrace_record_ip(addr)) - break; + + if (pg->index == pg->size) { + /* We should have allocated enough */ + if (WARN_ON(!pg->next)) + break; + pg = pg->next; + } + + rec = &pg->records[pg->index++]; + rec->ip = addr; } - /* These new locations need to be initialized */ - ftrace_new_pgs = pg; + /* We should have used all pages */ + WARN_ON(pg->next); - /* Make each individual set of pages sorted by ips */ - for (; pg; pg = pg->next) - sort(pg->records, pg->index, sizeof(struct dyn_ftrace), - ftrace_cmp_recs, ftrace_swap_recs); + /* Assign the last page to ftrace_pages */ + ftrace_pages = pg; /* * We only need to disable interrupts on start up @@ -3700,7 +4197,7 @@ static int ftrace_process_locs(struct module *mod, */ if (!mod) local_irq_save(flags); - ftrace_update_code(mod); + ftrace_update_code(mod, start_pg); if (!mod) local_irq_restore(flags); ret = 0; @@ -3764,61 +4261,57 @@ static void ftrace_init_module(struct module *mod, ftrace_process_locs(mod, start, end); } -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +void ftrace_module_init(struct module *mod) +{ + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); +} + +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; - switch (val) { - case MODULE_STATE_COMING: - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); - break; - case MODULE_STATE_GOING: + if (val == MODULE_STATE_GOING) ftrace_release_mod(mod); - break; - } return 0; } #else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify_exit(struct notifier_block *self, + unsigned long val, void *data) { return 0; } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, - .priority = 0, +struct notifier_block ftrace_module_exit_nb = { + .notifier_call = ftrace_module_notify_exit, + .priority = INT_MIN, /* Run after anything that can remove kprobes */ }; -extern unsigned long __start_mcount_loc[]; -extern unsigned long __stop_mcount_loc[]; - void __init ftrace_init(void) { - unsigned long count, addr, flags; + extern unsigned long __start_mcount_loc[]; + extern unsigned long __stop_mcount_loc[]; + unsigned long count, flags; int ret; - /* Keep the ftrace pointer to the stub */ - addr = (unsigned long)ftrace_stub; - local_irq_save(flags); - ftrace_dyn_arch_init(&addr); + ret = ftrace_dyn_arch_init(); local_irq_restore(flags); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) + if (ret) goto failed; count = __stop_mcount_loc - __start_mcount_loc; - - ret = ftrace_dyn_table_alloc(count); - if (ret) + if (!count) { + pr_info("ftrace: No functions to be traced?\n"); goto failed; + } + + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, count / ENTRIES_PER_PAGE + 1); last_ftrace_enabled = ftrace_enabled = 1; @@ -3826,9 +4319,9 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); + ret = register_module_notifier(&ftrace_module_exit_nb); if (ret) - pr_warning("Failed to register trace ftrace module notifier\n"); + pr_warning("Failed to register trace ftrace module exit notifier\n"); set_ftrace_early_filters(); @@ -3841,6 +4334,8 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -3848,51 +4343,162 @@ static int __init ftrace_nodyn_init(void) ftrace_enabled = 1; return 0; } -device_initcall(ftrace_nodyn_init); +core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) \ - ({ \ - (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ - 0; \ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) +# define ftrace_shutdown(ops, command) \ + ({ \ + int ___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ }) -# define ftrace_shutdown(ops, command) do { } while (0) + # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) static inline int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ + tr->ops = &global_ops; + tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ + /* If we filter on pids, update to use the pid function */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + if (WARN_ON(tr->ops->func != ftrace_stub)) + printk("ftrace ops had %pS for function\n", + tr->ops->func); + /* Only the top level instance does pid tracing */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + } + tr->ops->func = func; + tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ + tr->ops->func = ftrace_stub; +} + static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) + return; + + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + trace_recursion_set(TRACE_CONTROL_BIT); + + /* + * Control funcs (perf) uses RCU. Only trace if + * RCU is currently active. + */ + if (!rcu_is_watching()) + goto out; + + do_for_each_ftrace_op(op, ftrace_control_list) { + if (!(op->flags & FTRACE_OPS_FL_STUB) && + !ftrace_function_local_disabled(op) && + ftrace_ops_test(op, ip, regs)) + op->func(ip, parent_ip, op, regs); + } while_for_each_ftrace_op(op); + out: + trace_recursion_clear(TRACE_CONTROL_BIT); + preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { + .func = ftrace_ops_control_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + INIT_REGEX_LOCK(control_ops) +}; + +static inline void +__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; + int bit; + + if (function_trace_stop) + return; - if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) return; - trace_recursion_set(TRACE_INTERNAL_BIT); /* * Some of the ops may be dynamically allocated, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); - op = rcu_dereference_raw(ftrace_ops_list); - while (op != &ftrace_list_end) { - if (ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); - op = rcu_dereference_raw(op->next); - }; + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (ftrace_ops_test(op, ip, regs)) { + if (WARN_ON(!op->func)) { + function_trace_stop = 1; + printk("op=%p %pS\n", op, op); + goto out; + } + op->func(ip, parent_ip, op, regs); + } + } while_for_each_ftrace_op(op); +out: preempt_enable_notrace(); - trace_recursion_clear(TRACE_INTERNAL_BIT); + trace_clear_recursion(bit); +} + +/* + * Some archs only support passing ip and parent_ip. Even though + * the list function ignores the op parameter, we do not want any + * C side effects, where a function is called without the caller + * sending a third parameter. + * Archs are to support both the regs and ftrace_ops at the same time. + * If they support ftrace_ops, it is assumed they support regs. + * If call backs want to use regs, they must either check for regs + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. + * An architecture can pass partial regs with ftrace_ops and still + * set the ARCH_SUPPORT_FTARCE_OPS. + */ +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, regs); +} +#else +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } +#endif static void clear_ftrace_swapper(void) { @@ -4114,7 +4720,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, if (strlen(tmp) == 0) return 1; - ret = strict_strtol(tmp, 10, &val); + ret = kstrtol(tmp, 10, &val); if (ret < 0) return ret; @@ -4136,7 +4742,7 @@ static const struct file_operations ftrace_pid_fops = { .open = ftrace_pid_open, .write = ftrace_pid_write, .read = seq_read, - .llseek = seq_lseek, + .llseek = tracing_lseek, .release = ftrace_pid_release, }; @@ -4196,18 +4802,14 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret = -1; - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) - goto out_unlock; + ftrace_ops_init(ops); - ret = __register_ftrace_function(ops); - if (!ret) - ret = ftrace_startup(ops, 0); + mutex_lock(&ftrace_lock); + ret = ftrace_startup(ops, 0); - out_unlock: mutex_unlock(&ftrace_lock); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); @@ -4223,9 +4825,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_lock); - ret = __unregister_ftrace_function(ops); - if (!ret) - ftrace_shutdown(ops, 0); + ret = ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -4256,12 +4856,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) { - if (ftrace_ops_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_ops_list->func; - else - ftrace_trace_function = ftrace_ops_list_func; - } + if (ftrace_ops_list != &ftrace_list_end) + update_ftrace_function(); } else { /* stopping ftrace calls (just send to ftrace_stub) */ @@ -4278,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int ftrace_graph_active; -static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -4289,6 +4884,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) trace_func_graph_ret_t ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -4423,6 +5019,34 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func, NULL)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +static void update_function_graph_func(void) +{ + if (ftrace_ops_list == &ftrace_list_end || + (ftrace_ops_list == &global_ops && + global_ops.next == &ftrace_list_end)) + ftrace_graph_entry = __ftrace_graph_entry; + else + ftrace_graph_entry = ftrace_graph_entry_test; +} + +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -4436,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, goto out; } - ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); ftrace_graph_active++; @@ -4447,7 +5070,19 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, } ftrace_graph_return = retfunc; - ftrace_graph_entry = entryfunc; + + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. + */ + __ftrace_graph_entry = entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); + + /* Function graph doesn't use the .func field of global_ops */ + global_ops.flags |= FTRACE_OPS_FL_STUB; ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); @@ -4466,7 +5101,9 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; + __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + global_ops.flags &= ~FTRACE_OPS_FL_STUB; unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index f55fcf61b22..1c71382b283 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,8 +13,5 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> -#ifdef EVENT_POWER_TRACING_DEPRECATED -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f5b7b5c1195..ff7027199a9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,16 +3,21 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ +#include <linux/ftrace_event.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> +#include <linux/trace_seq.h> #include <linux/spinlock.h> +#include <linux/irq_work.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/hardirq.h> +#include <linux/kthread.h> /* for self test */ #include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/delay.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> @@ -21,7 +26,8 @@ #include <linux/fs.h> #include <asm/local.h> -#include "trace.h" + +static void update_pages_handler(struct work_struct *work); /* * The ring buffer header is special. We must manually up keep it. @@ -30,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "# compressed entry header\n"); - ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); - ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); - ret = trace_seq_printf(s, "\tarray : 32 bits\n"); - ret = trace_seq_printf(s, "\n"); + ret = trace_seq_puts(s, "# compressed entry header\n"); + ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); + ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_puts(s, "\tarray : 32 bits\n"); + ret = trace_seq_putc(s, '\n'); ret = trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", @@ -154,33 +160,10 @@ enum { static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) +/* Used for individual buffers (after the counter) */ +#define RB_BUFFER_OFF (1 << 20) -/** - * tracing_on - enable all tracing buffers - * - * This function enables all tracing buffers that may have been - * disabled with tracing_off. - */ -void tracing_on(void) -{ - set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_on); - -/** - * tracing_off - turn off all tracing buffers - * - * This function stops all tracing buffers from recording data. - * It does not disable any overhead the tracers themselves may - * be causing. This function simply causes all recording to - * the ring buffers to fail. - */ -void tracing_off(void) -{ - clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_off); +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) /** * tracing_off_permanent - permanently disable ring buffers @@ -193,21 +176,12 @@ void tracing_off_permanent(void) set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } -/** - * tracing_is_on - show state of ring buffers enabled - */ -int tracing_is_on(void) -{ - return ring_buffer_flags == RB_BUFFERS_ON; -} -EXPORT_SYMBOL_GPL(tracing_is_on); - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS # define RB_FORCE_8BYTE_ALIGNMENT 0 # define RB_ARCH_ALIGNMENT RB_ALIGNMENT #else @@ -215,6 +189,8 @@ EXPORT_SYMBOL_GPL(tracing_is_on); # define RB_ARCH_ALIGNMENT 8U #endif +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -363,7 +339,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ - unsigned char data[]; /* data of buffer page */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; /* @@ -471,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) return ret; } +struct rb_irq_work { + struct irq_work work; + wait_queue_head_t waiters; + bool waiters_pending; +}; + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -481,6 +463,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + unsigned int nr_pages; struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ @@ -489,22 +472,30 @@ struct ring_buffer_per_cpu { unsigned long lost_events; unsigned long last_overrun; local_t entries_bytes; - local_t commit_overrun; - local_t overrun; local_t entries; + local_t overrun; + local_t commit_overrun; + local_t dropped_events; local_t committing; local_t commits; unsigned long read; unsigned long read_bytes; u64 write_stamp; u64 read_stamp; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ + int nr_pages_to_update; + struct list_head new_pages; /* new pages to add */ + struct work_struct update_pages_work; + struct completion update_done; + + struct rb_irq_work irq_work; }; struct ring_buffer { - unsigned pages; unsigned flags; int cpus; atomic_t record_disabled; + atomic_t resize_disabled; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; @@ -517,6 +508,8 @@ struct ring_buffer { struct notifier_block cpu_notify; #endif u64 (*clock)(void); + + struct rb_irq_work irq_work; }; struct ring_buffer_iter { @@ -528,6 +521,120 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ + struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + + wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +int ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + DEFINE_WAIT(wait); + struct rb_irq_work *work; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) + schedule(); + + finish_wait(&work->waiters, &wait); + return 0; +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + work->waiters_pending = true; + poll_wait(filp, &work->waiters, poll_table); + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + return 0; +} + /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ @@ -958,7 +1065,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, } /** - * check_pages - integrity check of buffer pages + * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not @@ -969,6 +1076,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + /* Reset the head page if it exists */ + if (cpu_buffer->head_page) + rb_set_head_page(cpu_buffer); + rb_head_page_deactivate(cpu_buffer); if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -995,14 +1106,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, - unsigned nr_pages) +static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) { + int i; struct buffer_page *bpage, *tmp; - LIST_HEAD(pages); - unsigned i; - - WARN_ON(!nr_pages); for (i = 0; i < nr_pages; i++) { struct page *page; @@ -1013,15 +1120,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL | __GFP_NORETRY, - cpu_to_node(cpu_buffer->cpu)); + cpu_to_node(cpu)); if (!bpage) goto free_pages; - rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, pages); - list_add(&bpage->list, &pages); - - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; @@ -1029,6 +1134,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } + return 0; + +free_pages: + list_for_each_entry_safe(bpage, tmp, pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + + return -ENOMEM; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + LIST_HEAD(pages); + + WARN_ON(!nr_pages); + + if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) + return -ENOMEM; + /* * The ring buffer page list is a circular list that does not * start and end with a list head. All page list items point to @@ -1037,20 +1163,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->pages = pages.next; list_del(&pages); + cpu_buffer->nr_pages = nr_pages; + rb_check_pages(cpu_buffer); return 0; - - free_pages: - list_for_each_entry_safe(bpage, tmp, &pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - return -ENOMEM; } static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; @@ -1067,6 +1188,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); + init_completion(&cpu_buffer->update_done); + init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&cpu_buffer->irq_work.waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1083,8 +1208,9 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); - ret = rb_allocate_pages(cpu_buffer, buffer->pages); + ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) goto fail_free_reader; @@ -1131,7 +1257,7 @@ static int rb_cpu_notify(struct notifier_block *self, #endif /** - * ring_buffer_alloc - allocate a new ring_buffer + * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @@ -1145,7 +1271,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, { struct ring_buffer *buffer; int bsize; - int cpu; + int cpu, nr_pages; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1156,14 +1282,17 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; + init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&buffer->irq_work.waiters); + /* need at least two pages */ - if (buffer->pages < 2) - buffer->pages = 2; + if (nr_pages < 2) + nr_pages = 2; /* * In case of non-hotplug cpu, if the ring-buffer is allocated @@ -1171,7 +1300,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, * In that off case, we need to allocate for all possible cpus. */ #ifdef CONFIG_HOTPLUG_CPU - get_online_cpus(); + cpu_notifier_register_begin(); cpumask_copy(buffer->cpumask, cpu_online_mask); #else cpumask_copy(buffer->cpumask, cpu_possible_mask); @@ -1186,7 +1315,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, for_each_buffer_cpu(buffer, cpu) { buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, cpu); + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) goto fail_free_buffers; } @@ -1194,10 +1323,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, #ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; - register_cpu_notifier(&buffer->cpu_notify); + __register_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_done(); #endif - put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -1211,7 +1340,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif fail_free_buffer: kfree(buffer); @@ -1228,16 +1359,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; - get_online_cpus(); - #ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&buffer->cpu_notify); #endif for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); @@ -1254,77 +1386,241 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); -static void -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +static inline unsigned long rb_page_entries(struct buffer_page *bpage) { - struct buffer_page *bpage; - struct list_head *p; - unsigned i; + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + +static inline unsigned long rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write) & RB_WRITE_MASK; +} + +static int +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +{ + struct list_head *tail_page, *to_remove, *next_page; + struct buffer_page *to_remove_page, *tmp_iter_page; + struct buffer_page *last_page, *first_page; + unsigned int nr_removed; + unsigned long head_bit; + int page_entries; + + head_bit = 0; raw_spin_lock_irq(&cpu_buffer->reader_lock); - rb_head_page_deactivate(cpu_buffer); + atomic_inc(&cpu_buffer->record_disabled); + /* + * We don't race with the readers since we have acquired the reader + * lock. We also don't race with writers after disabling recording. + * This makes it easy to figure out the first and the last page to be + * removed from the list. We unlink all the pages in between including + * the first and last pages. This is done in a busy loop so that we + * lose the least number of traces. + * The pages are freed after we restart recording and unlock readers. + */ + tail_page = &cpu_buffer->tail_page->list; - for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - goto out; - p = cpu_buffer->pages->next; - bpage = list_entry(p, struct buffer_page, list); - list_del_init(&bpage->list); - free_buffer_page(bpage); + /* + * tail page might be on reader page, we remove the next page + * from the ring buffer + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + tail_page = rb_list_head(tail_page->next); + to_remove = tail_page; + + /* start of pages to remove */ + first_page = list_entry(rb_list_head(to_remove->next), + struct buffer_page, list); + + for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { + to_remove = rb_list_head(to_remove)->next; + head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } - if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - goto out; - rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); + next_page = rb_list_head(to_remove)->next; -out: + /* + * Now we remove all pages between tail_page and next_page. + * Make sure that we have head_bit value preserved for the + * next page + */ + tail_page->next = (struct list_head *)((unsigned long)next_page | + head_bit); + next_page = rb_list_head(next_page); + next_page->prev = tail_page; + + /* make sure pages points to a valid page in the ring buffer */ + cpu_buffer->pages = next_page; + + /* update head page */ + if (head_bit) + cpu_buffer->head_page = list_entry(next_page, + struct buffer_page, list); + + /* + * change read pointer to make sure any read iterators reset + * themselves + */ + cpu_buffer->read = 0; + + /* pages are removed, resume tracing and then free the pages */ + atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); + + /* last buffer page to remove */ + last_page = list_entry(rb_list_head(to_remove), struct buffer_page, + list); + tmp_iter_page = first_page; + + do { + to_remove_page = tmp_iter_page; + rb_inc_page(cpu_buffer, &tmp_iter_page); + + /* update the counters */ + page_entries = rb_page_entries(to_remove_page); + if (page_entries) { + /* + * If something was added to this page, it was full + * since it is not the tail page. So we deduct the + * bytes consumed in ring buffer from here. + * Increment overrun to account for the lost events. + */ + local_add(page_entries, &cpu_buffer->overrun); + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + } + + /* + * We have already removed references to this list item, just + * free up the buffer_page and its page + */ + free_buffer_page(to_remove_page); + nr_removed--; + + } while (to_remove_page != last_page); + + RB_WARN_ON(cpu_buffer, nr_removed); + + return nr_removed == 0; } -static void -rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, - struct list_head *pages, unsigned nr_pages) +static int +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct buffer_page *bpage; - struct list_head *p; - unsigned i; + struct list_head *pages = &cpu_buffer->new_pages; + int retries, success; raw_spin_lock_irq(&cpu_buffer->reader_lock); - rb_head_page_deactivate(cpu_buffer); + /* + * We are holding the reader lock, so the reader page won't be swapped + * in the ring buffer. Now we are racing with the writer trying to + * move head page and the tail page. + * We are going to adapt the reader page update process where: + * 1. We first splice the start and end of list of new pages between + * the head page and its previous page. + * 2. We cmpxchg the prev_page->next to point from head page to the + * start of new pages list. + * 3. Finally, we update the head->prev to the end of new list. + * + * We will try this process 10 times, to make sure that we don't keep + * spinning. + */ + retries = 10; + success = 0; + while (retries--) { + struct list_head *head_page, *prev_page, *r; + struct list_head *last_page, *first_page; + struct list_head *head_page_with_bit; - for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - goto out; - p = pages->next; - bpage = list_entry(p, struct buffer_page, list); - list_del_init(&bpage->list); - list_add_tail(&bpage->list, cpu_buffer->pages); + head_page = &rb_set_head_page(cpu_buffer)->list; + if (!head_page) + break; + prev_page = head_page->prev; + + first_page = pages->next; + last_page = pages->prev; + + head_page_with_bit = (struct list_head *) + ((unsigned long)head_page | RB_PAGE_HEAD); + + last_page->next = head_page_with_bit; + first_page->prev = prev_page; + + r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); + + if (r == head_page_with_bit) { + /* + * yay, we replaced the page pointer to our new list, + * now, we just have to update to head page's prev + * pointer to point to end of list + */ + head_page->prev = last_page; + success = 1; + break; + } } - rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); -out: + if (success) + INIT_LIST_HEAD(pages); + /* + * If we weren't successful in adding in new pages, warn and stop + * tracing + */ + RB_WARN_ON(cpu_buffer, !success); raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + /* free pages if they weren't inserted */ + if (!success) { + struct buffer_page *bpage, *tmp; + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + return success; +} + +static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + int success; + + if (cpu_buffer->nr_pages_to_update > 0) + success = rb_insert_pages(cpu_buffer); + else + success = rb_remove_pages(cpu_buffer, + -cpu_buffer->nr_pages_to_update); + + if (success) + cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; +} + +static void update_pages_handler(struct work_struct *work) +{ + struct ring_buffer_per_cpu *cpu_buffer = container_of(work, + struct ring_buffer_per_cpu, update_pages_work); + rb_update_pages(cpu_buffer); + complete(&cpu_buffer->update_done); } /** * ring_buffer_resize - resize the ring buffer * @buffer: the buffer to resize. * @size: the new size. + * @cpu_id: the cpu buffer to resize * * Minimum size is 2 * BUF_PAGE_SIZE. * - * Returns -1 on failure. + * Returns 0 on success and < 0 on failure. */ -int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, + int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *bpage, *tmp; - unsigned long buffer_size; - LIST_HEAD(pages); - int i, cpu; + unsigned nr_pages; + int cpu, err = 0; /* * Always succeed at resizing a non-existent buffer: @@ -1332,115 +1628,186 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (!buffer) return size; + /* Make sure the requested buffer exists */ + if (cpu_id != RING_BUFFER_ALL_CPUS && + !cpumask_test_cpu(cpu_id, buffer->cpumask)) + return size; + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); size *= BUF_PAGE_SIZE; - buffer_size = buffer->pages * BUF_PAGE_SIZE; /* we need a minimum of two pages */ if (size < BUF_PAGE_SIZE * 2) size = BUF_PAGE_SIZE * 2; - if (size == buffer_size) - return size; - - atomic_inc(&buffer->record_disabled); + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - /* Make sure all writers are done with this buffer. */ - synchronize_sched(); + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. + */ + if (atomic_read(&buffer->resize_disabled)) + return -EBUSY; + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); - get_online_cpus(); - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - - if (size < buffer_size) { + if (cpu_id == RING_BUFFER_ALL_CPUS) { + /* calculate the pages to update */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; - /* easy case, just free pages */ - if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) - goto out_fail; + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + /* + * nothing more to do for removing pages or no update + */ + if (cpu_buffer->nr_pages_to_update <= 0) + continue; + /* + * to add pages, make sure all new pages can be + * allocated without receiving ENOMEM + */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages, cpu)) { + /* not enough memory for new pages */ + err = -ENOMEM; + goto out_err; + } + } - rm_pages = buffer->pages - nr_pages; + get_online_cpus(); + /* + * Fire off all the required work handlers + * We can't schedule on offline CPUs, but it's not necessary + * since we can change their buffer sizes without any race. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!cpu_buffer->nr_pages_to_update) + continue; + + /* The update must run on the CPU that is being updated. */ + preempt_disable(); + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rb_update_pages(cpu_buffer); + cpu_buffer->nr_pages_to_update = 0; + } else { + /* + * Can not disable preemption for schedule_work_on() + * on PREEMPT_RT. + */ + preempt_enable(); + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + preempt_disable(); + } + preempt_enable(); + } + /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - rb_remove_pages(cpu_buffer, rm_pages); + if (!cpu_buffer->nr_pages_to_update) + continue; + + if (cpu_online(cpu)) + wait_for_completion(&cpu_buffer->update_done); + cpu_buffer->nr_pages_to_update = 0; } - goto out; - } - /* - * This is a bit more difficult. We only want to add pages - * when we can allocate enough for all CPUs. We do this - * by allocating all the pages and storing them on a local - * link list. If we succeed in our allocation, then we - * add these pages to the cpu_buffers. Otherwise we just free - * them all and return -ENOMEM; - */ - if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) - goto out_fail; + put_online_cpus(); + } else { + /* Make sure this CPU has been intitialized */ + if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) + goto out; - new_pages = nr_pages - buffer->pages; + cpu_buffer = buffer->buffers[cpu_id]; - for_each_buffer_cpu(buffer, cpu) { - for (i = 0; i < new_pages; i++) { - struct page *page; + if (nr_pages == cpu_buffer->nr_pages) + goto out; + + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (cpu_buffer->nr_pages_to_update > 0 && + __rb_allocate_pages(cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages, cpu_id)) { + err = -ENOMEM; + goto out_err; + } + + get_online_cpus(); + + preempt_disable(); + /* The update must run on the CPU that is being updated. */ + if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) + rb_update_pages(cpu_buffer); + else { /* - * __GFP_NORETRY flag makes sure that the allocation - * fails gracefully without invoking oom-killer and - * the system is not destabilized. + * Can not disable preemption for schedule_work_on() + * on PREEMPT_RT. */ - bpage = kzalloc_node(ALIGN(sizeof(*bpage), - cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, - cpu_to_node(cpu)); - if (!bpage) - goto free_pages; - list_add(&bpage->list, &pages); - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); + preempt_enable(); + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + preempt_disable(); } - } + preempt_enable(); - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - rb_insert_pages(cpu_buffer, &pages, new_pages); + cpu_buffer->nr_pages_to_update = 0; + put_online_cpus(); } - if (RB_WARN_ON(buffer, !list_empty(&pages))) - goto out_fail; - out: - buffer->pages = nr_pages; - put_online_cpus(); + /* + * The ring buffer resize can happen with the ring buffer + * enabled, so that the update disturbs the tracing as little + * as possible. But if the buffer is disabled, we do not need + * to worry about that, and we can take the time to verify + * that the buffer is not corrupt. + */ + if (atomic_read(&buffer->record_disabled)) { + atomic_inc(&buffer->record_disabled); + /* + * Even though the buffer was disabled, we must make sure + * that it is truly disabled before calling rb_check_pages. + * There could have been a race between checking + * record_disable and incrementing it. + */ + synchronize_sched(); + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_check_pages(cpu_buffer); + } + atomic_dec(&buffer->record_disabled); + } + mutex_unlock(&buffer->mutex); + return size; - atomic_dec(&buffer->record_disabled); + out_err: + for_each_buffer_cpu(buffer, cpu) { + struct buffer_page *bpage, *tmp; - return size; + cpu_buffer = buffer->buffers[cpu]; + cpu_buffer->nr_pages_to_update = 0; - free_pages: - list_for_each_entry_safe(bpage, tmp, &pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - put_online_cpus(); - mutex_unlock(&buffer->mutex); - atomic_dec(&buffer->record_disabled); - return -ENOMEM; + if (list_empty(&cpu_buffer->new_pages)) + continue; - /* - * Something went totally wrong, and we are too paranoid - * to even clean up the mess. - */ - out_fail: - put_online_cpus(); + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } mutex_unlock(&buffer->mutex); - atomic_dec(&buffer->record_disabled); - return -1; + return err; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -1479,21 +1846,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned long rb_page_write(struct buffer_page *bpage) -{ - return local_read(&bpage->write) & RB_WRITE_MASK; -} - static inline unsigned rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } -static inline unsigned long rb_page_entries(struct buffer_page *bpage) -{ - return local_read(&bpage->entries) & RB_WRITE_MASK; -} - /* Size is determined by what has been committed */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -1542,7 +1899,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ again: - max_count = cpu_buffer->buffer->pages * 100; + max_count = cpu_buffer->nr_pages * 100; while (cpu_buffer->commit_page != cpu_buffer->tail_page) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) @@ -1626,7 +1983,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) } /** - * ring_buffer_update_event - update event type and data + * rb_update_event - update event type and data * @event: the even to update * @type: the type of event * @length: the size of the event field in the ring buffer @@ -1961,8 +2318,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * If we are not in overwrite mode, * this is easy, just stop here. */ - if (!(buffer->flags & RB_FL_OVERWRITE)) + if (!(buffer->flags & RB_FL_OVERWRITE)) { + local_inc(&cpu_buffer->dropped_events); goto out_reset; + } ret = rb_handle_head_page(cpu_buffer, tail_page, @@ -2040,6 +2399,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - length; + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + delta = 0; + /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, @@ -2201,7 +2567,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (unlikely(test_time_stamp(delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable; + local_clock_stable = sched_clock_stable(); #endif WARN_ONCE(delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", @@ -2233,41 +2599,76 @@ rb_reserve_next_event(struct ring_buffer *buffer, #ifdef CONFIG_TRACING -#define TRACE_RECURSIVE_DEPTH 16 +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ +static DEFINE_PER_CPU(unsigned int, current_context); -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) +static __always_inline int trace_recursive_lock(void) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); + unsigned int val = this_cpu_read(current_context); + int bit; - printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - trace_recursion_buffer(), - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); - - WARN_ON_ONCE(1); -} - -static inline int trace_recursive_lock(void) -{ - trace_recursion_inc(); + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; - if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) - return 0; + if (unlikely(val & (1 << bit))) + return 1; - trace_recursive_fail(); + val |= (1 << bit); + this_cpu_write(current_context, val); - return -1; + return 0; } -static inline void trace_recursive_unlock(void) +static __always_inline void trace_recursive_unlock(void) { - WARN_ON_ONCE(!trace_recursion_buffer()); + unsigned int val = this_cpu_read(current_context); - trace_recursion_dec(); + val--; + val &= this_cpu_read(current_context); + this_cpu_write(current_context, val); } #else @@ -2375,6 +2776,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_end_commit(cpu_buffer); } +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to @@ -2394,6 +2811,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + trace_recursive_unlock(); preempt_enable_notrace(); @@ -2526,8 +2945,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); * and not the length of the event which would hold the header. */ int ring_buffer_write(struct ring_buffer *buffer, - unsigned long length, - void *data) + unsigned long length, + void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -2566,6 +2985,8 @@ int ring_buffer_write(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + ret = 0; out: preempt_enable_notrace(); @@ -2619,6 +3040,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_record_enable); /** + * ring_buffer_record_off - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * This is different than ring_buffer_record_disable() as + * it works like an on/off switch, where as the disable() version + * must be paired with a enable(). + */ +void ring_buffer_record_off(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd | RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_off); + +/** + * ring_buffer_record_on - restart writes into the buffer + * @buffer: The ring buffer to start writes to. + * + * This enables all writes to the buffer that was disabled by + * ring_buffer_record_off(). + * + * This is different than ring_buffer_record_enable() as + * it works like an on/off switch, where as the enable() version + * must be paired with a disable(). + */ +void ring_buffer_record_on(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd & ~RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_on); + +/** + * ring_buffer_record_is_on - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * + * Returns true if the ring buffer is in a state that it accepts writes. + */ +int ring_buffer_record_is_on(struct ring_buffer *buffer) +{ + return !atomic_read(&buffer->record_disabled); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop @@ -2678,12 +3156,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ -unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) { unsigned long flags; struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; - unsigned long ret; + u64 ret = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; @@ -2698,7 +3176,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) bpage = cpu_buffer->reader_page; else bpage = rb_set_head_page(cpu_buffer); - ret = bpage->page->time_stamp; + if (bpage) + ret = bpage->page->time_stamp; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return ret; @@ -2744,7 +3223,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); /** - * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * ring_buffer_overrun_cpu - get the number of overruns caused by the ring + * buffer wrapping around (only if RB_FL_OVERWRITE is on). * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ @@ -2764,7 +3244,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** - * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by + * commits failing due to the buffer wrapping around while there are uncommitted + * events, such as during an interrupt storm. * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from */ @@ -2785,6 +3267,46 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); /** + * ring_buffer_dropped_events_cpu - get the number of dropped events caused by + * the ring buffer filling up (only if RB_FL_OVERWRITE is off). + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->dropped_events); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); + +/** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + +/** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer * @@ -2992,6 +3514,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + /* * Reset the reader page to size zero. */ @@ -3005,6 +3531,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); + if (!reader) + goto out; cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; @@ -3137,7 +3665,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) /* check for end of page padding */ if ((iter->head >= rb_page_size(iter->head_page)) && (iter->head_page != cpu_buffer->commit_page)) - rb_advance_iter(iter); + rb_inc_iter(iter); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) @@ -3438,11 +3966,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * expected. * * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_prepare_sync. + * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going * for real. * - * This overall must be paired with ring_buffer_finish. + * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) @@ -3461,6 +3989,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) iter->cpu_buffer = cpu_buffer; + atomic_inc(&buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); return iter; @@ -3490,7 +4019,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); * an intervening ring_buffer_read_prepare_sync must have been * performed. * - * Must be paired with ring_buffer_finish. + * Must be paired with ring_buffer_read_finish. */ void ring_buffer_read_start(struct ring_buffer_iter *iter) @@ -3512,7 +4041,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** - * ring_buffer_finish - finish reading the iterator of the buffer + * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * * This re-enables the recording to the buffer, and frees the @@ -3522,8 +4051,20 @@ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + + /* + * Ring buffer is disabled from recording, here's a good place + * to check the integrity of the ring buffer. + * Must prevent readers from trying to read, as the check + * clears the HEAD page and readers require it. + */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_check_pages(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->buffer->resize_disabled); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); @@ -3563,9 +4104,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. */ -unsigned long ring_buffer_size(struct ring_buffer *buffer) +unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) { - return BUF_PAGE_SIZE * buffer->pages; + /* + * Earlier, this method returned + * BUF_PAGE_SIZE * buffer->nr_pages + * Since the nr_pages field is now removed, we have converted this to + * return the per cpu buffer value. + */ + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); @@ -3586,14 +4136,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - local_set(&cpu_buffer->commit_overrun, 0); local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->dropped_events, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); @@ -3622,8 +4174,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); + /* Make sure all commits have finished */ + synchronize_sched(); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) @@ -3639,6 +4195,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&buffer->resize_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -3740,8 +4297,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->pages != buffer_b->pages) + if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; ret = -EAGAIN; @@ -3755,9 +4315,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, if (atomic_read(&buffer_b->record_disabled)) goto out; - cpu_buffer_a = buffer_a->buffers[cpu]; - cpu_buffer_b = buffer_b->buffers[cpu]; - if (atomic_read(&cpu_buffer_a->record_disabled)) goto out; @@ -3799,6 +4356,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. + * @cpu: the cpu buffer to allocate. * * This function is used in conjunction with ring_buffer_read_page. * When reading a full page from the ring buffer, these functions @@ -3856,7 +4414,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * to swap with a page in the ring buffer. * * for example: - * rpage = ring_buffer_alloc_read_page(buffer); + * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (!rpage) * return error; * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); @@ -4039,68 +4597,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_TRACING -static ssize_t -rb_simple_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - char buf[64]; - int r; - - if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) - r = sprintf(buf, "permanently disabled\n"); - else - r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -rb_simple_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - if (val) - set_bit(RB_BUFFERS_ON_BIT, p); - else - clear_bit(RB_BUFFERS_ON_BIT, p); - - (*ppos)++; - - return cnt; -} - -static const struct file_operations rb_simple_fops = { - .open = tracing_open_generic, - .read = rb_simple_read, - .write = rb_simple_write, - .llseek = default_llseek, -}; - - -static __init int rb_init_debugfs(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - - return 0; -} - -fs_initcall(rb_init_debugfs); -#endif - #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) @@ -4108,6 +4604,8 @@ static int rb_cpu_notify(struct notifier_block *self, struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); long cpu = (long)hcpu; + int cpu_i, nr_pages_same; + unsigned int nr_pages; switch (action) { case CPU_UP_PREPARE: @@ -4115,8 +4613,23 @@ static int rb_cpu_notify(struct notifier_block *self, if (cpumask_test_cpu(cpu, buffer->cpumask)) return NOTIFY_OK; + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; + } + } + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, cpu); + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) { WARN(1, "failed to allocate ring buffer on CPU %ld\n", cpu); @@ -4139,3 +4652,320 @@ static int rb_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } #endif + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. + */ +static struct task_struct *rb_threads[NR_CPUS] __initdata; + +struct rb_test_data { + struct ring_buffer *buffer; + unsigned long events; + unsigned long bytes_written; + unsigned long bytes_alloc; + unsigned long bytes_dropped; + unsigned long events_nested; + unsigned long bytes_written_nested; + unsigned long bytes_alloc_nested; + unsigned long bytes_dropped_nested; + int min_size_nested; + int max_size_nested; + int max_size; + int min_size; + int cpu; + int cnt; +}; + +static struct rb_test_data rb_data[NR_CPUS] __initdata; + +/* 1 meg per cpu */ +#define RB_TEST_BUFFER_SIZE 1048576 + +static char rb_string[] __initdata = + "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" + "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" + "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; + +static bool rb_test_started __initdata; + +struct rb_item { + int size; + char str[]; +}; + +static __init int rb_write_something(struct rb_test_data *data, bool nested) +{ + struct ring_buffer_event *event; + struct rb_item *item; + bool started; + int event_len; + int size; + int len; + int cnt; + + /* Have nested writes different that what is written */ + cnt = data->cnt + (nested ? 27 : 0); + + /* Multiply cnt by ~e, to make some unique increment */ + size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + + len = size + sizeof(struct rb_item); + + started = rb_test_started; + /* read rb_test_started before checking buffer enabled */ + smp_rmb(); + + event = ring_buffer_lock_reserve(data->buffer, len); + if (!event) { + /* Ignore dropped events before test starts. */ + if (started) { + if (nested) + data->bytes_dropped += len; + else + data->bytes_dropped_nested += len; + } + return len; + } + + event_len = ring_buffer_event_length(event); + + if (RB_WARN_ON(data->buffer, event_len < len)) + goto out; + + item = ring_buffer_event_data(event); + item->size = size; + memcpy(item->str, rb_string, size); + + if (nested) { + data->bytes_alloc_nested += event_len; + data->bytes_written_nested += len; + data->events_nested++; + if (!data->min_size_nested || len < data->min_size_nested) + data->min_size_nested = len; + if (len > data->max_size_nested) + data->max_size_nested = len; + } else { + data->bytes_alloc += event_len; + data->bytes_written += len; + data->events++; + if (!data->min_size || len < data->min_size) + data->max_size = len; + if (len > data->max_size) + data->max_size = len; + } + + out: + ring_buffer_unlock_commit(data->buffer, event); + + return 0; +} + +static __init int rb_test(void *arg) +{ + struct rb_test_data *data = arg; + + while (!kthread_should_stop()) { + rb_write_something(data, false); + data->cnt++; + + set_current_state(TASK_INTERRUPTIBLE); + /* Now sleep between a min of 100-300us and a max of 1ms */ + usleep_range(((data->cnt % 3) + 1) * 100, 1000); + } + + return 0; +} + +static __init void rb_ipi(void *ignore) +{ + struct rb_test_data *data; + int cpu = smp_processor_id(); + + data = &rb_data[cpu]; + rb_write_something(data, true); +} + +static __init int rb_hammer_test(void *arg) +{ + while (!kthread_should_stop()) { + + /* Send an IPI to all cpus to write data! */ + smp_call_function(rb_ipi, NULL, 1); + /* No sleep, but for non preempt, let others run */ + schedule(); + } + + return 0; +} + +static __init int test_ringbuffer(void) +{ + struct task_struct *rb_hammer; + struct ring_buffer *buffer; + int cpu; + int ret = 0; + + pr_info("Running ring buffer tests...\n"); + + buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); + if (WARN_ON(!buffer)) + return 0; + + /* Disable buffer so that threads can't write to it yet */ + ring_buffer_record_off(buffer); + + for_each_online_cpu(cpu) { + rb_data[cpu].buffer = buffer; + rb_data[cpu].cpu = cpu; + rb_data[cpu].cnt = cpu; + rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], + "rbtester/%d", cpu); + if (WARN_ON(!rb_threads[cpu])) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + kthread_bind(rb_threads[cpu], cpu); + wake_up_process(rb_threads[cpu]); + } + + /* Now create the rb hammer! */ + rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); + if (WARN_ON(!rb_hammer)) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + ring_buffer_record_on(buffer); + /* + * Show buffer is enabled before setting rb_test_started. + * Yes there's a small race window where events could be + * dropped and the thread wont catch it. But when a ring + * buffer gets enabled, there will always be some kind of + * delay before other CPUs see it. Thus, we don't care about + * those dropped events. We care about events dropped after + * the threads see that the buffer is active. + */ + smp_wmb(); + rb_test_started = true; + + set_current_state(TASK_INTERRUPTIBLE); + /* Just run for 10 seconds */; + schedule_timeout(10 * HZ); + + kthread_stop(rb_hammer); + + out_free: + for_each_online_cpu(cpu) { + if (!rb_threads[cpu]) + break; + kthread_stop(rb_threads[cpu]); + } + if (ret) { + ring_buffer_free(buffer); + return ret; + } + + /* Report! */ + pr_info("finished\n"); + for_each_online_cpu(cpu) { + struct ring_buffer_event *event; + struct rb_test_data *data = &rb_data[cpu]; + struct rb_item *item; + unsigned long total_events; + unsigned long total_dropped; + unsigned long total_written; + unsigned long total_alloc; + unsigned long total_read = 0; + unsigned long total_size = 0; + unsigned long total_len = 0; + unsigned long total_lost = 0; + unsigned long lost; + int big_event_size; + int small_event_size; + + ret = -1; + + total_events = data->events + data->events_nested; + total_written = data->bytes_written + data->bytes_written_nested; + total_alloc = data->bytes_alloc + data->bytes_alloc_nested; + total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + + big_event_size = data->max_size + data->max_size_nested; + small_event_size = data->min_size + data->min_size_nested; + + pr_info("CPU %d:\n", cpu); + pr_info(" events: %ld\n", total_events); + pr_info(" dropped bytes: %ld\n", total_dropped); + pr_info(" alloced bytes: %ld\n", total_alloc); + pr_info(" written bytes: %ld\n", total_written); + pr_info(" biggest event: %d\n", big_event_size); + pr_info(" smallest event: %d\n", small_event_size); + + if (RB_WARN_ON(buffer, total_dropped)) + break; + + ret = 0; + + while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { + total_lost += lost; + item = ring_buffer_event_data(event); + total_len += ring_buffer_event_length(event); + total_size += item->size + sizeof(struct rb_item); + if (memcmp(&item->str[0], rb_string, item->size) != 0) { + pr_info("FAILED!\n"); + pr_info("buffer had: %.*s\n", item->size, item->str); + pr_info("expected: %.*s\n", item->size, rb_string); + RB_WARN_ON(buffer, 1); + ret = -1; + break; + } + total_read++; + } + if (ret) + break; + + ret = -1; + + pr_info(" read events: %ld\n", total_read); + pr_info(" lost events: %ld\n", total_lost); + pr_info(" total events: %ld\n", total_lost + total_read); + pr_info(" recorded len bytes: %ld\n", total_len); + pr_info(" recorded size bytes: %ld\n", total_size); + if (total_lost) + pr_info(" With dropped events, record len and size may not match\n" + " alloced and written from above\n"); + if (!total_lost) { + if (RB_WARN_ON(buffer, total_len != total_alloc || + total_size != total_written)) + break; + } + if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) + break; + + ret = 0; + } + if (!ret) + pr_info("Ring buffer PASSED!\n"); + + ring_buffer_free(buffer); + return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a5457d577b9..0434ff1b808 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -40,8 +40,8 @@ static int write_iteration = 50; module_param(write_iteration, uint, 0644); MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); -static int producer_nice = 19; -static int consumer_nice = 19; +static int producer_nice = MAX_NICE; +static int consumer_nice = MAX_NICE; static int producer_fifo = -1; static int consumer_fifo = -1; @@ -308,7 +308,7 @@ static void ring_buffer_producer(void) /* Let the user know that the test is running at low priority */ if (producer_fifo < 0 && consumer_fifo < 0 && - producer_nice == 19 && consumer_nice == 19) + producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! This test is running at lowest priority.\n"); trace_printk("Time: %lld (usecs)\n", time); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f1bc5d2a0..291397e6666 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,7 +1,7 @@ /* * ring buffer based function tracer * - * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> * * Originally taken from the RT patch by: @@ -9,7 +9,7 @@ * * Based on code from the latency_tracer, that is: * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> #include <generated/utsrelease.h> @@ -36,7 +36,9 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> +#include <linux/nmi.h> #include <linux/fs.h> +#include <linux/sched/rt.h> #include "trace.h" #include "trace_output.h" @@ -45,7 +47,7 @@ * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -int ring_buffer_expanded; +bool ring_buffer_expanded; /* * We need to change this state when a selftest is running. @@ -71,12 +73,20 @@ static struct tracer_flags dummy_tracer_flags = { .opts = dummy_tracer_opt }; -static int dummy_set_flag(u32 old_flags, u32 bit, int set) +static int +dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return 0; } /* + * To prevent the comm cache from being overwritten when no + * tracing is active, only save the comm when a trace event + * occurred. + */ +static DEFINE_PER_CPU(bool, trace_cmdline_save); + +/* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization * of the tracer is successful. But that is the only place that sets @@ -86,18 +96,6 @@ static int tracing_disabled = 1; DEFINE_PER_CPU(int, ftrace_cpu_disabled); -static inline void ftrace_disable_cpu(void) -{ - preempt_disable(); - __this_cpu_inc(ftrace_cpu_disabled); -} - -static inline void ftrace_enable_cpu(void) -{ - __this_cpu_dec(ftrace_cpu_disabled); - preempt_enable(); -} - cpumask_var_t __read_mostly tracing_buffer_mask; /* @@ -118,18 +116,23 @@ cpumask_var_t __read_mostly tracing_buffer_mask; enum ftrace_dump_mode ftrace_dump_on_oops; -static int tracing_set_tracer(const char *buf); +/* When set, tracing will stop when a WARN*() is hit */ +int __disable_trace_on_warning; + +static int tracing_set_tracer(struct trace_array *tr, const char *buf); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; +static bool allocate_snapshot; + static int __init set_cmdline_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ - ring_buffer_expanded = 1; + ring_buffer_expanded = true; return 1; } __setup("ftrace=", set_cmdline_ftrace); @@ -150,6 +153,46 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init stop_trace_on_warning(char *str) +{ + __disable_trace_on_warning = 1; + return 1; +} +__setup("traceoff_on_warning=", stop_trace_on_warning); + +static int __init boot_alloc_snapshot(char *str) +{ + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + + +static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_options __initdata; + +static int __init set_trace_boot_options(char *str) +{ + strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + trace_boot_options = trace_boot_options_buf; + return 0; +} +__setup("trace_options=", set_trace_boot_options); + +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ + strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + trace_boot_clock = trace_boot_clock_buf; + return 0; +} +__setup("trace_clock=", set_trace_boot_clock); + + unsigned long long ns2usecs(cycle_t nsec) { nsec += 500; @@ -171,58 +214,104 @@ unsigned long long ns2usecs(cycle_t nsec) */ static struct trace_array global_trace; -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +LIST_HEAD(ftrace_trace_arrays); + +int trace_array_get(struct trace_array *this_tr) +{ + struct trace_array *tr; + int ret = -ENODEV; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + tr->ref++; + ret = 0; + break; + } + } + mutex_unlock(&trace_types_lock); + + return ret; +} -int filter_current_check_discard(struct ring_buffer *buffer, - struct ftrace_event_call *call, void *rec, - struct ring_buffer_event *event) +static void __trace_array_put(struct trace_array *this_tr) { - return filter_check_discard(call, rec, buffer, event); + WARN_ON(!this_tr->ref); + this_tr->ref--; } -EXPORT_SYMBOL_GPL(filter_current_check_discard); -cycle_t ftrace_now(int cpu) +void trace_array_put(struct trace_array *this_tr) +{ + mutex_lock(&trace_types_lock); + __trace_array_put(this_tr); + mutex_unlock(&trace_types_lock); +} + +int filter_check_discard(struct ftrace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && + !filter_match_preds(file->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(filter_check_discard); + +int call_filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(call_filter_check_discard); + +static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; /* Early boot up does not have a buffer yet */ - if (!global_trace.buffer) + if (!buf->buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(global_trace.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + ts = ring_buffer_time_stamp(buf->buffer, cpu); + ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); - -/* tracer_enabled is used to toggle activation of a tracer */ -static int tracer_enabled = 1; +cycle_t ftrace_now(int cpu) +{ + return buffer_ftrace_now(&global_trace.trace_buffer, cpu); +} /** - * tracing_is_enabled - return tracer_enabled status + * tracing_is_enabled - Show if global_trace has been disabled * - * This function is used by other tracers to know the status - * of the tracer_enabled flag. Tracers may use this function - * to know if it should enable their features when starting - * up. See irqsoff tracer for an example (start_irqsoff_tracer). + * Shows if the global trace has been enabled or not. It uses the + * mirror flag "buffer_disabled" to be used in fast paths such as for + * the irqsoff tracer. But it may be inaccurate due to races. If you + * need to know the accurate state, use tracing_is_on() which is a little + * slower, but accurate. */ int tracing_is_enabled(void) { - return tracer_enabled; + /* + * For quick access (irqsoff uses this in fast path), just + * return the mirror variable of the state of the ring buffer. + * It's a little racy, but we don't really care. + */ + smp_rmb(); + return !global_trace.buffer_disabled; } /* @@ -242,13 +331,10 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; -/* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly; - /* * trace_types_lock is used to protect the trace_types list. */ -static DEFINE_MUTEX(trace_types_lock); +DEFINE_MUTEX(trace_types_lock); /* * serialize the access of the ring buffer @@ -278,13 +364,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock); static inline void trace_access_lock(int cpu) { - if (cpu == TRACE_PIPE_ALL_CPU) { + if (cpu == RING_BUFFER_ALL_CPUS) { /* gain it for accessing the whole ring buffer. */ down_write(&all_cpu_access_lock); } else { /* gain it for accessing a cpu ring buffer. */ - /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ down_read(&all_cpu_access_lock); /* Secondly block other access to this @cpu ring buffer. */ @@ -294,7 +380,7 @@ static inline void trace_access_lock(int cpu) static inline void trace_access_unlock(int cpu) { - if (cpu == TRACE_PIPE_ALL_CPU) { + if (cpu == RING_BUFFER_ALL_CPUS) { up_write(&all_cpu_access_lock); } else { mutex_unlock(&per_cpu(cpu_access_lock, cpu)); @@ -332,41 +418,337 @@ static inline void trace_access_lock_init(void) #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); - /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO; + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; -static int trace_stop_count; -static DEFINE_RAW_SPINLOCK(tracing_start_lock); +static void tracer_tracing_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_on(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 0; + /* Make the flag seen by readers */ + smp_wmb(); +} -static void wakeup_work_handler(struct work_struct *work) +/** + * tracing_on - enable tracing buffers + * + * This function enables tracing buffers that may have been + * disabled with tracing_off. + */ +void tracing_on(void) { - wake_up(&trace_wait); + tracer_tracing_on(&global_trace); } +EXPORT_SYMBOL_GPL(tracing_on); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + int alloc; + int pc; -static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + pc = preempt_count(); + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 4, pc); + + return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct bputs_entry *entry; + unsigned long irq_flags; + int size = sizeof(struct bputs_entry); + int pc; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + pc = preempt_count(); + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 4, pc); + + return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT /** - * trace_wake_up - wake up tasks waiting for trace input + * trace_snapshot - take a snapshot of the current buffer. * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. */ -void trace_wake_up(void) +void tracing_snapshot(void) { - const unsigned long delay = msecs_to_jiffies(2); + struct trace_array *tr = &global_trace; + struct tracer *tracer = tr->current_trace; + unsigned long flags; - if (trace_flags & TRACE_ITER_BLOCK) + if (in_nmi()) { + internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + internal_trace_puts("*** snapshot is being ignored ***\n"); return; - schedule_delayed_work(&wakeup_work, delay); + } + + if (!tr->allocated_snapshot) { + internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); + internal_trace_puts("*** stopping trace here! ***\n"); + tracing_off(); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer->use_max_tr) { + internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); + internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id()); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ + int ret; + + if (!tr->allocated_snapshot) { + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; } +static void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); + tr->allocated_snapshot = false; +} + +/** + * tracing_alloc_snapshot - allocate snapshot buffer. + * + * This only allocates the snapshot buffer if it isn't already + * allocated - it doesn't also take a snapshot. + * + * This is meant to be used in cases where the snapshot buffer needs + * to be set up for events that can't sleep but need to be able to + * trigger a snapshot. + */ +int tracing_alloc_snapshot(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = alloc_snapshot(tr); + WARN_ON(ret < 0); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ + int ret; + + ret = tracing_alloc_snapshot(); + if (ret < 0) + return; + + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#else +void tracing_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +int tracing_alloc_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); +void tracing_snapshot_alloc(void) +{ + /* Give warning */ + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#endif /* CONFIG_TRACER_SNAPSHOT */ + +static void tracer_tracing_off(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_off(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 1; + /* Make the flag seen by readers */ + smp_wmb(); +} + +/** + * tracing_off - turn off tracing buffers + * + * This function stops the tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ + tracer_tracing_off(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_off); + +void disable_trace_on_warning(void) +{ + if (__disable_trace_on_warning) + tracing_off(); +} + +/** + * tracer_tracing_is_on - show real state of ring buffer enabled + * @tr : the trace array to know if ring buffer is enabled + * + * Shows real state of the ring buffer if it is enabled or not. + */ +static int tracer_tracing_is_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + return ring_buffer_record_is_on(tr->trace_buffer.buffer); + return !tr->buffer_disabled; +} + +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return tracer_tracing_is_on(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_is_on); + static int __init set_buf_size(char *str) { unsigned long buf_size; @@ -384,15 +766,15 @@ __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { - unsigned long threshhold; + unsigned long threshold; int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &threshhold); + ret = kstrtoul(str, 0, &threshold); if (ret < 0) return 0; - tracing_thresh = threshhold * 1000; + tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); @@ -428,20 +810,24 @@ static const char *trace_options[] = { "overwrite", "disable_on_free", "irq-info", + "markers", + "function-trace", NULL }; static struct { u64 (*func)(void); const char *name; + int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { - { trace_clock_local, "local" }, - { trace_clock_global, "global" }, - { trace_clock_counter, "counter" }, + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 0 }, + { trace_clock, "perf", 1 }, + ARCH_TRACE_CLOCKS }; -int trace_clock_id; - /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -536,9 +922,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, if (isspace(ch)) { parser->buffer[parser->idx] = 0; parser->cont = false; - } else { + } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + } else { + ret = -EINVAL; + goto out; } *ppos += read; @@ -575,7 +964,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - void *ret; if (s->len <= s->readpos) return -EBUSY; @@ -583,35 +971,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) len = s->len - s->readpos; if (cnt > len) cnt = len; - ret = memcpy(buf, s->buffer + s->readpos, cnt); - if (!ret) - return -EFAULT; + memcpy(buf, s->buffer + s->readpos, cnt); s->readpos += cnt; return cnt; } -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. - */ -static arch_spinlock_t ftrace_max_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly tracing_max_latency; - /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, @@ -620,20 +988,29 @@ unsigned long __read_mostly tracing_max_latency; static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data; + struct trace_buffer *trace_buf = &tr->trace_buffer; + struct trace_buffer *max_buf = &tr->max_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; - max_data = max_tr.data[cpu]; - max_data->saved_latency = tracing_max_latency; + max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; - max_data->uid = task_uid(tsk); + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. + */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; @@ -654,23 +1031,27 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf = tr->buffer; + struct ring_buffer *buf; - if (trace_stop_count) + if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } - arch_spin_lock(&ftrace_max_lock); - tr->buffer = max_tr.buffer; - max_tr.buffer = buf; + arch_spin_lock(&tr->max_lock); + + buf = tr->trace_buffer.buffer; + tr->trace_buffer.buffer = tr->max_buffer.buffer; + tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } /** @@ -686,20 +1067,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; - if (trace_stop_count) + if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); - ftrace_disable_cpu(); - - ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); if (ret == -EBUSY) { /* @@ -708,19 +1088,92 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * the max trace buffer (no one writes directly to it) * and flag that it failed. */ - trace_array_printk(&max_tr, _THIS_IP_, + trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit in progress\n"); } - ftrace_enable_cpu(); - WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ +static int wait_on_pipe(struct trace_iterator *iter) +{ + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return 0; + + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ + struct trace_array *tr = &global_trace; + struct tracer *saved_tracer = tr->current_trace; + int ret; + + if (!type->selftest || tracing_selftest_disabled) + return 0; + + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. + */ + tracing_reset_online_cpus(&tr->trace_buffer); + + tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + tr->allocated_snapshot = true; + } +#endif + + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + tr->current_trace = saved_tracer; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); + return -1; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + tr->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, 1, + RING_BUFFER_ALL_CPUS); + } +#endif + + printk(KERN_CONT "PASSED\n"); + return 0; +} +#else +static inline int run_tracer_selftest(struct tracer *type) +{ + return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + /** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer @@ -728,8 +1181,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * Register a new plugin tracer. */ int register_tracer(struct tracer *type) -__releases(kernel_lock) -__acquires(kernel_lock) { struct tracer *t; int ret = 0; @@ -765,49 +1216,10 @@ __acquires(kernel_lock) else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; - if (!type->wait_pipe) - type->wait_pipe = default_wait_pipe; - - -#ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest && !tracing_selftest_disabled) { - struct tracer *saved_tracer = current_trace; - struct trace_array *tr = &global_trace; - /* - * Run a selftest on this tracer. - * Here we reset the trace buffer, and set the current - * tracer to be this tracer. The tracer can then run some - * internal tracing to verify that everything is in order. - * If we fail, we do not register this tracer. - */ - tracing_reset_online_cpus(tr); - - current_trace = type; - - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, trace_buf_size); - - /* the test is responsible for initializing and enabling */ - pr_info("Testing tracer %s: ", type->name); - ret = type->selftest(type, tr); - /* the test is responsible for resetting too */ - current_trace = saved_tracer; - if (ret) { - printk(KERN_CONT "FAILED!\n"); - goto out; - } - /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(tr); - - /* Shrink the max buffer again */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, 1); - - printk(KERN_CONT "PASSED\n"); - } -#endif + ret = run_tracer_selftest(type); + if (ret < 0) + goto out; type->next = trace_types; trace_types = type; @@ -824,10 +1236,10 @@ __acquires(kernel_lock) printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(type->name); + tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = 1; + tracing_selftest_disabled = true; #ifdef CONFIG_FTRACE_STARTUP_TEST printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", type->name); @@ -837,116 +1249,126 @@ __acquires(kernel_lock) return ret; } -void unregister_tracer(struct tracer *type) +void tracing_reset(struct trace_buffer *buf, int cpu) { - struct tracer **t; + struct ring_buffer *buffer = buf->buffer; - mutex_lock(&trace_types_lock); - for (t = &trace_types; *t; t = &(*t)->next) { - if (*t == type) - goto found; - } - pr_info("Tracer %s not registered\n", type->name); - goto out; - - found: - *t = (*t)->next; - - if (type == current_trace && tracer_enabled) { - tracer_enabled = 0; - tracing_stop(); - if (current_trace->stop) - current_trace->stop(&global_trace); - current_trace = &nop_trace; - } -out: - mutex_unlock(&trace_types_lock); -} - -static void __tracing_reset(struct ring_buffer *buffer, int cpu) -{ - ftrace_disable_cpu(); - ring_buffer_reset_cpu(buffer, cpu); - ftrace_enable_cpu(); -} - -void tracing_reset(struct trace_array *tr, int cpu) -{ - struct ring_buffer *buffer = tr->buffer; + if (!buffer) + return; ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_sched(); - __tracing_reset(buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; int cpu; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ synchronize_sched(); - tr->time_start = ftrace_now(tr->cpu); + buf->time_start = buffer_ftrace_now(buf, buf->cpu); for_each_online_cpu(cpu) - __tracing_reset(buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } -void tracing_reset_current(int cpu) +/* Must have trace_types_lock held */ +void tracing_reset_all_online_cpus(void) { - tracing_reset(&global_trace, cpu); -} + struct trace_array *tr; -void tracing_reset_current_online_cpus(void) -{ - tracing_reset_online_cpus(&global_trace); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE + tracing_reset_online_cpus(&tr->max_buffer); +#endif + } } -#define SAVED_CMDLINES 128 +#define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; -static void trace_init_cmdlines(void) +static inline char *get_saved_cmdlines(int idx) { - memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); - cmdline_idx = 0; + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; } -int is_tracing_stopped(void) +static inline void set_cmdline(int idx, const char *cmdline) { - return trace_stop_count; + memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } -/** - * ftrace_off_permanent - disable all ftrace code permanently - * - * This should only be called when a serious anomally has - * been detected. This will turn off the function tracing, - * ring buffers, and other tracing utilites. It takes no - * locks and can be called from any context. - */ -void ftrace_off_permanent(void) +static int allocate_cmdlines_buffer(unsigned int val, + struct saved_cmdlines_buffer *s) +{ + s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); + if (!s->map_cmdline_to_pid) + return -ENOMEM; + + s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); + if (!s->saved_cmdlines) { + kfree(s->map_cmdline_to_pid); + return -ENOMEM; + } + + s->cmdline_idx = 0; + s->cmdline_num = val; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return 0; +} + +static int trace_create_savedcmd(void) +{ + int ret; + + savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); + if (!savedcmd) + return -ENOMEM; + + ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); + if (ret < 0) { + kfree(savedcmd); + savedcmd = NULL; + return -ENOMEM; + } + + return 0; +} + +int is_tracing_stopped(void) { - tracing_disabled = 1; - ftrace_stop(); - tracing_off_permanent(); + return global_trace.stop_count; } /** @@ -963,32 +1385,64 @@ void tracing_start(void) if (tracing_disabled) return; - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (--trace_stop_count) { - if (trace_stop_count < 0) { + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (--global_trace.stop_count) { + if (global_trace.stop_count < 0) { /* Someone screwed up their debugging */ WARN_ON_ONCE(1); - trace_stop_count = 0; + global_trace.stop_count = 0; } goto out; } /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); +#endif + + arch_spin_unlock(&global_trace.max_lock); + + out: + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; - arch_spin_unlock(&ftrace_max_lock); + /* If global, we need to also start the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_start(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + + if (--tr->stop_count) { + if (tr->stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + tr->stop_count = 0; + } + goto out; + } + + buffer = tr->trace_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); - ftrace_start(); out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -1002,36 +1456,58 @@ void tracing_stop(void) struct ring_buffer *buffer; unsigned long flags; - ftrace_stop(); - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (trace_stop_count++) + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (global_trace.stop_count++) goto out; /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); +#endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + /* If global, we need to also stop the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_stop(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) + goto out; + + buffer = tr->trace_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } void trace_stop_cmdline_recording(void); -static void trace_save_cmdline(struct task_struct *tsk) +static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) - return; + return 0; /* * It's not the end of the world if we don't get @@ -1040,11 +1516,11 @@ static void trace_save_cmdline(struct task_struct *tsk) * so if we miss here, then better luck next time. */ if (!arch_spin_trylock(&trace_cmdline_lock)) - return; + return 0; - idx = map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { - idx = (cmdline_idx + 1) % SAVED_CMDLINES; + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; /* * Check whether the cmdline buffer at idx has a pid @@ -1052,22 +1528,24 @@ static void trace_save_cmdline(struct task_struct *tsk) * need to clear the map_pid_to_cmdline. Otherwise we * would read the new comm for the old pid. */ - pid = map_cmdline_to_pid[idx]; + pid = savedcmd->map_cmdline_to_pid[idx]; if (pid != NO_CMDLINE_MAP) - map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - map_cmdline_to_pid[idx] = tsk->pid; - map_pid_to_cmdline[tsk->pid] = idx; + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - cmdline_idx = idx; + savedcmd->cmdline_idx = idx; } - memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); + + return 1; } -void trace_find_cmdline(int pid, char comm[]) +static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; @@ -1086,13 +1564,19 @@ void trace_find_cmdline(int pid, char comm[]) return; } - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - map = map_pid_to_cmdline[pid]; + map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - strcpy(comm, saved_cmdlines[map]); + strcpy(comm, get_saved_cmdlines(map)); else strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -1100,11 +1584,14 @@ void trace_find_cmdline(int pid, char comm[]) void tracing_record_cmdline(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || - !tracing_is_on()) + if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) return; - trace_save_cmdline(tsk); + if (!__this_cpu_read(trace_cmdline_save)) + return; + + if (trace_save_cmdline(tsk)) + __this_cpu_write(trace_cmdline_save, false); } void @@ -1115,7 +1602,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1124,7 +1610,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | + (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -1147,34 +1634,66 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, return event; } +void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + ring_buffer_unlock_commit(buffer, event); +} + static inline void __trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc, - int wake) + unsigned long flags, int pc) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); ftrace_trace_userstack(buffer, flags, pc); - - if (wake) - trace_wake_up(); } void trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); + +static struct ring_buffer *temp_buffer; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *entry; + + *current_rb = ftrace_file->tr->trace_buffer.buffer; + entry = trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer + * to store the trace event for the tigger to use. It's recusive + * safe and will not be recorded anywhere. + */ + if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { + *current_rb = temp_buffer; + entry = trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); + } + return entry; } +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, unsigned long flags, int pc) { - *current_rb = global_trace.buffer; + *current_rb = global_trace.trace_buffer.buffer; return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } @@ -1184,29 +1703,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc); } EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); -void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); -} -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); - -void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc, - struct pt_regs *regs) +void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs) { - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); +EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) @@ -1221,7 +1732,7 @@ trace_function(struct trace_array *tr, int pc) { struct ftrace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -1237,17 +1748,8 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); -} - -void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) -{ - if (likely(!atomic_read(&data->disabled))) - trace_function(tr, ip, parent_ip, flags, pc); + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); } #ifdef CONFIG_STACKTRACE @@ -1282,7 +1784,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = ++__get_cpu_var(ftrace_stack_reserve); + use_stack = __this_cpu_inc_return(ftrace_stack_reserve); /* * We don't need any atomic variables, just a barrier. * If an interrupt comes in, we don't care, because it would @@ -1292,7 +1794,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ barrier(); if (use_stack == 1) { - trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.entries = this_cpu_ptr(ftrace_stack.calls); trace.max_entries = FTRACE_STACK_MAX_ENTRIES; if (regs) @@ -1330,13 +1832,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, entry->size = trace.nr_entries; - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ barrier(); - __get_cpu_var(ftrace_stack_reserve)--; + __this_cpu_dec(ftrace_stack_reserve); preempt_enable_notrace(); } @@ -1362,13 +1864,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); } /** * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers) */ -void trace_dump_stack(void) +void trace_dump_stack(int skip) { unsigned long flags; @@ -1377,8 +1880,13 @@ void trace_dump_stack(void) local_save_flags(flags); - /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); + /* + * Skip 3 more, seems to get us at the caller of + * this function. + */ + skip += 3; + __ftrace_trace_stack(global_trace.trace_buffer.buffer, + flags, skip, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); @@ -1426,8 +1934,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) trace.entries = entry->caller; save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); @@ -1444,25 +1952,161 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ +/* created for use with alloc_percpu */ +struct trace_buffer_struct { + char buffer[TRACE_BUF_SIZE]; +}; + +static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct *trace_percpu_sirq_buffer; +static struct trace_buffer_struct *trace_percpu_irq_buffer; +static struct trace_buffer_struct *trace_percpu_nmi_buffer; + +/* + * The buffer used is dependent on the context. There is a per cpu + * buffer for normal context, softirq contex, hard irq context and + * for NMI context. Thise allows for lockless recording. + * + * Note, if the buffers failed to be allocated, then this returns NULL + */ +static char *get_trace_buf(void) +{ + struct trace_buffer_struct *percpu_buffer; + + /* + * If we have allocated per cpu buffers, then we do not + * need to do any locking. + */ + if (in_nmi()) + percpu_buffer = trace_percpu_nmi_buffer; + else if (in_irq()) + percpu_buffer = trace_percpu_irq_buffer; + else if (in_softirq()) + percpu_buffer = trace_percpu_sirq_buffer; + else + percpu_buffer = trace_percpu_buffer; + + if (!percpu_buffer) + return NULL; + + return this_cpu_ptr(&percpu_buffer->buffer[0]); +} + +static int alloc_percpu_trace_buffer(void) +{ + struct trace_buffer_struct *buffers; + struct trace_buffer_struct *sirq_buffers; + struct trace_buffer_struct *irq_buffers; + struct trace_buffer_struct *nmi_buffers; + + buffers = alloc_percpu(struct trace_buffer_struct); + if (!buffers) + goto err_warn; + + sirq_buffers = alloc_percpu(struct trace_buffer_struct); + if (!sirq_buffers) + goto err_sirq; + + irq_buffers = alloc_percpu(struct trace_buffer_struct); + if (!irq_buffers) + goto err_irq; + + nmi_buffers = alloc_percpu(struct trace_buffer_struct); + if (!nmi_buffers) + goto err_nmi; + + trace_percpu_buffer = buffers; + trace_percpu_sirq_buffer = sirq_buffers; + trace_percpu_irq_buffer = irq_buffers; + trace_percpu_nmi_buffer = nmi_buffers; + + return 0; + + err_nmi: + free_percpu(irq_buffers); + err_irq: + free_percpu(sirq_buffers); + err_sirq: + free_percpu(buffers); + err_warn: + WARN(1, "Could not allocate percpu trace_printk buffer"); + return -ENOMEM; +} + +static int buffers_allocated; + +void trace_printk_init_buffers(void) +{ + if (buffers_allocated) + return; + + if (alloc_percpu_trace_buffer()) + return; + + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warning("\n**********************************************************\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("** **\n"); + pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warning("** **\n"); + pr_warning("** This means that this is a DEBUG kernel and it is **\n"); + pr_warning("** unsafe for produciton use. **\n"); + pr_warning("** **\n"); + pr_warning("** If you see this message and you are not debugging **\n"); + pr_warning("** the kernel, report this immediately to your vendor! **\n"); + pr_warning("** **\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("**********************************************************\n"); + + /* Expand the buffers to set size */ + tracing_update_buffers(); + + buffers_allocated = 1; + + /* + * trace_printk_init_buffers() can be called by modules. + * If that happens, then we need to start cmdline recording + * directly here. If the global_trace.buffer is already + * allocated here, then this was called by module code. + */ + if (global_trace.trace_buffer.buffer) + tracing_start_cmdline_record(); +} + +void trace_printk_start_comm(void) +{ + /* Start tracing comms if trace printk is set */ + if (!buffers_allocated) + return; + tracing_start_cmdline_record(); +} + +static void trace_printk_start_stop_comm(int enabled) +{ + if (!buffers_allocated) + return; + + if (enabled) + tracing_start_cmdline_record(); + else + tracing_stop_cmdline_record(); +} + /** * trace_vbprintk - write binary msg to tracing buffer * */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - static arch_spinlock_t trace_buf_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - static u32 trace_buf[TRACE_BUF_SIZE]; - struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; - int disable; - int cpu, len = 0, size, pc; + char *tbuffer; + int len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -1472,43 +2116,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pc = preempt_count(); preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disable = atomic_inc_return(&data->disabled); - if (unlikely(disable != 1)) + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; goto out; + } - /* Lockdep uses trace_printk for lock tracing */ - local_irq_save(flags); - arch_spin_lock(&trace_buf_lock); - len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out; + local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) - goto out_unlock; + goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->fmt = fmt; - memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) { - ring_buffer_unlock_commit(buffer, event); + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } -out_unlock: - arch_spin_unlock(&trace_buf_lock); - local_irq_restore(flags); - out: - atomic_dec_return(&data->disabled); preempt_enable_notrace(); unpause_graph_tracing(); @@ -1516,80 +2153,95 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_array_vprintk(tr, ip, fmt, ap); - va_end(ap); - return ret; -} - -int trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { - static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; + int len = 0, size, pc; struct print_entry *entry; - unsigned long irq_flags; - int disable; + unsigned long flags; + char *tbuffer; if (tracing_disabled || tracing_selftest_running) return 0; + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + pc = preempt_count(); preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disable = atomic_inc_return(&data->disabled); - if (unlikely(disable != 1)) + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; goto out; + } - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - arch_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + if (len > TRACE_BUF_SIZE) + goto out; + local_save_flags(flags); size = sizeof(*entry) + len + 1; - buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, pc); + flags, pc); if (!event) - goto out_unlock; + goto out; entry = ring_buffer_event_data(event); entry->ip = ip; - memcpy(&entry->buf, trace_buf, len); + memcpy(&entry->buf, tbuffer, len); entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) { - ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 6, pc); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); } - - out_unlock: - arch_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); out: - atomic_dec_return(&data->disabled); preempt_enable_notrace(); + unpause_graph_tracing(); return len; } +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1598,14 +2250,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); iter->idx++; - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - - ftrace_enable_cpu(); + if (buf_iter) + ring_buffer_read(buf_iter, NULL); } static struct trace_entry * @@ -1613,19 +2262,14 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, lost_events); - ftrace_enable_cpu(); - if (event) { iter->ent_size = ring_buffer_event_length(event); return ring_buffer_event_data(event); @@ -1638,19 +2282,20 @@ static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { - struct ring_buffer *buffer = iter->tr->buffer; + struct ring_buffer *buffer = iter->trace_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; + int next_size = 0; int cpu; /* * If we are in a per_cpu trace file, don't bother by iterating over * all cpu and peek directly. */ - if (cpu_file > TRACE_PIPE_ALL_CPU) { + if (cpu_file > RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); @@ -1675,9 +2320,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, next_cpu = cpu; next_ts = ts; next_lost = lost_events; + next_size = iter->ent_size; } } + iter->ent_size = next_size; + if (ent_cpu) *ent_cpu = next_cpu; @@ -1711,11 +2359,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); - ftrace_enable_cpu(); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1747,18 +2392,17 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) void tracing_iter_reset(struct trace_iterator *iter, int cpu) { - struct trace_array *tr = iter->tr; struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; - tr->data[cpu]->skipped_entries = 0; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; - if (!iter->buffer_iter[cpu]) + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) return; - buf_iter = iter->buffer_iter[cpu]; ring_buffer_iter_reset(buf_iter); /* @@ -1767,13 +2411,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * by the timestamp being before the start of the buffer. */ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { - if (ts >= iter->tr->time_start) + if (ts >= iter->trace_buffer->time_start) break; entries++; ring_buffer_read(buf_iter, NULL); } - tr->data[cpu]->skipped_entries = entries; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; } /* @@ -1783,37 +2427,42 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; - static struct tracer *old_tracer; + struct trace_array *tr = iter->tr; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; - /* copy the tracer to avoid using a global lock all around */ + /* + * copy the tracer to avoid using a global lock all around. + * iter->trace is a copy of current_trace, the pointer to the + * name may be used instead of a strcmp(), as iter->trace->name + * will point to the same string as current_trace->name. + */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; - *iter->trace = *current_trace; - } + if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); - atomic_inc(&trace_record_cmdline_disabled); +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->trace->use_max_tr) + return ERR_PTR(-EBUSY); +#endif + + if (!iter->snapshot) + atomic_inc(&trace_record_cmdline_disabled); if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; iter->idx = -1; - ftrace_disable_cpu(); - - if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else tracing_iter_reset(iter, cpu_file); - ftrace_enable_cpu(); - iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -1840,13 +2489,21 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->trace->use_max_tr) + return; +#endif + + if (!iter->snapshot) + atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +get_total_entries(struct trace_buffer *buf, + unsigned long *total, unsigned long *entries) { unsigned long count; int cpu; @@ -1855,19 +2512,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); + count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; /* total is the same as the entries */ *total += count; } else *total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); + ring_buffer_overrun_cpu(buf->buffer, cpu); *entries += count; } } @@ -1884,27 +2541,27 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_event_info(struct trace_array *tr, struct seq_file *m) +static void print_event_info(struct trace_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", entries, total, num_online_cpus()); seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# _-----=> irqs-off\n"); seq_puts(m, "# / _----=> need-resched\n"); seq_puts(m, "# | / _---=> hardirq/softirq\n"); @@ -1918,17 +2575,16 @@ void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = tr->data[tr->cpu]; - struct tracer *type = current_trace; + struct trace_buffer *buf = iter->trace_buffer; + struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); + struct tracer *type = iter->trace; unsigned long entries; unsigned long total; const char *name = "preemption"; - if (type) - name = type->name; + name = type->name; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -1939,7 +2595,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) nsecs_to_usecs(data->saved_latency), entries, total, - tr->cpu, + buf->cpu, #if defined(CONFIG_PREEMPT_NONE) "server", #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -1959,7 +2615,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "# -----------------\n"); seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", - data->comm, data->pid, data->uid, data->nice, + data->comm, data->pid, + from_kuid_munged(seq_user_ns(m), data->uid), data->nice, data->policy, data->rt_priority); seq_puts(m, "# -----------------\n"); @@ -1989,7 +2646,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; - if (iter->tr->data[iter->cpu]->skipped_entries) + if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; cpumask_set_cpu(iter->cpu, iter->started); @@ -2108,27 +2765,30 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) int trace_empty(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ - if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { cpu = iter->cpu_file; - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } return 1; } for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } } @@ -2152,6 +2812,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPUTS && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bputs_msg_only(iter); + if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) @@ -2206,9 +2871,9 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->tr, m); + print_func_help_header_irq(iter->trace_buffer, m); else - print_func_help_header(iter->tr, m); + print_func_help_header(iter->trace_buffer, m); } } } @@ -2221,6 +2886,50 @@ static void test_ftrace_alive(struct seq_file *m) seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); } +#ifdef CONFIG_TRACER_MAX_TRACE +static void show_snapshot_main_help(struct seq_file *m) +{ + seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer.\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ + seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); +#else + seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); + seq_printf(m, "# Must use main snapshot file to allocate.\n"); +#endif + seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} + +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->tr->allocated_snapshot) + seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_printf(m, "# Snapshot commands:\n"); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + show_snapshot_main_help(m); + else + show_snapshot_percpu_help(m); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2232,7 +2941,9 @@ static int s_show(struct seq_file *m, void *v) seq_puts(m, "#\n"); test_ftrace_alive(m); } - if (iter->trace && iter->trace->print_header) + if (iter->snapshot && trace_empty(iter)) + print_snapshot_help(m, iter); + else if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); else trace_default_header(m); @@ -2263,6 +2974,17 @@ static int s_show(struct seq_file *m, void *v) return 0; } +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. + */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} + static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, @@ -2271,21 +2993,24 @@ static const struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file) +__tracing_open(struct inode *inode, struct file *file, bool snapshot) { - long cpu_file = (long) inode->i_private; - void *fail_ret = ERR_PTR(-ENOMEM); + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - struct seq_file *m; - int cpu, ret; + int cpu; if (tracing_disabled) return ERR_PTR(-ENODEV); - iter = kzalloc(sizeof(*iter), GFP_KERNEL); + iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); if (!iter) return ERR_PTR(-ENOMEM); + iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + GFP_KERNEL); + if (!iter->buffer_iter) + goto release; + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. @@ -2295,35 +3020,45 @@ __tracing_open(struct inode *inode, struct file *file) if (!iter->trace) goto fail; - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if (current_trace && current_trace->print_max) - iter->tr = &max_tr; + iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE + /* Currently only the top directory has a snapshot */ + if (tr->current_trace->print_max || snapshot) + iter->trace_buffer = &tr->max_buffer; else - iter->tr = &global_trace; +#endif + iter->trace_buffer = &tr->trace_buffer; + iter->snapshot = snapshot; iter->pos = -1; + iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); - iter->cpu_file = cpu_file; /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->tr->buffer)) + if (ring_buffer_overruns(iter->trace_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; - /* stop the trace while dumping */ - tracing_stop(); + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + /* stop the trace while dumping if we are not opening "snapshot" */ + if (!iter->snapshot) + tracing_stop_tr(tr); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -2333,38 +3068,23 @@ __tracing_open(struct inode *inode, struct file *file) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } - ret = seq_open(file, &tracer_seq_ops); - if (ret < 0) { - fail_ret = ERR_PTR(ret); - goto fail_buffer; - } - - m = file->private_data; - m->private = iter; - mutex_unlock(&trace_types_lock); return iter; - fail_buffer: - for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) - ring_buffer_read_finish(iter->buffer_iter[cpu]); - } - free_cpumask_var(iter->started); - tracing_start(); fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); - kfree(iter); - - return fail_ret; + kfree(iter->buffer_iter); +release: + seq_release_private(inode, file); + return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) @@ -2376,18 +3096,46 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } +bool tracing_is_disabled(void) +{ + return (tracing_disabled) ? true: false; +} + +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. + */ +static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + filp->private_data = inode->i_private; + + return 0; +} + static int tracing_release(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; - if (!(file->f_mode & FMODE_READ)) + if (!(file->f_mode & FMODE_READ)) { + trace_array_put(tr); return 0; + } + /* Writes do not use seq_file */ iter = m->private; - mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2396,65 +3144,119 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - /* reenable tracing if it was previously enabled */ - tracing_start(); + if (!iter->snapshot) + /* reenable tracing if it was previously enabled */ + tracing_start_tr(tr); + + __trace_array_put(tr); + mutex_unlock(&trace_types_lock); - seq_release(inode, file); mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); - kfree(iter); + kfree(iter->buffer_iter); + seq_release_private(inode, file); + return 0; } +static int tracing_release_generic_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return 0; +} + +static int tracing_single_release_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return single_release(inode, file); +} + static int tracing_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* If this file was open for write, then erase contents */ - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - long cpu = (long) inode->i_private; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + int cpu = tracing_get_cpu(inode); - if (cpu == TRACE_PIPE_ALL_CPU) - tracing_reset_online_cpus(&global_trace); + if (cpu == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->trace_buffer); else - tracing_reset(&global_trace, cpu); + tracing_reset(&tr->trace_buffer, cpu); } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file); + iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } + + if (ret < 0) + trace_array_put(tr); + return ret; } +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is. + */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ + return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ + while (t && !trace_ok_for_array(t, tr)) + t = t->next; + + return t; +} + static void * t_next(struct seq_file *m, void *v, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) - t = t->next; + t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) - ; + + t = get_tracer_for_array(tr, trace_types); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; return t; } @@ -2489,10 +3291,21 @@ static const struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + if (tracing_disabled) return -ENODEV; - return seq_open(file, &show_traces_seq_ops); + ret = seq_open(file, &show_traces_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = tr; + + return 0; } static ssize_t @@ -2502,19 +3315,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } -static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +loff_t tracing_lseek(struct file *file, loff_t offset, int whence) { + int ret; + if (file->f_mode & FMODE_READ) - return seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else - return 0; + file->f_pos = ret = 0; + + return ret; } static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .write = tracing_write_stub, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_release, }; @@ -2526,11 +3343,6 @@ static const struct file_operations show_traces_fops = { }; /* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_var_t tracing_cpumask; - -/* * The tracer itself will not take this lock, but still we want * to provide a consistent cpumask to user-space: */ @@ -2546,11 +3358,12 @@ static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { + struct trace_array *tr = file_inode(filp)->i_private; int len; mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -2568,8 +3381,9 @@ static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { - int err, cpu; + struct trace_array *tr = file_inode(filp)->i_private; cpumask_var_t tracing_cpumask_new; + int err, cpu; if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -2581,25 +3395,27 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ - if (cpumask_test_cpu(cpu, tracing_cpumask) && + if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&global_trace.data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); } - if (!cpumask_test_cpu(cpu, tracing_cpumask) && + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&global_trace.data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); local_irq_enable(); - cpumask_copy(tracing_cpumask, tracing_cpumask_new); + cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); @@ -2613,21 +3429,23 @@ err_unlock: } static const struct file_operations tracing_cpumask_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .release = tracing_release_generic_tr, .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; + struct trace_array *tr = m->private; u32 tracer_flags; int i; mutex_lock(&trace_types_lock); - tracer_flags = current_trace->flags->val; - trace_opts = current_trace->flags->opts; + tracer_flags = tr->current_trace->flags->val; + trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) @@ -2647,13 +3465,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) return 0; } -static int __set_tracer_option(struct tracer *trace, +static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { + struct tracer *trace = tr->current_trace; int ret; - ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -2665,8 +3484,9 @@ static int __set_tracer_option(struct tracer *trace, } /* Try to assign a tracer specific option */ -static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { + struct tracer *trace = tr->current_trace; struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int i; @@ -2675,18 +3495,31 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(trace, trace->flags, - opts, neg); + return __set_tracer_option(tr, trace->flags, opts, neg); } return -EINVAL; } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; +} + +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) - return; + return 0; + + /* Give the tracer a chance to approve the change */ + if (tr->current_trace->flag_changed) + if (tr->current_trace->flag_changed(tr, mask, !!enabled)) + return -EINVAL; if (enabled) trace_flags |= mask; @@ -2696,49 +3529,71 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); - if (mask == TRACE_ITER_OVERWRITE) - ring_buffer_change_overwrite(global_trace.buffer, enabled); + if (mask == TRACE_ITER_OVERWRITE) { + ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#endif + } + + if (mask == TRACE_ITER_PRINTK) + trace_printk_start_stop_comm(enabled); + + return 0; } -static ssize_t -tracing_trace_options_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int trace_set_options(struct trace_array *tr, char *option) { - char buf[64]; char *cmp; int neg = 0; - int ret; + int ret = -ENODEV; int i; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + cmp = strstrip(option); if (strncmp(cmp, "no", 2) == 0) { neg = 1; cmp += 2; } + mutex_lock(&trace_types_lock); + for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - set_tracer_flags(1 << i, !neg); + ret = set_tracer_flag(tr, 1 << i, !neg); break; } } /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) { - mutex_lock(&trace_types_lock); - ret = set_tracer_option(current_trace, cmp, neg); - mutex_unlock(&trace_types_lock); - if (ret) - return ret; - } + if (!trace_options[i]) + ret = set_tracer_option(tr, cmp, neg); + + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_trace_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = trace_set_options(tr, buf); + if (ret < 0) + return ret; *ppos += cnt; @@ -2747,35 +3602,156 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, static int tracing_trace_options_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + int ret; + if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_trace_options_show, NULL); + + if (trace_array_get(tr) < 0) + return -ENODEV; + + ret = single_open(file, tracing_trace_options_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; } static const struct file_operations tracing_iter_fops = { .open = tracing_trace_options_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_tr, .write = tracing_trace_options_write, }; static const char readme_msg[] = "tracing mini-HOWTO:\n\n" - "# mount -t debugfs nodev /sys/kernel/debug\n\n" - "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "nop\n" - "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "sched_switch\n" - "# cat /sys/kernel/debug/tracing/trace_options\n" - "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" - "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" - "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" + "# echo 0 > tracing_on : quick way to disable tracing\n" + "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" + " Important files:\n" + " trace\t\t\t- The static contents of the buffer\n" + "\t\t\t To clear the buffer write into this file: echo > trace\n" + " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" + " current_tracer\t- function and latency tracers\n" + " available_tracers\t- list of configured tracers for current_tracer\n" + " buffer_size_kb\t- view and modify size of per cpu buffer\n" + " buffer_total_size_kb - view total size of all cpu buffers\n\n" + " trace_clock\t\t-change the clock used to order events\n" + " local: Per cpu clock but may not be synced across CPUs\n" + " global: Synced across CPUs but slows tracing down.\n" + " counter: Not a clock, but just an increment\n" + " uptime: Jiffy counter from time of boot\n" + " perf: Same clock that perf events use\n" +#ifdef CONFIG_X86_64 + " x86-tsc: TSC cycle counter\n" +#endif + "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + " tracing_cpumask\t- Limit which CPUs to trace\n" + " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" + "\t\t\t Remove sub-buffer with rmdir\n" + " trace_options\t\t- Set format or modify how tracing happens\n" + "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t option name\n" + " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" +#ifdef CONFIG_DYNAMIC_FTRACE + "\n available_filter_functions - list of functions that can be filtered on\n" + " set_ftrace_filter\t- echo function name in here to only trace these\n" + "\t\t\t functions\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module\n" + "\t Format: :mod:<module-name>\n" + "\t example: echo :mod:ext3 > set_ftrace_filter\n" + "\t triggers: a command to perform when function is hit\n" + "\t Format: <function>:<trigger>[:count]\n" + "\t trigger: traceon, traceoff\n" + "\t\t enable_event:<system>:<event>\n" + "\t\t disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif + "\t\t dump\n" + "\t\t cpudump\n" + "\t example: echo do_fault:traceoff > set_ftrace_filter\n" + "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" + "\t The first one will disable tracing every time do_fault is hit\n" + "\t The second will disable tracing at most 3 times when do_trap is hit\n" + "\t The first time do trap is hit and it disables tracing, the\n" + "\t counter will decrement to 2. If tracing is already disabled,\n" + "\t the counter will not decrement. It only decrements when the\n" + "\t trigger did work\n" + "\t To remove trigger without count:\n" + "\t echo '!<function>:<trigger> > set_ftrace_filter\n" + "\t To remove trigger with a count:\n" + "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + " set_ftrace_notrace\t- echo function name in here to never trace.\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module command :mod:\n" + "\t Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER + " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" + "\t\t (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" + "\t\t\t snapshot buffer. Read the contents for more\n" + "\t\t\t information\n" +#endif +#ifdef CONFIG_STACK_TRACER + " stack_trace\t\t- Shows the max stack trace when active\n" + " stack_max_size\t- Shows current max stack size that was traced\n" + "\t\t\t Write into this file to reset the max size (trigger a\n" + "\t\t\t new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" + "\t\t\t traces\n" +#endif +#endif /* CONFIG_STACK_TRACER */ + " events/\t\t- Directory containing all trace event subsystems:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" + " events/<system>/\t- Directory containing all trace events for <system>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" + "\t\t\t events\n" + " filter\t\t- If set, only events passing filter are traced\n" + " events/<system>/<event>/\t- Directory containing control files for\n" + "\t\t\t <event>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" + " filter\t\t- If set, only events passing filter are traced\n" + " trigger\t\t- If set, a command to perform when event is hit\n" + "\t Format: <trigger>[:count][if <filter>]\n" + "\t trigger: traceon, traceoff\n" + "\t enable_event:<system>:<event>\n" + "\t disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif + "\t example: echo traceoff > events/block/block_unplug/trigger\n" + "\t echo traceoff:3 > events/block/block_unplug/trigger\n" + "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" + "\t events/block/block_unplug/trigger\n" + "\t The first disables tracing every time block_unplug is hit.\n" + "\t The second disables tracing the first 3 times block_unplug is hit.\n" + "\t The third enables the kmalloc event the first 3 times block_unplug\n" + "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" + "\t Like function triggers, the counter is only decremented if it\n" + "\t enabled or disabled tracing.\n" + "\t To remove a trigger without a count:\n" + "\t echo '!<trigger> > <system>/<event>/trigger\n" + "\t To remove a trigger with a count:\n" + "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" + "\t Filters can be ignored when removing a trigger.\n" ; static ssize_t @@ -2792,73 +3768,129 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; -static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { - char *buf_comm; - char *file_buf; - char *buf; - int len = 0; - int pid; - int i; + unsigned int *ptr = v; - file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); - if (!file_buf) - return -ENOMEM; + if (*pos || m->count) + ptr++; - buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!buf_comm) { - kfree(file_buf); - return -ENOMEM; + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; } - buf = file_buf; + return NULL; +} - for (i = 0; i < SAVED_CMDLINES; i++) { - int r; +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; - pid = map_cmdline_to_pid[i]; - if (pid == -1 || pid == NO_CMDLINE_MAP) - continue; + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); - trace_find_cmdline(pid, buf_comm); - r = sprintf(buf, "%d %s\n", pid, buf_comm); - buf += r; - len += r; + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; } - len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + return v; +} - kfree(file_buf); - kfree(buf_comm); +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} - return len; +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_read, - .llseek = generic_file_llseek, + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static ssize_t -tracing_ctrl_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) { char buf[64]; int r; - r = sprintf(buf, "%u\n", tracer_enabled); + arch_spin_lock(&trace_cmdline_lock); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + kfree(s->saved_cmdlines); + kfree(s->map_cmdline_to_pid); + kfree(s); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + if (allocate_cmdlines_buffer(val, s) < 0) { + kfree(s); + return -ENOMEM; + } + + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + static ssize_t -tracing_ctrl_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - struct trace_array *tr = filp->private_data; unsigned long val; int ret; @@ -2866,45 +3898,35 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, if (ret) return ret; - val = !!val; - - mutex_lock(&trace_types_lock); - if (tracer_enabled ^ val) { - - /* Only need to warn if this is used to change the state */ - WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; - if (val) { - tracer_enabled = 1; - if (current_trace->start) - current_trace->start(tr); - tracing_start(); - } else { - tracer_enabled = 0; - tracing_stop(); - if (current_trace->stop) - current_trace->stop(tr); - } - } - mutex_unlock(&trace_types_lock); + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; *ppos += cnt; return cnt; } +static const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; + static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); - if (current_trace) - r = sprintf(buf, "%s\n", current_trace->name); - else - r = sprintf(buf, "\n"); + r = sprintf(buf, "%s\n", tr->current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -2912,11 +3934,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int tracer_init(struct tracer *t, struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); return t->init(tr); } -static int __tracing_resize_ring_buffer(unsigned long size) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) +{ + int cpu; + + for_each_tracing_cpu(cpu) + per_cpu_ptr(buf->data, cpu)->entries = val; +} + +#ifdef CONFIG_TRACER_MAX_TRACE +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); + if (ret < 0) + break; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; + } + } else { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); + if (ret == 0) + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; + } + + return ret; +} +#endif /* CONFIG_TRACER_MAX_TRACE */ + +static int __tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu) { int ret; @@ -2925,21 +3984,25 @@ static int __tracing_resize_ring_buffer(unsigned long size) * we use the size that was given, and we can forget about * expanding it later. */ - ring_buffer_expanded = 1; + ring_buffer_expanded = true; - ret = ring_buffer_resize(global_trace.buffer, size); + /* May be called before buffers are initialized */ + if (!tr->trace_buffer.buffer) + return 0; + + ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); if (ret < 0) return ret; - if (!current_trace->use_max_tr) +#ifdef CONFIG_TRACER_MAX_TRACE + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || + !tr->current_trace->use_max_tr) goto out; - ret = ring_buffer_resize(max_tr.buffer, size); + ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { - int r; - - r = ring_buffer_resize(global_trace.buffer, - global_trace.entries); + int r = resize_buffer_duplicate_size(&tr->trace_buffer, + &tr->trace_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -2961,43 +4024,42 @@ static int __tracing_resize_ring_buffer(unsigned long size) return ret; } - max_tr.entries = size; + if (cpu == RING_BUFFER_ALL_CPUS) + set_buffer_entries(&tr->max_buffer, size); + else + per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; + out: - global_trace.entries = size; +#endif /* CONFIG_TRACER_MAX_TRACE */ + + if (cpu == RING_BUFFER_ALL_CPUS) + set_buffer_entries(&tr->trace_buffer, size); + else + per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; return ret; } -static ssize_t tracing_resize_ring_buffer(unsigned long size) +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id) { - int cpu, ret = size; + int ret = size; mutex_lock(&trace_types_lock); - tracing_stop(); - - /* disable all cpu buffers */ - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_inc(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_inc(&max_tr.data[cpu]->disabled); + if (cpu_id != RING_BUFFER_ALL_CPUS) { + /* make sure, this cpu is enabled in the mask */ + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { + ret = -EINVAL; + goto out; + } } - if (size != global_trace.entries) - ret = __tracing_resize_ring_buffer(size); - + ret = __tracing_resize_ring_buffer(tr, size, cpu_id); if (ret < 0) ret = -ENOMEM; - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_dec(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_dec(&max_tr.data[cpu]->disabled); - } - - tracing_start(); +out: mutex_unlock(&trace_types_lock); return ret; @@ -3020,7 +4082,8 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, + RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); return ret; @@ -3029,22 +4092,42 @@ int tracing_update_buffers(void) struct trace_option_dentry; static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); -static int tracing_set_tracer(const char *buf) +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. + */ +static void tracing_set_nop(struct trace_array *tr) +{ + if (tr->current_trace == &nop_trace) + return; + + tr->current_trace->enabled--; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + tr->current_trace = &nop_trace; +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf) { static struct trace_option_dentry *topts; - struct trace_array *tr = &global_trace; struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE + bool had_max_tr; +#endif int ret = 0; mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = __tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, + RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; ret = 0; @@ -3058,32 +4141,53 @@ static int tracing_set_tracer(const char *buf) ret = -EINVAL; goto out; } - if (t == current_trace) + if (t == tr->current_trace) goto out; + /* Some tracers are only allowed for the top level buffer */ + if (!trace_ok_for_array(t, tr)) { + ret = -EINVAL; + goto out; + } + trace_branch_disable(); - if (current_trace && current_trace->reset) - current_trace->reset(tr); - if (current_trace && current_trace->use_max_tr) { + + tr->current_trace->enabled--; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + /* Current trace needs to be nop_trace before synchronize_sched */ + tr->current_trace = &nop_trace; + +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->allocated_snapshot; + + if (had_max_tr && !t->use_max_tr) { /* - * We don't free the ring buffer. instead, resize it because - * The max_tr ring buffer has some state (e.g. ring->clock) and - * we want preserve it. + * We need to make sure that the update_max_tr sees that + * current_trace changed to nop_trace to keep it from + * swapping the buffers after we resize it. + * The update_max_tr is called from interrupts disabled + * so a synchronized_sched() is sufficient. */ - ring_buffer_resize(max_tr.buffer, 1); - max_tr.entries = 1; + synchronize_sched(); + free_snapshot(tr); + } +#endif + /* Currently, only the top instance has options */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); } - destroy_trace_option_files(topts); - - current_trace = t; - topts = create_trace_option_files(current_trace); - if (current_trace->use_max_tr) { - ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); +#ifdef CONFIG_TRACER_MAX_TRACE + if (t->use_max_tr && !had_max_tr) { + ret = alloc_snapshot(tr); if (ret < 0) goto out; - max_tr.entries = global_trace.entries; } +#endif if (t->init) { ret = tracer_init(t, tr); @@ -3091,6 +4195,8 @@ static int tracing_set_tracer(const char *buf) goto out; } + tr->current_trace = t; + tr->current_trace->enabled++; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -3102,6 +4208,7 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; @@ -3121,7 +4228,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) buf[i] = 0; - err = tracing_set_tracer(buf); + err = tracing_set_tracer(tr, buf); if (err) return err; @@ -3164,19 +4271,23 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, static int tracing_open_pipe(struct inode *inode, struct file *filp) { - long cpu_file = (long) inode->i_private; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret = 0; if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + mutex_lock(&trace_types_lock); /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; + __trace_array_put(tr); goto out; } @@ -3189,8 +4300,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) ret = -ENOMEM; goto fail; } - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3203,8 +4313,13 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; - iter->cpu_file = cpu_file; - iter->tr = &global_trace; + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + iter->tr = tr; + iter->trace_buffer = &tr->trace_buffer; + iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); filp->private_data = iter; @@ -3219,6 +4334,7 @@ out: fail: kfree(iter->trace); kfree(iter); + __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; } @@ -3226,6 +4342,7 @@ fail: static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + struct trace_array *tr = inode->i_private; mutex_lock(&trace_types_lock); @@ -3239,66 +4356,41 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) kfree(iter->trace); kfree(iter); + trace_array_put(tr); + return 0; } static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { - struct trace_iterator *iter = filp->private_data; + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return POLLIN | POLLRDNORM; - if (trace_flags & TRACE_ITER_BLOCK) { + if (trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ return POLLIN | POLLRDNORM; - } else { - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - poll_wait(filp, &trace_wait, poll_table); - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - - return 0; - } + else + return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, + filp, poll_table); } - -void default_wait_pipe(struct trace_iterator *iter) +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) { - DEFINE_WAIT(wait); - - prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); - - if (trace_empty(iter)) - schedule(); - - finish_wait(&trace_wait, &wait); -} + struct trace_iterator *iter = filp->private_data; -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - * 1) the current tracer might hold the runqueue lock when it wakes up - * a reader, hence a deadlock (sched, function, and function graph tracers) - * 2) the function tracers, trace all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * - * Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ - set_current_state(TASK_INTERRUPTIBLE); - /* sleep for 100 msecs, and try again. */ - schedule_timeout(HZ / 10); + return trace_poll(iter, filp, poll_table); } /* Must be called with trace_types_lock mutex held. */ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; + int ret; while (trace_empty(iter)) { @@ -3306,15 +4398,6 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - mutex_unlock(&iter->mutex); - - iter->trace->wait_pipe(iter); - - mutex_lock(&iter->mutex); - - if (signal_pending(current)) - return -EINTR; - /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never @@ -3324,8 +4407,20 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracer_enabled && iter->pos) + if (!tracing_is_on() && iter->pos) break; + + mutex_unlock(&iter->mutex); + + ret = wait_on_pipe(iter); + + mutex_lock(&iter->mutex); + + if (ret) + return ret; + + if (signal_pending(current)) + return -EINTR; } return 1; @@ -3339,7 +4434,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - static struct tracer *old_tracer; + struct trace_array *tr = iter->tr; ssize_t sret; /* return any leftover data */ @@ -3351,10 +4446,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; - *iter->trace = *current_trace; - } + if (unlikely(iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); /* @@ -3387,6 +4480,7 @@ waitagain: memset(&iter->seq, 0, sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); + cpumask_clear(iter->started); iter->pos = -1; trace_event_read_lock(); @@ -3436,12 +4530,6 @@ out: return sret; } -static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - __free_page(buf->page); -} - static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, unsigned int idx) { @@ -3450,10 +4538,8 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, - .release = tracing_pipe_buf_release, + .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -3505,11 +4591,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - static struct tracer *old_tracer; + struct trace_array *tr = iter->tr; ssize_t ret; size_t rem; unsigned int i; @@ -3519,10 +4606,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; - *iter->trace = *current_trace; - } + if (unlikely(iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); mutex_lock(&iter->mutex); @@ -3547,7 +4632,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; @@ -3576,7 +4661,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = splice_to_pipe(pipe, &spd); out: - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return ret; out_err: @@ -3588,26 +4673,56 @@ static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = filp->private_data; - char buf[96]; - int r; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); + char buf[64]; + int r = 0; + ssize_t ret; mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - r = sprintf(buf, "%lu (expanded: %lu)\n", - tr->entries >> 10, - trace_buf_size >> 10); - else - r = sprintf(buf, "%lu\n", tr->entries >> 10); + + if (cpu == RING_BUFFER_ALL_CPUS) { + int cpu, buf_size_same; + unsigned long size; + + size = 0; + buf_size_same = 1; + /* check if all cpu sizes are same */ + for_each_tracing_cpu(cpu) { + /* fill in the size from first enabled cpu */ + if (size == 0) + size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { + buf_size_same = 0; + break; + } + } + + if (buf_size_same) { + if (!ring_buffer_expanded) + r = sprintf(buf, "%lu (expanded: %lu)\n", + size >> 10, + trace_buf_size >> 10); + else + r = sprintf(buf, "%lu\n", size >> 10); + } else + r = sprintf(buf, "X\n"); + } else + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10); + mutex_unlock(&trace_types_lock); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + return ret; } static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; unsigned long val; int ret; @@ -3621,8 +4736,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - - ret = tracing_resize_ring_buffer(val); + ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); if (ret < 0) return ret; @@ -3642,7 +4756,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += tr->entries >> 10; + size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -3672,11 +4786,15 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { + struct trace_array *tr = inode->i_private; + /* disable tracing ? */ if (trace_flags & TRACE_ITER_STOP_ON_FREE) - tracing_off(); + tracer_tracing_off(tr); /* resize the ring buffer to 0 */ - tracing_resize_ring_buffer(0); + tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); + + trace_array_put(tr); return 0; } @@ -3686,23 +4804,27 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { unsigned long addr = (unsigned long)ubuf; + struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; struct page *pages[2]; + void *map_page[2]; int nr_pages = 1; ssize_t written; - void *page1; - void *page2; int offset; int size; int len; int ret; + int i; if (tracing_disabled) return -EINVAL; + if (!(trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; @@ -3737,13 +4859,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, goto out; } - page1 = kmap_atomic(pages[0]); - if (nr_pages == 2) - page2 = kmap_atomic(pages[1]); + for (i = 0; i < nr_pages; i++) + map_page[i] = kmap_atomic(pages[i]); local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (!event) { @@ -3757,10 +4878,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (nr_pages == 2) { len = PAGE_SIZE - offset; - memcpy(&entry->buf, page1 + offset, len); - memcpy(&entry->buf[len], page2, cnt - len); + memcpy(&entry->buf, map_page[0] + offset, len); + memcpy(&entry->buf[len], map_page[1], cnt - len); } else - memcpy(&entry->buf, page1 + offset, cnt); + memcpy(&entry->buf, map_page[0] + offset, cnt); if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; @@ -3768,42 +4889,78 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else entry->buf[cnt] = '\0'; - ring_buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); written = cnt; *fpos += written; out_unlock: - if (nr_pages == 2) - kunmap_atomic(page2); - kunmap_atomic(page1); - while (nr_pages > 0) - put_page(pages[--nr_pages]); + for (i = 0; i < nr_pages; i++){ + kunmap_atomic(map_page[i]); + put_page(pages[i]); + } out: return written; } static int tracing_clock_show(struct seq_file *m, void *v) { + struct trace_array *tr = m->private; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) seq_printf(m, "%s%s%s%s", i ? " " : "", - i == trace_clock_id ? "[" : "", trace_clocks[i].name, - i == trace_clock_id ? "]" : ""); + i == tr->clock_id ? "[" : "", trace_clocks[i].name, + i == tr->clock_id ? "]" : ""); seq_putc(m, '\n'); return 0; } +static int tracing_set_clock(struct trace_array *tr, const char *clockstr) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { + if (strcmp(trace_clocks[i].name, clockstr) == 0) + break; + } + if (i == ARRAY_SIZE(trace_clocks)) + return -EINVAL; + + mutex_lock(&trace_types_lock); + + tr->clock_id = i; + + ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); + + /* + * New clock may not be consistent with the previous clock. + * Reset the buffer so that it doesn't have incomparable timestamps. + */ + tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&tr->max_buffer); +#endif + + mutex_unlock(&trace_types_lock); + + return 0; +} + static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; char buf[64]; const char *clockstr; - int i; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -3815,35 +4972,204 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, clockstr = strstrip(buf); - for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { - if (strcmp(trace_clocks[i].name, clockstr) == 0) - break; + ret = tracing_set_clock(tr, clockstr); + if (ret) + return ret; + + *fpos += cnt; + + return cnt; +} + +static int tracing_clock_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr)) + return -ENODEV; + + ret = single_open(file, tracing_clock_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int read; +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + struct seq_file *m; + int ret = 0; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + goto out; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + kfree(m); + goto out; + } + ret = 0; + + iter->tr = tr; + iter->trace_buffer = &tr->max_buffer; + iter->cpu_file = tracing_get_cpu(inode); + m->private = iter; + file->private_data = m; } - if (i == ARRAY_SIZE(trace_clocks)) - return -EINVAL; +out: + if (ret < 0) + trace_array_put(tr); - trace_clock_id = i; + return ret; +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long val; + int ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; mutex_lock(&trace_types_lock); - ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); - if (max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + if (tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } + + switch (val) { + case 0: + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } + if (tr->allocated_snapshot) + free_snapshot(tr); + break; + case 1: +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } +#endif + if (!tr->allocated_snapshot) { + ret = alloc_snapshot(tr); + if (ret < 0) + break; + } + local_irq_disable(); + /* Now, we're going to swap */ + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + update_max_tr(tr, current, smp_processor_id()); + else + update_max_tr_single(tr, current, iter->cpu_file); + local_irq_enable(); + break; + default: + if (tr->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->max_buffer); + else + tracing_reset(&tr->max_buffer, iter->cpu_file); + } + break; + } + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } +out: mutex_unlock(&trace_types_lock); + return ret; +} - *fpos += cnt; +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + int ret; - return cnt; + ret = tracing_release(inode, file); + + if (file->f_mode & FMODE_READ) + return ret; + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; } -static int tracing_clock_open(struct inode *inode, struct file *file) +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; - return single_open(file, tracing_clock_show, NULL); + struct ftrace_buffer_info *info; + int ret; + + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = filp->private_data; + + if (info->iter.trace->use_max_tr) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.trace_buffer = &info->iter.tr->max_buffer; + + return ret; } +#endif /* CONFIG_TRACER_SNAPSHOT */ + + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -3851,13 +5177,6 @@ static const struct file_operations tracing_max_lat_fops = { .llseek = generic_file_llseek, }; -static const struct file_operations tracing_ctrl_fops = { - .open = tracing_open_generic, - .read = tracing_ctrl_read, - .write = tracing_ctrl_write, - .llseek = generic_file_llseek, -}; - static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, @@ -3875,65 +5194,106 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_entries_read, .write = tracing_entries_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_total_entries_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_total_entries_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_free_buffer_fops = { + .open = tracing_open_generic_tr, .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .write = tracing_mark_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_tr, .write = tracing_clock_write, }; -struct ftrace_buffer_info { - struct trace_array *tr; - void *spare; - int cpu; - unsigned int read; +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_lseek, + .release = tracing_snapshot_release, }; +static const struct file_operations snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, +}; + +#endif /* CONFIG_TRACER_SNAPSHOT */ + static int tracing_buffers_open(struct inode *inode, struct file *filp) { - int cpu = (int)(long)inode->i_private; + struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; + int ret; if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) + if (!info) { + trace_array_put(tr); return -ENOMEM; + } + + mutex_lock(&trace_types_lock); - info->tr = &global_trace; - info->cpu = cpu; - info->spare = NULL; + info->iter.tr = tr; + info->iter.cpu_file = tracing_get_cpu(inode); + info->iter.trace = tr->current_trace; + info->iter.trace_buffer = &tr->trace_buffer; + info->spare = NULL; /* Force reading ring buffer for first read */ - info->read = (unsigned int)-1; + info->read = (unsigned int)-1; filp->private_data = info; - return nonseekable_open(inode, filp); + mutex_unlock(&trace_types_lock); + + ret = nonseekable_open(inode, filp); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + + return trace_poll(iter, filp, poll_table); } static ssize_t @@ -3941,56 +5301,101 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; ssize_t ret; - size_t size; + ssize_t size; if (!count) return 0; + mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) { + size = -EBUSY; + goto out_unlock; + } +#endif + if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); + info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, + iter->cpu_file); + size = -ENOMEM; if (!info->spare) - return -ENOMEM; + goto out_unlock; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) goto read; - trace_access_lock(info->cpu); - ret = ring_buffer_read_page(info->tr->buffer, + again: + trace_access_lock(iter->cpu_file); + ret = ring_buffer_read_page(iter->trace_buffer->buffer, &info->spare, count, - info->cpu, 0); - trace_access_unlock(info->cpu); - if (ret < 0) - return 0; + iter->cpu_file, 0); + trace_access_unlock(iter->cpu_file); - info->read = 0; + if (ret < 0) { + if (trace_empty(iter)) { + if ((filp->f_flags & O_NONBLOCK)) { + size = -EAGAIN; + goto out_unlock; + } + mutex_unlock(&trace_types_lock); + ret = wait_on_pipe(iter); + mutex_lock(&trace_types_lock); + if (ret) { + size = ret; + goto out_unlock; + } + if (signal_pending(current)) { + size = -EINTR; + goto out_unlock; + } + goto again; + } + size = 0; + goto out_unlock; + } -read: + info->read = 0; + read: size = PAGE_SIZE - info->read; if (size > count) size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) - return -EFAULT; + if (ret == size) { + size = -EFAULT; + goto out_unlock; + } size -= ret; *ppos += size; info->read += size; + out_unlock: + mutex_unlock(&trace_types_lock); + return size; } static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + mutex_lock(&trace_types_lock); + + __trace_array_put(iter->tr); if (info->spare) - ring_buffer_free_read_page(info->tr->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); kfree(info); + mutex_unlock(&trace_types_lock); + return 0; } @@ -4013,12 +5418,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -4030,11 +5429,9 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = buffer_pipe_buf_steal, + .steal = generic_pipe_buf_steal, .get = buffer_pipe_buf_get, }; @@ -4061,30 +5458,41 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; int entries, size, i; - size_t ret; + ssize_t ret; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; + mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } +#endif + + if (splice_grow_spd(pipe, &spd)) { + ret = -ENOMEM; + goto out; + } if (*ppos & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: previous read must page-align\n"); ret = -EINVAL; goto out; } if (len & (PAGE_SIZE - 1)) { - WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); if (len < PAGE_SIZE) { ret = -EINVAL; goto out; @@ -4092,10 +5500,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - trace_access_lock(info->cpu); - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + again: + trace_access_lock(iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); - for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -4104,15 +5513,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; ref->ref = 1; - ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); + ref->buffer = iter->trace_buffer->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { kfree(ref); break; } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 1); + len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); kfree(ref); @@ -4136,31 +5545,42 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.nr_pages++; *ppos += PAGE_SIZE; - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); } - trace_access_unlock(info->cpu); + trace_access_unlock(iter->cpu_file); spd.nr_pages = i; /* did we read anything? */ if (!spd.nr_pages) { - if (flags & SPLICE_F_NONBLOCK) + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { ret = -EAGAIN; - else - ret = 0; - /* TODO: block */ - goto out; + goto out; + } + mutex_unlock(&trace_types_lock); + ret = wait_on_pipe(iter); + mutex_lock(&trace_types_lock); + if (ret) + goto out; + if (signal_pending(current)) { + ret = -EINTR; + goto out; + } + goto again; } ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); out: + mutex_unlock(&trace_types_lock); + return ret; } static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, + .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, .llseek = no_llseek, @@ -4170,8 +5590,10 @@ static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - unsigned long cpu = (unsigned long)filp->private_data; - struct trace_array *tr = &global_trace; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + struct trace_buffer *trace_buf = &tr->trace_buffer; + int cpu = tracing_get_cpu(inode); struct trace_seq *s; unsigned long cnt; unsigned long long t; @@ -4183,25 +5605,42 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_init(s); - cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "entries: %ld\n", cnt); - cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "overrun: %ld\n", cnt); - cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); - t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); - usec_rem = do_div(t, USEC_PER_SEC); - trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); + if (trace_clocks[tr->clock_id].in_ns) { + /* local or global for trace_clock */ + t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", + t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + } else { + /* counter or tsc mode for trace_clock */ + trace_seq_printf(s, "oldest event ts: %llu\n", + ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); + + trace_seq_printf(s, "now ts: %llu\n", + ring_buffer_time_stamp(trace_buf->buffer, cpu)); + } + + cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "dropped events: %ld\n", cnt); - t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); - usec_rem = do_div(t, USEC_PER_SEC); - trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "read events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4211,9 +5650,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf, } static const struct file_operations tracing_stats_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_stats_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -4252,63 +5692,177 @@ static const struct file_operations tracing_dyn_info_fops = { .read = tracing_read_dyn_info, .llseek = generic_file_llseek, }; -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ -static struct dentry *d_tracer; +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + tracing_snapshot(); +} -struct dentry *tracing_init_dentry(void) +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) { - static int once; + unsigned long *count = (long *)data; + + if (!*count) + return; + + if (*count != -1) + (*count)--; - if (d_tracer) - return d_tracer; + tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + long count = (long)data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "snapshot"); + + if (count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", count); + + return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_probe(glob, ops, count); + + if (ret >= 0) + alloc_snapshot(&global_trace); + + return ret < 0 ? ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +static __init int register_snapshot_cmd(void) +{ + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline __init int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ + +struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +{ + if (tr->dir) + return tr->dir; if (!debugfs_initialized()) return NULL; - d_tracer = debugfs_create_dir("tracing", NULL); + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + tr->dir = debugfs_create_dir("tracing", NULL); - if (!d_tracer && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'tracing'\n"); - return NULL; - } + if (!tr->dir) + pr_warn_once("Could not create debugfs directory 'tracing'\n"); - return d_tracer; + return tr->dir; } -static struct dentry *d_percpu; +struct dentry *tracing_init_dentry(void) +{ + return tracing_init_dentry_tr(&global_trace); +} -struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { - static int once; struct dentry *d_tracer; - if (d_percpu) - return d_percpu; - - d_tracer = tracing_init_dentry(); + if (tr->percpu_dir) + return tr->percpu_dir; + d_tracer = tracing_init_dentry_tr(tr); if (!d_tracer) return NULL; - d_percpu = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); - if (!d_percpu && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'per_cpu'\n"); - return NULL; - } + WARN_ONCE(!tr->percpu_dir, + "Could not create debugfs directory 'per_cpu/%d'\n", cpu); - return d_percpu; + return tr->percpu_dir; } -static void tracing_init_debugfs_percpu(long cpu) +static struct dentry * +trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, + void *data, long cpu, const struct file_operations *fops) { - struct dentry *d_percpu = tracing_dentry_percpu(); + struct dentry *ret = trace_create_file(name, mode, parent, data, fops); + + if (ret) /* See tracing_get_cpu() */ + ret->d_inode->i_cdev = (void *)(cpu + 1); + return ret; +} + +static void +tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) +{ + struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ + if (!d_percpu) + return; + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { @@ -4317,18 +5871,29 @@ static void tracing_init_debugfs_percpu(long cpu) } /* per cpu trace_pipe */ - trace_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); + trace_create_cpu_file("trace_pipe", 0444, d_cpu, + tr, cpu, &tracing_pipe_fops); /* per cpu trace */ - trace_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); + trace_create_cpu_file("trace", 0644, d_cpu, + tr, cpu, &tracing_fops); + + trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, + tr, cpu, &tracing_buffers_fops); + + trace_create_cpu_file("stats", 0444, d_cpu, + tr, cpu, &tracing_stats_fops); - trace_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); + trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, + tr, cpu, &tracing_entries_fops); - trace_create_file("stats", 0444, d_cpu, - (void *) cpu, &tracing_stats_fops); +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_cpu_file("snapshot", 0644, d_cpu, + tr, cpu, &snapshot_fops); + + trace_create_cpu_file("snapshot_raw", 0444, d_cpu, + tr, cpu, &snapshot_raw_fops); +#endif } #ifdef CONFIG_FTRACE_SELFTEST @@ -4339,6 +5904,7 @@ static void tracing_init_debugfs_percpu(long cpu) struct trace_option_dentry { struct tracer_opt *opt; struct tracer_flags *flags; + struct trace_array *tr; struct dentry *entry; }; @@ -4374,7 +5940,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - ret = __set_tracer_option(current_trace, topt->flags, + ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) @@ -4413,6 +5979,7 @@ static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = &global_trace; long index = (long)filp->private_data; unsigned long val; int ret; @@ -4423,7 +5990,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; - set_tracer_flags(1 << index, val); + + mutex_lock(&trace_types_lock); + ret = set_tracer_flag(tr, 1 << index, val); + mutex_unlock(&trace_types_lock); + + if (ret < 0) + return ret; *ppos += cnt; @@ -4453,40 +6026,41 @@ struct dentry *trace_create_file(const char *name, } -static struct dentry *trace_options_init_dentry(void) +static struct dentry *trace_options_init_dentry(struct trace_array *tr) { struct dentry *d_tracer; - static struct dentry *t_options; - if (t_options) - return t_options; + if (tr->options) + return tr->options; - d_tracer = tracing_init_dentry(); + d_tracer = tracing_init_dentry_tr(tr); if (!d_tracer) return NULL; - t_options = debugfs_create_dir("options", d_tracer); - if (!t_options) { + tr->options = debugfs_create_dir("options", d_tracer); + if (!tr->options) { pr_warning("Could not create debugfs directory 'options'\n"); return NULL; } - return t_options; + return tr->options; } static void -create_trace_option_file(struct trace_option_dentry *topt, +create_trace_option_file(struct trace_array *tr, + struct trace_option_dentry *topt, struct tracer_flags *flags, struct tracer_opt *opt) { struct dentry *t_options; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return; topt->flags = flags; topt->opt = opt; + topt->tr = tr; topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); @@ -4494,7 +6068,7 @@ create_trace_option_file(struct trace_option_dentry *topt, } static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; struct tracer_flags *flags; @@ -4519,7 +6093,7 @@ create_trace_option_files(struct tracer *tracer) return NULL; for (cnt = 0; opts[cnt].name; cnt++) - create_trace_option_file(&topts[cnt], flags, + create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); return topts; @@ -4542,11 +6116,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts) } static struct dentry * -create_trace_option_core_file(const char *option, long index) +create_trace_option_core_file(struct trace_array *tr, + const char *option, long index) { struct dentry *t_options; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return NULL; @@ -4554,87 +6129,429 @@ create_trace_option_core_file(const char *option, long index) &trace_options_core_fops); } -static __init void create_trace_options_dir(void) +static __init void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; int i; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return; for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(trace_options[i], i); + create_trace_option_core_file(tr, trace_options[i], i); } -static __init int tracer_init_debugfs(void) +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) { - struct dentry *d_tracer; - int cpu; + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; - trace_access_lock_init(); + r = tracer_tracing_is_on(tr); + r = sprintf(buf, "%d\n", r); - d_tracer = tracing_init_dentry(); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} - trace_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + unsigned long val; + int ret; - trace_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; - trace_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); + if (buffer) { + mutex_lock(&trace_types_lock); + if (val) { + tracer_tracing_on(tr); + if (tr->current_trace->start) + tr->current_trace->start(tr); + } else { + tracer_tracing_off(tr); + if (tr->current_trace->stop) + tr->current_trace->stop(tr); + } + mutex_unlock(&trace_types_lock); + } - trace_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); + (*ppos)++; - trace_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); + return cnt; +} - trace_create_file("current_tracer", 0644, d_tracer, - &global_trace, &set_tracer_fops); +static const struct file_operations rb_simple_fops = { + .open = tracing_open_generic_tr, + .read = rb_simple_read, + .write = rb_simple_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + +struct dentry *trace_instance_dir; + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); + +static int +allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +{ + enum ring_buffer_flags rb_flags; + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + + buf->tr = tr; + + buf->buffer = ring_buffer_alloc(size, rb_flags); + if (!buf->buffer) + return -ENOMEM; + + buf->data = alloc_percpu(struct trace_array_cpu); + if (!buf->data) { + ring_buffer_free(buf->buffer); + return -ENOMEM; + } + + /* Allocate the first page for all buffers */ + set_buffer_entries(&tr->trace_buffer, + ring_buffer_size(tr->trace_buffer.buffer, 0)); + + return 0; +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ + int ret; + + ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); + if (ret) + return ret; #ifdef CONFIG_TRACER_MAX_TRACE - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, &tracing_max_lat_fops); + ret = allocate_trace_buffer(tr, &tr->max_buffer, + allocate_snapshot ? size : 1); + if (WARN_ON(ret)) { + ring_buffer_free(tr->trace_buffer.buffer); + free_percpu(tr->trace_buffer.data); + return -ENOMEM; + } + tr->allocated_snapshot = allocate_snapshot; + + /* + * Only the top level trace array gets its snapshot allocated + * from the kernel command line. + */ + allocate_snapshot = false; #endif + return 0; +} - trace_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); +static void free_trace_buffer(struct trace_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} - trace_create_file("README", 0444, d_tracer, - NULL, &tracing_readme_fops); +static void free_trace_buffers(struct trace_array *tr) +{ + if (!tr) + return; + + free_trace_buffer(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + free_trace_buffer(&tr->max_buffer); +#endif +} + +static int new_instance_create(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + ret = -ENOMEM; + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + goto out_unlock; + + tr->name = kstrdup(name, GFP_KERNEL); + if (!tr->name) + goto out_free_tr; + + if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) + goto out_free_tr; + + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); + + raw_spin_lock_init(&tr->start_lock); + + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + + tr->current_trace = &nop_trace; + + INIT_LIST_HEAD(&tr->systems); + INIT_LIST_HEAD(&tr->events); + + if (allocate_trace_buffers(tr, trace_buf_size) < 0) + goto out_free_tr; + + tr->dir = debugfs_create_dir(name, trace_instance_dir); + if (!tr->dir) + goto out_free_tr; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) { + debugfs_remove_recursive(tr->dir); + goto out_free_tr; + } + + init_tracer_debugfs(tr, tr->dir); + + list_add(&tr->list, &ftrace_trace_arrays); + + mutex_unlock(&trace_types_lock); + + return 0; + + out_free_tr: + free_trace_buffers(tr); + free_cpumask_var(tr->tracing_cpumask); + kfree(tr->name); + kfree(tr); + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; + +} + +static int instance_delete(const char *name) +{ + struct trace_array *tr; + int found = 0; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + found = 1; + break; + } + } + if (!found) + goto out_unlock; + + ret = -EBUSY; + if (tr->ref) + goto out_unlock; + + list_del(&tr->list); + + tracing_set_nop(tr); + event_trace_del_tracer(tr); + ftrace_destroy_function_files(tr); + debugfs_remove_recursive(tr->dir); + free_trace_buffers(tr); + + kfree(tr->name); + kfree(tr); + + ret = 0; + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +{ + struct dentry *parent; + int ret; + + /* Paranoid: Make sure the parent is the "instances" directory */ + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + if (WARN_ON_ONCE(parent != trace_instance_dir)) + return -ENOENT; + + /* + * The inode mutex is locked, but debugfs_create_dir() will also + * take the mutex. As the instances directory can not be destroyed + * or changed in any other way, it is safe to unlock it, and + * let the dentry try. If two users try to make the same dir at + * the same time, then the new_instance_create() will determine the + * winner. + */ + mutex_unlock(&inode->i_mutex); + + ret = new_instance_create(dentry->d_iname); + + mutex_lock(&inode->i_mutex); + + return ret; +} + +static int instance_rmdir(struct inode *inode, struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + /* Paranoid: Make sure the parent is the "instances" directory */ + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + if (WARN_ON_ONCE(parent != trace_instance_dir)) + return -ENOENT; + + /* The caller did a dget() on dentry */ + mutex_unlock(&dentry->d_inode->i_mutex); + + /* + * The inode mutex is locked, but debugfs_create_dir() will also + * take the mutex. As the instances directory can not be destroyed + * or changed in any other way, it is safe to unlock it, and + * let the dentry try. If two users try to make the same dir at + * the same time, then the instance_delete() will determine the + * winner. + */ + mutex_unlock(&inode->i_mutex); + + ret = instance_delete(dentry->d_iname); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + return ret; +} + +static const struct inode_operations instance_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = instance_mkdir, + .rmdir = instance_rmdir, +}; + +static __init void create_trace_instances(struct dentry *d_tracer) +{ + trace_instance_dir = debugfs_create_dir("instances", d_tracer); + if (WARN_ON(!trace_instance_dir)) + return; + + /* Hijack the dir inode operations, to allow mkdir */ + trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; +} + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +{ + int cpu; + + trace_create_file("available_tracers", 0444, d_tracer, + tr, &show_traces_fops); + + trace_create_file("current_tracer", 0644, d_tracer, + tr, &set_tracer_fops); + + trace_create_file("tracing_cpumask", 0644, d_tracer, + tr, &tracing_cpumask_fops); + + trace_create_file("trace_options", 0644, d_tracer, + tr, &tracing_iter_fops); + + trace_create_file("trace", 0644, d_tracer, + tr, &tracing_fops); trace_create_file("trace_pipe", 0444, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); + tr, &tracing_pipe_fops); trace_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); + tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", 0444, d_tracer, - &global_trace, &tracing_total_entries_fops); + tr, &tracing_total_entries_fops); - trace_create_file("free_buffer", 0644, d_tracer, - &global_trace, &tracing_free_buffer_fops); + trace_create_file("free_buffer", 0200, d_tracer, + tr, &tracing_free_buffer_fops); trace_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); + tr, &tracing_mark_fops); + + trace_create_file("trace_clock", 0644, d_tracer, tr, + &trace_clock_fops); + + trace_create_file("tracing_on", 0644, d_tracer, + tr, &rb_simple_fops); + +#ifdef CONFIG_TRACER_MAX_TRACE + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tr->max_latency, &tracing_max_lat_fops); +#endif + + if (ftrace_create_function_files(tr, d_tracer)) + WARN(1, "Could not allocate function filter files"); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_tracer, + tr, &snapshot_fops); +#endif + + for_each_tracing_cpu(cpu) + tracing_init_debugfs_percpu(tr, cpu); + +} + +static __init int tracer_init_debugfs(void) +{ + struct dentry *d_tracer; + + trace_access_lock_init(); + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + init_tracer_debugfs(&global_trace, d_tracer); + + trace_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); + + trace_create_file("README", 0444, d_tracer, + NULL, &tracing_readme_fops); trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("trace_clock", 0644, d_tracer, NULL, - &trace_clock_fops); + trace_create_file("saved_cmdlines_size", 0644, d_tracer, + NULL, &tracing_saved_cmdlines_size_fops); #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif - create_trace_options_dir(); + create_trace_instances(d_tracer); - for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(cpu); + create_trace_options_dir(&global_trace); return 0; } @@ -4690,8 +6607,8 @@ void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ - if (s->len >= 1000) - s->len = 1000; + if (s->len >= TRACE_MAX_PRINT) + s->len = TRACE_MAX_PRINT; /* should be zero ended, but we are paranoid. */ s->buffer[s->len] = 0; @@ -4704,45 +6621,54 @@ trace_printk_seq(struct trace_seq *s) void trace_init_global_iter(struct trace_iterator *iter) { iter->tr = &global_trace; - iter->trace = current_trace; - iter->cpu_file = TRACE_PIPE_ALL_CPU; + iter->trace = iter->tr->current_trace; + iter->cpu_file = RING_BUFFER_ALL_CPUS; + iter->trace_buffer = &global_trace.trace_buffer; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->trace_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[iter->tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; } -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - static arch_spinlock_t ftrace_dump_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + static atomic_t dump_running; unsigned int old_userobj; - static int dump_ran; unsigned long flags; int cnt = 0, cpu; - /* only one dump */ - local_irq_save(flags); - arch_spin_lock(&ftrace_dump_lock); - if (dump_ran) - goto out; - - dump_ran = 1; + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + /* + * Always turn off tracing when we dump. + * We don't need to show trace output of what happens + * between multiple crashes. + * + * If the user does a sysrq-z, then they can re-enable + * tracing with echo 1 > tracing_on. + */ tracing_off(); - /* Did function tracer already get disabled? */ - if (ftrace_is_dead()) { - printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - printk("# MAY BE MISSING FUNCTION EVENTS\n"); - } - - if (disable_tracing) - ftrace_kill(); + local_irq_save(flags); + /* Simulate the iterator */ trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -4750,13 +6676,9 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - /* Simulate the iterator */ - iter.tr = &global_trace; - iter.trace = current_trace; - switch (oops_dump_mode) { case DUMP_ALL: - iter.cpu_file = TRACE_PIPE_ALL_CPU; + iter.cpu_file = RING_BUFFER_ALL_CPUS; break; case DUMP_ORIG: iter.cpu_file = raw_smp_processor_id(); @@ -4765,11 +6687,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) goto out_enable; default: printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); - iter.cpu_file = TRACE_PIPE_ALL_CPU; + iter.cpu_file = RING_BUFFER_ALL_CPUS; } printk(KERN_TRACE "Dumping ftrace buffer:\n"); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + /* * We need to stop all tracing on all CPUS to read the * the next buffer. This is a bit expensive, but is @@ -4798,6 +6726,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); } + touch_nmi_watchdog(); trace_printk_seq(&iter.seq); } @@ -4808,84 +6737,82 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) printk(KERN_TRACE "---------------------------------\n"); out_enable: - /* Re-enable tracing if requested */ - if (!disable_tracing) { - trace_flags |= old_userobj; + trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); - } - tracing_on(); + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - - out: - arch_spin_unlock(&ftrace_dump_lock); + atomic_dec(&dump_running); local_irq_restore(flags); } - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ - __ftrace_dump(true, oops_dump_mode); -} EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { int ring_buf_size; - enum ring_buffer_flags rb_flags; - int i; int ret = -ENOMEM; if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; + /* Only allocate trace_printk buffers if a trace_printk exists */ + if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + /* Must be called before global_trace.buffer is allocated */ + trace_printk_init_buffers(); + /* To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; else ring_buf_size = 1; - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; - cpumask_copy(tracing_buffer_mask, cpu_possible_mask); - cpumask_copy(tracing_cpumask, cpu_all_mask); + cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); + + raw_spin_lock_init(&global_trace.start_lock); + + /* Used for event triggers */ + temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); + if (!temp_buffer) + goto out_free_cpumask; + + if (trace_create_savedcmd() < 0) + goto out_free_temp_buffer; /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); - if (!global_trace.buffer) { + if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - goto out_free_cpumask; + goto out_free_savedcmd; } - global_trace.entries = ring_buffer_size(global_trace.buffer); + if (global_trace.buffer_disabled) + tracing_off(); -#ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, rb_flags); - if (!max_tr.buffer) { - printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); - WARN_ON(1); - ring_buffer_free(global_trace.buffer); - goto out_free_cpumask; + if (trace_boot_clock) { + ret = tracing_set_clock(&global_trace, trace_boot_clock); + if (ret < 0) + pr_warning("Trace clock %s not defined, going back to default\n", + trace_boot_clock); } - max_tr.entries = 1; -#endif - /* Allocate the first page for all buffers */ - for_each_tracing_cpu(i) { - global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_tr_data, i); - } + /* + * register_tracer() might reference current_trace, so it + * needs to be set before we register anything. This is + * just a bootstrap of current_trace anyway. + */ + global_trace.current_trace = &nop_trace; + + global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - trace_init_cmdlines(); + ftrace_init_global_array_ops(&global_trace); register_tracer(&nop_trace); - current_trace = &nop_trace; + /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -4894,10 +6821,29 @@ __init static int tracer_alloc_buffers(void) register_die_notifier(&trace_die_notifier); + global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + + INIT_LIST_HEAD(&global_trace.systems); + INIT_LIST_HEAD(&global_trace.events); + list_add(&global_trace.list, &ftrace_trace_arrays); + + while (trace_boot_options) { + char *option; + + option = strsep(&trace_boot_options, ","); + trace_set_options(&global_trace, option); + } + + register_snapshot_cmd(); + return 0; +out_free_savedcmd: + free_saved_cmdlines_buffer(savedcmd); +out_free_temp_buffer: + ring_buffer_free(temp_buffer); out_free_cpumask: - free_cpumask_var(tracing_cpumask); + free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); out: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b93ecbadad6..9258f5a815d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,3 +1,4 @@ + #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H @@ -12,6 +13,12 @@ #include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> +#include <linux/compiler.h> + +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h> /* For NR_SYSCALLS */ +#include <asm/syscall.h> /* some archs define it here */ +#endif enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -29,6 +36,7 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BLK, + TRACE_BPUTS, __TRACE_LAST_TYPE, }; @@ -56,17 +64,23 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ - struct struct_name { \ - struct trace_entry ent; \ - tstruct \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ } #undef TP_ARGS #define TP_ARGS(args...) args #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -112,10 +126,13 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, }; #define TRACE_BUF_SIZE 1024 +struct trace_array; + /* * The CPU trace array - it consists of thousands of trace entries * plus some other descriptor data: (for example which task started @@ -125,6 +142,7 @@ struct trace_array_cpu { atomic_t disabled; void *buffer_page; /* ring buffer spare */ + unsigned long entries; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -135,24 +153,114 @@ struct trace_array_cpu { unsigned long skipped_entries; cycle_t preempt_timestamp; pid_t pid; - uid_t uid; + kuid_t uid; char comm[TASK_COMM_LEN]; }; +struct tracer; + +struct trace_buffer { + struct trace_array *tr; + struct ring_buffer *buffer; + struct trace_array_cpu __percpu *data; + cycle_t time_start; + int cpu; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: */ struct trace_array { - struct ring_buffer *buffer; - unsigned long entries; - int cpu; - cycle_t time_start; - struct task_struct *waiter; - struct trace_array_cpu *data[NR_CPUS]; + struct list_head list; + char *name; + struct trace_buffer trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * The max_buffer is used to snapshot the trace when a maximum + * latency is reached, or when the user initiates a snapshot. + * Some tracers will use this to store a maximum trace while + * it continues examining live traces. + * + * The buffers for the max_buffer are set up the same as the trace_buffer + * When a snapshot is taken, the buffer of the max_buffer is swapped + * with the buffer of the trace_buffer and the buffers are reset for + * the trace_buffer so the tracing can continue. + */ + struct trace_buffer max_buffer; + bool allocated_snapshot; + unsigned long max_latency; +#endif + /* + * max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a arch_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ + arch_spinlock_t max_lock; + int buffer_disabled; +#ifdef CONFIG_FTRACE_SYSCALLS + int sys_refcount_enter; + int sys_refcount_exit; + struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; + struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; +#endif + int stop_count; + int clock_id; + struct tracer *current_trace; + unsigned int flags; + raw_spinlock_t start_lock; + struct dentry *dir; + struct dentry *options; + struct dentry *percpu_dir; + struct dentry *event_dir; + struct list_head systems; + struct list_head events; + cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ + int ref; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops *ops; + /* function tracing enabled */ + int function_enabled; +#endif }; +enum { + TRACE_ARRAY_FL_GLOBAL = (1 << 0) +}; + +extern struct list_head ftrace_trace_arrays; + +extern struct mutex trace_types_lock; + +extern int trace_array_get(struct trace_array *tr); +extern void trace_array_put(struct trace_array *tr); + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. + */ +static inline struct trace_array *top_trace_array(void) +{ + struct trace_array *tr; + + if (list_empty(&ftrace_trace_arrays)) + return NULL; + + tr = list_entry(ftrace_trace_arrays.prev, + typeof(*tr), list); + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + return tr; +} + #define FTRACE_CMP_TYPE(var, type) \ __builtin_types_compatible_p(typeof(var), type *) @@ -188,6 +296,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -232,7 +341,6 @@ struct tracer_flags { * @stop: called when tracing is paused (echo 0 > tracing_enabled) * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened - * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe @@ -251,7 +359,6 @@ struct tracer { void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); - void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, @@ -270,24 +377,69 @@ struct tracer { void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ - int (*set_flag)(u32 old_flags, u32 bit, int set); + int (*set_flag)(struct trace_array *tr, + u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct trace_array *tr, + u32 mask, int set); struct tracer *next; struct tracer_flags *flags; - int print_max; - int use_max_tr; + int enabled; + bool print_max; + bool allow_instances; +#ifdef CONFIG_TRACER_MAX_TRACE + bool use_max_tr; +#endif }; /* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not suppport a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { + TRACE_BUFFER_BIT, + TRACE_BUFFER_NMI_BIT, + TRACE_BUFFER_IRQ_BIT, + TRACE_BUFFER_SIRQ_BIT, + + /* Start of function recursion bits */ + TRACE_FTRACE_BIT, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + + /* INTERNAL_BITs must be greater than FTRACE_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_CONTROL_BIT, -/* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) /* * Abuse of the trace_recursion. * As we need a way to maintain state if we are tracing the function @@ -295,28 +447,98 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<13) + TRACE_IRQ_BIT, +}; + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) + +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) + return -1; + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + if (!bit) + return; -#define TRACE_PIPE_ALL_CPU -1 + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + if (iter->buffer_iter && iter->buffer_iter[cpu]) + return iter->buffer_iter[cpu]; + return NULL; +} int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void trace_wake_up(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); -void tracing_reset_current_online_cpus(void); +void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +bool tracing_is_disabled(void); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *tracing_init_dentry_tr(struct trace_array *tr); struct dentry *tracing_init_dentry(void); struct ring_buffer_event; @@ -327,9 +549,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long len, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -337,6 +556,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); +void __buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + int trace_empty(struct trace_iterator *iter); void *trace_find_next_entry_inc(struct trace_iterator *iter); @@ -345,14 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void default_wait_pipe(struct trace_iterator *iter); -void poll_wait_pipe(struct trace_iterator *iter); - -void ftrace(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long ip, - unsigned long parent_ip, - unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, @@ -385,12 +599,9 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr); void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); -void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; + +loff_t tracing_lseek(struct file *file, loff_t offset, int whence); extern cpumask_var_t __read_mostly tracing_buffer_mask; @@ -402,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -extern unsigned long tracing_max_latency; - void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); @@ -450,13 +659,13 @@ extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +#endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -#endif -extern int ring_buffer_expanded; +extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; DECLARE_PER_CPU(int, ftrace_cpu_disabled); @@ -479,6 +688,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +/* + * Tracer data references selftest functions that only occur + * on boot up. These can be __init functions. Thus, when selftests + * are enabled, then the tracers need to reference __init functions. + */ +#define __tracer_data __refdata +#else +/* Tracers are seldom changed. Optimize when selftests are disabled. */ +#define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -492,13 +710,13 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; -extern int trace_clock_id; - /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -509,6 +727,10 @@ extern int trace_clock_id; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 +#define TRACE_GRAPH_PRINT_TAIL 0x80 +#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 +#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); @@ -528,15 +750,16 @@ extern void __trace_graph_return(struct trace_array *tr, #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern int ftrace_graph_notrace_count; +extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_filter_enabled) + if (!ftrace_graph_count) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -556,11 +779,31 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_notrace_count) + return 0; + + for (i = 0; i < ftrace_graph_notrace_count; i++) { + if (addr == ftrace_graph_notrace_funcs[i]) + return 1; + } + + return 0; +} #else static inline int ftrace_graph_addr(unsigned long addr) { return 1; } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + return 0; +} #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t @@ -573,6 +816,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct task_struct *task) { if (list_empty(&ftrace_pids)) @@ -581,13 +825,47 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } extern int ftrace_is_dead(void); +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent); +void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); +int using_ftrace_ops_list_func(void); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } static inline int ftrace_is_dead(void) { return 0; } -#endif +static inline int +ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + return 0; +} +static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER */ + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent); +void ftrace_destroy_filter_files(struct ftrace_ops *ops); +#else +/* + * The ops parameter passed in is usually undefined. + * This must be a macro. + */ +#define ftrace_create_filter_files(ops, parent) do { } while (0) +#define ftrace_destroy_filter_files(ops) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ + +int ftrace_event_is_function(struct ftrace_event_call *call); /* * struct trace_parser - servers for reading the user input separated by spaces @@ -656,6 +934,8 @@ enum trace_iterator_flags { TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, TRACE_ITER_IRQ_INFO = 0x800000, + TRACE_ITER_MARKERS = 0x1000000, + TRACE_ITER_FUNCTION = 0x2000000, }; /* @@ -694,16 +974,10 @@ static inline void trace_branch_disable(void) /* set ring buffers to default size if not already done so */ int tracing_update_buffers(void); -/* trace event type bit fields, not numeric */ -enum { - TRACE_EVENT_TYPE_PRINTF = 1, - TRACE_EVENT_TYPE_RAW = 2, -}; - struct ftrace_event_field { struct list_head link; - char *name; - char *type; + const char *name; + const char *type; int filter_type; int offset; int size; @@ -721,12 +995,19 @@ struct event_filter { struct event_subsystem { struct list_head list; const char *name; - struct dentry *entry; struct event_filter *filter; - int nr_events; int ref_count; }; +struct ftrace_subsystem_dir { + struct list_head list; + struct event_subsystem *subsystem; + struct trace_array *tr; + struct dentry *entry; + int ref_count; + int nr_events; +}; + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) @@ -766,9 +1047,7 @@ struct filter_pred { u64 val; struct regex regex; unsigned short *ops; -#ifdef CONFIG_FTRACE_STARTUP_TEST struct ftrace_event_field *field; -#endif int offset; int not; int op; @@ -778,52 +1057,255 @@ struct filter_pred { unsigned short right; }; -extern struct list_head ftrace_common_fields; - extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_call *call, +extern int apply_event_filter(struct ftrace_event_file *file, char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); +extern int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp); +extern void free_event_filter(struct event_filter *filter); -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call); +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name); -static inline int -filter_check_discard(struct ftrace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - ring_buffer_discard_commit(buffer, event); - return 1; - } +extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr); - return 0; -} +extern struct ftrace_event_file *find_event_file(struct trace_array *tr, + const char *system, + const char *event); -extern void trace_event_enable_cmd_record(bool enable); +static inline void *event_file_data(struct file *filp) +{ + return ACCESS_ONCE(file_inode(filp)->i_private); +} extern struct mutex event_mutex; extern struct list_head ftrace_events; +extern const struct file_operations event_trigger_fops; + +extern int register_trigger_cmds(void); +extern void clear_event_triggers(struct trace_array *tr); + +struct event_trigger_data { + unsigned long count; + int ref; + struct event_trigger_ops *ops; + struct event_command *cmd_ops; + struct event_filter __rcu *filter; + char *filter_str; + void *private_data; + struct list_head list; +}; + +/** + * struct event_trigger_ops - callbacks for trace event triggers + * + * The methods in this structure provide per-event trigger hooks for + * various trigger operations. + * + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @func: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command). + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). + */ +struct event_trigger_ops { + void (*func)(struct event_trigger_data *data); + int (*init)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + void (*free)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +}; + +/** + * struct event_command - callbacks and data members for event commands + * + * Event commands are invoked by users by writing the command name + * into the 'trigger' file associated with a trace event. The + * parameters associated with a specific invocation of an event + * command are used to create an event trigger instance, which is + * added to the list of trigger instances associated with that trace + * event. When the event is hit, the set of triggers associated with + * that event is invoked. + * + * The data members in this structure provide per-event command data + * for various event commands. + * + * All the data members below, except for @post_trigger, must be set + * for each event command. + * + * @name: The unique name that identifies the event command. This is + * the name used when setting triggers via trigger files. + * + * @trigger_type: A unique id that identifies the event command + * 'type'. This value has two purposes, the first to ensure that + * only one trigger of the same type can be set at a given time + * for a particular event e.g. it doesn't make sense to have both + * a traceon and traceoff trigger attached to a single event at + * the same time, so traceon and traceoff have the same type + * though they have different names. The @trigger_type value is + * also used as a bit value for deferring the actual trigger + * action until after the current event is finished. Some + * commands need to do this if they themselves log to the trace + * buffer (see the @post_trigger() member below). @trigger_type + * values are defined by adding new values to the trigger_type + * enum in include/linux/ftrace_event.h. + * + * @post_trigger: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * All the methods below, except for @set_filter(), must be + * implemented. + * + * @func: The callback function responsible for parsing and + * registering the trigger written to the 'trigger' file by the + * user. It allocates the trigger instance and registers it with + * the appropriate trace event. It makes use of the other + * event_command callback functions to orchestrate this, and is + * usually implemented by the generic utility function + * @event_trigger_callback() (see trace_event_triggers.c). + * + * @reg: Adds the trigger to the list of triggers associated with the + * event, and enables the event trigger itself, after + * initializing it (via the event_trigger_ops @init() function). + * This is also where commands can use the @trigger_type value to + * make the decision as to whether or not multiple instances of + * the trigger should be allowed. This is usually implemented by + * the generic utility function @register_trigger() (see + * trace_event_triggers.c). + * + * @unreg: Removes the trigger from the list of triggers associated + * with the event, and disables the event trigger itself, after + * initializing it (via the event_trigger_ops @free() function). + * This is usually implemented by the generic utility function + * @unregister_trigger() (see trace_event_triggers.c). + * + * @set_filter: An optional function called to parse and set a filter + * for the trigger. If no @set_filter() method is set for the + * event command, filters set by the user for the command will be + * ignored. This is usually implemented by the generic utility + * function @set_trigger_filter() (see trace_event_triggers.c). + * + * @get_trigger_ops: The callback function invoked to retrieve the + * event_trigger_ops implementation associated with the command. + */ +struct event_command { + struct list_head list; + char *name; + enum event_trigger_type trigger_type; + bool post_trigger; + int (*func)(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *params); + int (*reg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + void (*unreg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + int (*set_filter)(char *filter_str, + struct event_trigger_data *data, + struct ftrace_event_file *file); + struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); +}; + +extern int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable); +extern int tracing_alloc_snapshot(void); + extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; +extern const char *__start___tracepoint_str[]; +extern const char *__stop___tracepoint_str[]; + +void trace_printk_init_buffers(void); +void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. + */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) + #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_##call; + __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 00000000000..40a14cbcf8e --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,198 @@ +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace_clock.h> + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is writen within the statics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ + u64 start; + u64 stop; + u64 delta; + u64 stddev; + u64 seed; + u64 last_seed; + unsigned int avg; + unsigned int std = 0; + + /* Only run if the tracepoint is actually active */ + if (!trace_benchmark_event_enabled()) + return; + + local_irq_disable(); + start = trace_clock_local(); + trace_benchmark_event(bm_str); + stop = trace_clock_local(); + local_irq_enable(); + + bm_cnt++; + + delta = stop - start; + + /* + * The first read is cold cached, keep it separate from the + * other calculations. + */ + if (bm_cnt == 1) { + bm_first = delta; + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "first=%llu [COLD CACHED]", bm_first); + return; + } + + bm_last = delta; + + if (delta > bm_max) + bm_max = delta; + if (!bm_min || delta < bm_min) + bm_min = delta; + + /* + * When bm_cnt is greater than UINT_MAX, it breaks the statistics + * accounting. Freeze the statistics when that happens. + * We should have enough data for the avg and stddev anyway. + */ + if (bm_cnt > UINT_MAX) { + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); + return; + } + + bm_total += delta; + bm_totalsq += delta * delta; + + + if (bm_cnt > 1) { + /* + * Apply Welford's method to calculate standard deviation: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; + do_div(stddev, (u32)bm_cnt); + do_div(stddev, (u32)bm_cnt - 1); + } else + stddev = 0; + + delta = bm_total; + do_div(delta, bm_cnt); + avg = delta; + + if (stddev > 0) { + int i = 0; + /* + * stddev is the square of standard deviation but + * we want the actualy number. Use the average + * as our seed to find the std. + * + * The next try is: + * x = (x + N/x) / 2 + * + * Where N is the squared number to find the square + * root of. + */ + seed = avg; + do { + last_seed = seed; + seed = stddev; + if (!last_seed) + break; + do_div(seed, last_seed); + seed += last_seed; + do_div(seed, 2); + } while (i++ < 10 && last_seed != seed); + + std = seed; + } + + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + + bm_std = std; + bm_avg = avg; + bm_stddev = stddev; +} + +static int benchmark_event_kthread(void *arg) +{ + /* sleep a bit to make sure the tracepoint gets activated */ + msleep(100); + + while (!kthread_should_stop()) { + + trace_do_benchmark(); + + /* + * We don't go to sleep, but let others + * run as well. + */ + cond_resched(); + } + + return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. + */ +void trace_benchmark_reg(void) +{ + bm_event_thread = kthread_run(benchmark_event_kthread, + NULL, "event_benchmark"); + WARN_ON(!bm_event_thread); +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. + */ +void trace_benchmark_unreg(void) +{ + if (!bm_event_thread) + return; + + kthread_stop(bm_event_thread); + + strcpy(bm_str, "START"); + bm_total = 0; + bm_totalsq = 0; + bm_last = 0; + bm_max = 0; + bm_min = 0; + bm_cnt = 0; + /* These don't need to be reset but reset them anyway */ + bm_first = 0; + bm_std = 0; + bm_avg = 0; + bm_stddev = 0; +} diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 00000000000..3c1df1df4e2 --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include <linux/tracepoint.h> + +extern void trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN 128 + +TRACE_EVENT_FN(benchmark_event, + + TP_PROTO(const char *str), + + TP_ARGS(str), + + TP_STRUCT__entry( + __array( char, str, BENCHMARK_EVENT_STRLEN ) + ), + + TP_fast_assign( + memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + ), + + TP_printk("%s", __entry->str), + + trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 8d3538b4ea5..697fb9bac8f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; + struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; struct ring_buffer *buffer; @@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) local_irq_save(flags); cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + if (atomic_inc_return(&data->disabled) != 1) goto out; pc = preempt_count(); - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) @@ -76,11 +78,11 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); out: - atomic_dec(&tr->data[cpu]->disabled); + atomic_dec(&data->disabled); local_irq_restore(flags); } @@ -199,7 +201,7 @@ __init static int init_branch_tracer(void) } return register_tracer(&branch_trace); } -device_initcall(init_branch_tracer); +core_initcall(init_branch_tracer); #else static inline diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cb..57b67b1f24d 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -21,8 +21,6 @@ #include <linux/ktime.h> #include <linux/trace_clock.h> -#include "trace.h" - /* * trace_clock_local(): the simplest and least coherent tracing clock. * @@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void) return clock; } +EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, @@ -58,6 +57,17 @@ u64 notrace trace_clock(void) return local_clock(); } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + * Note that this use of jiffies_64 is not completely safe on + * 32-bit systems. But the window is tiny, and the effect if + * we are affected is that we will have an obviously bogus + * timestamp on a trace event - i.e. not life threatening. + */ +u64 notrace trace_clock_jiffies(void) +{ + return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); +} /* * trace_clock_global(): special globally coherent trace clock @@ -86,7 +96,7 @@ u64 notrace trace_clock_global(void) local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = cpu_clock(this_cpu); + now = sched_clock_cpu(this_cpu); /* * If in an NMI context then dont risk lockups and return the * cpu_clock() time: diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f21..e2d027ac66a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -55,7 +55,7 @@ /* * Function trace entry - function address and parent function address: */ -FTRACE_ENTRY(function, ftrace_entry, +FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, @@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + + FILTER_TRACE_FN, + + perf_ftrace_event_register ); /* Function call entry */ @@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth) + F_printk("--> %lx (%d)", __entry->func, __entry->depth), + + FILTER_OTHER ); /* Function return entry */ @@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", __entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth) + __entry->depth), + + FILTER_OTHER ); /* @@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -156,6 +166,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, #define FTRACE_STACK_ENTRIES 8 +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, @@ -165,11 +181,14 @@ FTRACE_ENTRY(kernel_stack, stack_entry, __dynamic_array(unsigned long, caller ) ), - F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -181,11 +200,14 @@ FTRACE_ENTRY(user_stack, userstack_entry, __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), - F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); /* @@ -201,8 +223,10 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt) + F_printk("%pf: %s", + (void *)__entry->ip, __entry->fmt), + + FILTER_OTHER ); FTRACE_ENTRY(print, print_entry, @@ -214,8 +238,25 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - F_printk("%08lx %s", - __entry->ip, __entry->buf) + F_printk("%pf: %s", + (void *)__entry->ip, __entry->buf), + + FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + + TRACE_BPUTS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, str ) + ), + + F_printk("%pf: %s", + (void *)__entry->ip, __entry->str), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -234,7 +275,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width) + __entry->map_id, __entry->opcode, __entry->width), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -252,7 +295,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode) + __entry->map_id, __entry->opcode), + + FILTER_OTHER ); @@ -272,6 +317,8 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)", __entry->line, - __entry->func, __entry->file, __entry->correct) + __entry->func, __entry->file, __entry->correct), + + FILTER_OTHER ); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 19a359d5e6d..5d12bb407b4 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,33 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + if (tp_event->perf_perm) { + int ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + + /* The ftrace function trace is allowed only for root. */ + if (ftrace_event_is_function(tp_event)) { + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * We don't allow user space callchains for function trace + * event, due to issues with page faults while tracing page + * fault handler and its overall trickiness nature. + */ + if (!p_event->attr.exclude_callchain_user) + return -EINVAL; + + /* + * Same reason to disable user stack dump as for user space + * callchains above. + */ + if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) + return -EINVAL; + } + /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) return 0; @@ -44,23 +71,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return 0; } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, - struct perf_event *p_event) +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { struct hlist_head __percpu *list; - int ret; + int ret = -ENOMEM; int cpu; - ret = perf_trace_event_perm(tp_event, p_event); - if (ret) - return ret; - p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; - ret = -ENOMEM; - list = alloc_percpu(struct hlist_head); if (!list) goto fail; @@ -83,7 +104,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); if (ret) goto fail; @@ -108,10 +129,73 @@ fail: return ret; } +static void perf_trace_event_unreg(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_reg(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_open(p_event); + if (ret) { + perf_trace_event_unreg(p_event); + return ret; + } + + return 0; +} + int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; + u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); @@ -130,6 +214,14 @@ int perf_trace_init(struct perf_event *p_event) return ret; } +void perf_trace_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); +} + int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; @@ -146,47 +238,18 @@ int perf_trace_add(struct perf_event *p_event, int flags) list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); - return 0; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); } void perf_trace_del(struct perf_event *p_event, int flags) { - hlist_del_rcu(&p_event->hlist_entry); -} - -void perf_trace_destroy(struct perf_event *p_event) -{ struct ftrace_event_call *tp_event = p_event->tp_event; - int i; - - mutex_lock(&event_mutex); - if (--tp_event->perf_refcount > 0) - goto out; - - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - - /* - * Ensure our callback won't be called anymore. The buffers - * will be freed after that. - */ - tracepoint_synchronize_unregister(); - - free_percpu(tp_event->perf_events); - tp_event->perf_events = NULL; - - if (!--total_ref_count) { - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - free_percpu(perf_trace_buf[i]); - perf_trace_buf[i] = NULL; - } - } -out: - module_put(tp_event->mod); - mutex_unlock(&event_mutex); + hlist_del_rcu(&p_event->hlist_entry); + tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } -__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) +void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -195,6 +258,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) + return NULL; + pc = preempt_count(); *rctxp = perf_swevent_get_recursion_context(); @@ -214,3 +281,90 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *pt_regs) +{ + struct ftrace_entry *entry; + struct hlist_head *head; + struct pt_regs regs; + int rctx; + + head = this_cpu_ptr(event_function.perf_events); + if (hlist_empty(head)) + return; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ + sizeof(u64)) - sizeof(u32)) + + BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + + perf_fetch_caller_regs(®s); + + entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + if (!entry) + return; + + entry->ip = ip; + entry->parent_ip = parent_ip; + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + 1, ®s, head, NULL); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + + ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->func = perf_ftrace_function_call; + return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + int ret = unregister_ftrace_function(ops); + ftrace_free_filter(ops); + return ret; +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ + ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ + ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + switch (type) { + case TRACE_REG_REGISTER: + case TRACE_REG_UNREGISTER: + break; + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + return 0; + case TRACE_REG_PERF_OPEN: + return perf_ftrace_function_register(data); + case TRACE_REG_PERF_CLOSE: + return perf_ftrace_function_unregister(data); + case TRACE_REG_PERF_ADD: + perf_ftrace_function_enable(data); + return 0; + case TRACE_REG_PERF_DEL: + perf_ftrace_function_disable(data); + return 0; + } + + return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c212a7f934e..2de53628689 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,16 +27,45 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); +LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_common_fields); -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) -LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +#define SYSTEM_FL_FREE_NAME (1 << 31) + +static inline int system_refcount(struct event_subsystem *system) +{ + return system->ref_count & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_inc(struct event_subsystem *system) +{ + return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_dec(struct event_subsystem *system) +{ + return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; +} -struct list_head * +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + list_for_each_entry(file, &tr->events, list) + +#define do_for_each_event_file_safe(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + struct ftrace_event_file *___n; \ + list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file() \ + } + +static struct list_head * trace_get_fields(struct ftrace_event_call *event_call) { if (!event_call->class->get_fields) @@ -44,23 +73,45 @@ trace_get_fields(struct ftrace_event_call *event_call) return event_call->class->get_fields(event_call); } +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ + struct ftrace_event_field *field; + + list_for_each_entry(field, head, link) { + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct ftrace_event_field *field; - field = kzalloc(sizeof(*field), GFP_KERNEL); + field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) - goto err; - - field->name = kstrdup(name, GFP_KERNEL); - if (!field->name) - goto err; + return -ENOMEM; - field->type = kstrdup(type, GFP_KERNEL); - if (!field->type) - goto err; + field->name = name; + field->type = type; if (filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(type); @@ -74,13 +125,6 @@ static int __trace_define_field(struct list_head *head, const char *type, list_add(&field->link, head); return 0; - -err: - if (field) - kfree(field->name); - kfree(field); - - return -ENOMEM; } int trace_define_field(struct ftrace_event_call *call, const char *type, @@ -116,12 +160,11 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, padding); return ret; } -void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; @@ -129,9 +172,7 @@ void trace_destroy_fields(struct ftrace_event_call *call) head = trace_get_fields(call); list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); - kfree(field->type); - kfree(field->name); - kfree(field); + kmem_cache_free(field_cachep, field); } } @@ -147,29 +188,68 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len) { + struct ftrace_event_call *event_call = ftrace_file->event_call; + + local_save_flags(fbuffer->flags); + fbuffer->pc = preempt_count(); + fbuffer->ftrace_file = ftrace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, + event_call->event.type, len, + fbuffer->flags, fbuffer->pc); + if (!fbuffer->event) + return NULL; + + fbuffer->entry = ring_buffer_event_data(fbuffer->event); + return fbuffer->entry; +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +{ + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); + +int ftrace_event_reg(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + struct ftrace_event_file *file = data; + + WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { case TRACE_REG_REGISTER: - return tracepoint_probe_register(call->name, + return tracepoint_probe_register(call->tp, call->class->probe, - call); + file); case TRACE_REG_UNREGISTER: - tracepoint_probe_unregister(call->name, + tracepoint_probe_unregister(call->tp, call->class->probe, - call); + file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return tracepoint_probe_register(call->name, + return tracepoint_probe_register(call->tp, call->class->perf_probe, call); case TRACE_REG_PERF_UNREGISTER: - tracepoint_probe_unregister(call->name, + tracepoint_probe_unregister(call->tp, call->class->perf_probe, call); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; @@ -178,54 +258,108 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg); void trace_event_enable_cmd_record(bool enable) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + do_for_each_event_file(tr, file) { + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } - } + } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, - int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) { + struct ftrace_event_call *call = file->event_call; int ret = 0; + int disable; switch (enable) { case 0: - if (call->flags & TRACE_EVENT_FL_ENABLED) { - call->flags &= ~TRACE_EVENT_FL_ENABLED; - if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + /* + * When soft_disable is set and enable is cleared, the sm_ref + * reference counter is decremented. If it reaches 0, we want + * to clear the SOFT_DISABLED flag but leave the event in the + * state that it was. That is, if the event was enabled and + * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED + * is set we do not want the event to be enabled before we + * clear the bit. + * + * When soft_disable is not set but the SOFT_MODE flag is, + * we do nothing. Do not disable the tracepoint, otherwise + * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + */ + if (soft_disable) { + if (atomic_dec_return(&file->sm_ref) > 0) + break; + disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; + clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } else + disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + + if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { + clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER); + call->class->reg(call, TRACE_REG_UNREGISTER, file); } + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ + if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { + /* + * When soft_disable is set and enable is set, we want to + * register the tracepoint for the event, but leave the event + * as is. That means, if the event was already enabled, we do + * nothing (but set SOFT_MODE). If the event is disabled, we + * set SOFT_DISABLED before enabling the event tracepoint, so + * it still seems to be disabled. + */ + if (!soft_disable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else { + if (atomic_inc_return(&file->sm_ref) > 1) + break; + set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + + /* Keep the event disabled, when going to SOFT_MODE. */ + if (soft_disable) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + if (trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } - ret = call->class->reg(call, TRACE_REG_REGISTER); + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " - "%s\n", call->name); + "%s\n", ftrace_event_name(call)); break; } - call->flags |= TRACE_EVENT_FL_ENABLED; + set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + + /* WAS_ENABLED gets set but never cleared. */ + call->flags |= TRACE_EVENT_FL_WAS_ENABLED; } break; } @@ -233,13 +367,25 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, return ret; } -static void ftrace_clear_events(void) +int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) { - struct ftrace_event_call *call; + return __ftrace_event_enable_disable(file, enable, soft_disable); +} + +static int ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable) +{ + return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ + struct ftrace_event_file *file; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - ftrace_event_enable_disable(call, 0); + list_for_each_entry(file, &tr->events, list) { + ftrace_event_enable_disable(file, 0); } mutex_unlock(&event_mutex); } @@ -248,67 +394,141 @@ static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; - WARN_ON_ONCE(system->ref_count == 0); - if (--system->ref_count) + WARN_ON_ONCE(system_refcount(system) == 0); + if (system_refcount_dec(system)) return; + list_del(&system->list); + if (filter) { kfree(filter->filter_string); kfree(filter); } - kfree(system->name); + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); kfree(system); } static void __get_system(struct event_subsystem *system) { - WARN_ON_ONCE(system->ref_count == 0); - system->ref_count++; + WARN_ON_ONCE(system_refcount(system) == 0); + system_refcount_inc(system); +} + +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + dir->ref_count++; + __get_system(dir->subsystem); } -static void put_system(struct event_subsystem *system) +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + /* If the subsystem is about to be freed, the dir must be too */ + WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); + + __put_system(dir->subsystem); + if (!--dir->ref_count) + kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir) { mutex_lock(&event_mutex); - __put_system(system); + __put_system_dir(dir); mutex_unlock(&event_mutex); } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + debugfs_remove_recursive(dir->entry); + list_del(&dir->list); + __put_system_dir(dir); + } +} + +static void remove_event_file_dir(struct ftrace_event_file *file) +{ + struct dentry *dir = file->dir; + struct dentry *child; + + if (dir) { + spin_lock(&dir->d_lock); /* probably unneeded */ + list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { + if (child->d_inode) /* probably unneeded */ + child->d_inode->i_private = NULL; + } + spin_unlock(&dir->d_lock); + + debugfs_remove_recursive(dir); + } + + list_del(&file->list); + remove_subsystem(file->system); + free_event_filter(file->filter); + kmem_cache_free(file_cachep, file); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ -static int __ftrace_set_clr_event(const char *match, const char *sub, - const char *event, int set) +static int +__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) { + struct ftrace_event_file *file; struct ftrace_event_call *call; + const char *name; int ret = -EINVAL; - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + name = ftrace_event_name(call); - if (!call->name || !call->class || !call->class->reg) + if (!name || !call->class || !call->class->reg) + continue; + + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; if (match && - strcmp(match, call->name) != 0 && + strcmp(match, name) != 0 && strcmp(match, call->class->system) != 0) continue; if (sub && strcmp(sub, call->class->system) != 0) continue; - if (event && strcmp(event, call->name) != 0) + if (event && strcmp(event, name) != 0) continue; - ftrace_event_enable_disable(call, set); + ftrace_event_enable_disable(file, set); ret = 0; } + + return ret; +} + +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) +{ + int ret; + + mutex_lock(&event_mutex); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); mutex_unlock(&event_mutex); return ret; } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; @@ -336,7 +556,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } - return __ftrace_set_clr_event(match, sub, event, set); + return __ftrace_set_clr_event(tr, match, sub, event, set); } /** @@ -353,7 +573,12 @@ static int ftrace_set_clr_event(char *buf, int set) */ int trace_set_clr_event(const char *system, const char *event, int set) { - return __ftrace_set_clr_event(NULL, system, event, set); + struct trace_array *tr = top_trace_array(); + + if (!tr) + return -ENODEV; + + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -365,6 +590,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; + struct seq_file *m = file->private_data; + struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) @@ -387,7 +614,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(parser.buffer + !set, set); + ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } @@ -403,17 +630,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { + list_for_each_entry_continue(file, &tr->events, list) { + call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->class && call->class->reg) - return call; + return file; } return NULL; @@ -421,30 +651,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = t_next(m, call, &l); - if (!call) + file = t_next(m, file, &l); + if (!file) break; } - return call; + return file; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->flags & TRACE_EVENT_FL_ENABLED) - return call; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return file; } return NULL; @@ -452,27 +684,29 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = s_next(m, call, &l); - if (!call) + file = s_next(m, file, &l); + if (!file) break; } - return call; + return file; } static int t_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); - seq_printf(m, "%s\n", call->name); + seq_printf(m, "%s\n", ftrace_event_name(call)); return 0; } @@ -482,39 +716,41 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static int -ftrace_event_seq_open(struct inode *inode, struct file *file) -{ - const struct seq_operations *seq_ops; - - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - ftrace_clear_events(); - - seq_ops = inode->i_private; - return seq_open(file, seq_ops); -} - static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; - char *buf; + struct ftrace_event_file *file; + unsigned long flags; + char buf[4] = "0"; - if (call->flags & TRACE_EVENT_FL_ENABLED) - buf = "1\n"; - else - buf = "0\n"; + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (likely(file)) + flags = file->flags; + mutex_unlock(&event_mutex); + + if (!file) + return -ENODEV; + + if (flags & FTRACE_EVENT_FL_ENABLED && + !(flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + strcpy(buf, "1"); + + if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || + flags & FTRACE_EVENT_FL_SOFT_MODE) + strcat(buf, "*"); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + strcat(buf, "\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file; unsigned long val; int ret; @@ -529,8 +765,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: case 1: + ret = -ENODEV; mutex_lock(&event_mutex); - ret = ftrace_event_enable_disable(call, val); + file = event_file_data(filp); + if (likely(file)) + ret = ftrace_event_enable_disable(file, val); mutex_unlock(&event_mutex); break; @@ -548,15 +787,19 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = dir->tr; char buf[2]; int set = 0; int ret; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || !call->class->reg) + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; + if (!ftrace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) @@ -567,7 +810,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); + set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -589,7 +832,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; @@ -612,7 +856,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); if (ret) goto out; @@ -632,71 +876,45 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_field *field; + struct ftrace_event_call *call = event_file_data(m->private); struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); + struct list_head *node = v; (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - if (unlikely(list_empty(common_head))) - return NULL; - - field = list_entry(common_head->prev, - struct ftrace_event_field, link); - return field; + node = common_head; + break; case FORMAT_FIELD_SEPERATOR: - if (unlikely(list_empty(head))) - return NULL; - - field = list_entry(head->prev, struct ftrace_event_field, link); - return field; + node = head; + break; case FORMAT_PRINTFMT: /* all done */ return NULL; } - field = v; - if (field->link.prev == common_head) + node = node->prev; + if (node == common_head) return (void *)FORMAT_FIELD_SEPERATOR; - else if (field->link.prev == head) + else if (node == head) return (void *)FORMAT_PRINTFMT; - - field = list_entry(field->link.prev, struct ftrace_event_field, link); - - return field; -} - -static void *f_start(struct seq_file *m, loff_t *pos) -{ - loff_t l = 0; - void *p; - - /* Start by showing the header */ - if (!*pos) - return (void *)FORMAT_HEADER; - - p = (void *)FORMAT_HEADER; - do { - p = f_next(m, p, &l); - } while (p && l < *pos); - - return p; + else + return node; } static int f_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = m->private; + struct ftrace_event_call *call = event_file_data(m->private); struct ftrace_event_field *field; const char *array_descriptor; switch ((unsigned long)v) { case FORMAT_HEADER: - seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "name: %s\n", ftrace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_printf(m, "format:\n"); return 0; @@ -711,8 +929,7 @@ static int f_show(struct seq_file *m, void *v) return 0; } - field = v; - + field = list_entry(v, struct ftrace_event_field, link); /* * Smartly shows the array type(except dynamic array). * Normal: @@ -739,8 +956,25 @@ static int f_show(struct seq_file *m, void *v) return 0; } +static void *f_start(struct seq_file *m, loff_t *pos) +{ + void *p = (void *)FORMAT_HEADER; + loff_t l = 0; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + if (!event_file_data(m->private)) + return ERR_PTR(-ENODEV); + + while (l < *pos && p) + p = f_next(m, p, &l); + + return p; +} + static void f_stop(struct seq_file *m, void *p) { + mutex_unlock(&event_mutex); } static const struct seq_operations trace_format_seq_ops = { @@ -752,7 +986,6 @@ static const struct seq_operations trace_format_seq_ops = { static int trace_format_open(struct inode *inode, struct file *file) { - struct ftrace_event_call *call = inode->i_private; struct seq_file *m; int ret; @@ -761,7 +994,7 @@ static int trace_format_open(struct inode *inode, struct file *file) return ret; m = file->private_data; - m->private = call; + m->private = file; return 0; } @@ -769,45 +1002,47 @@ static int trace_format_open(struct inode *inode, struct file *file) static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; - struct trace_seq *s; - int r; + int id = (long)event_file_data(filp); + char buf[32]; + int len; if (*ppos) return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; + if (unlikely(!id)) + return -ENODEV; - trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->event.type); + len = sprintf(buf, "%d\n", id); - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - kfree(s); - return r; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file; struct trace_seq *s; - int r; + int r = -ENODEV; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) return -ENOMEM; trace_seq_init(s); - print_event_filter(call, s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) + print_event_filter(file, s); + mutex_unlock(&event_mutex); + + if (file) + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -818,9 +1053,9 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file; char *buf; - int err; + int err = -ENODEV; if (cnt >= PAGE_SIZE) return -EINVAL; @@ -835,7 +1070,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } buf[cnt] = '\0'; - err = apply_event_filter(call, buf); + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) + err = apply_event_filter(file, buf); + mutex_unlock(&event_mutex); + free_page((unsigned long) buf); if (err < 0) return err; @@ -850,43 +1090,101 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct event_subsystem *system = NULL; + struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ + struct trace_array *tr; int ret; - if (!inode->i_private) - goto skip_search; + if (tracing_is_disabled()) + return -ENODEV; /* Make sure the system still exists */ + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); - list_for_each_entry(system, &event_subsystems, list) { - if (system == inode->i_private) { - /* Don't open systems with no events */ - if (!system->nr_events) { - system = NULL; - break; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + list_for_each_entry(dir, &tr->systems, list) { + if (dir == inode->i_private) { + /* Don't open systems with no events */ + if (dir->nr_events) { + __get_system_dir(dir); + system = dir->subsystem; + } + goto exit_loop; } - __get_system(system); - break; } } + exit_loop: mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); + + if (!system) + return -ENODEV; - if (system != inode->i_private) + /* Some versions of gcc think dir can be uninitialized here */ + WARN_ON(!dir); + + /* Still need to increment the ref count of the system */ + if (trace_array_get(tr) < 0) { + put_system(dir); return -ENODEV; + } - skip_search: ret = tracing_open_generic(inode, filp); - if (ret < 0 && system) - put_system(system); + if (ret < 0) { + trace_array_put(tr); + put_system(dir); + } return ret; } +static int system_tr_open(struct inode *inode, struct file *filp) +{ + struct ftrace_subsystem_dir *dir; + struct trace_array *tr = inode->i_private; + int ret; + + if (tracing_is_disabled()) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + /* Make a temporary dir that has no system but points to tr */ + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) { + trace_array_put(tr); + return -ENOMEM; + } + + dir->tr = tr; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) { + trace_array_put(tr); + kfree(dir); + return ret; + } + + filp->private_data = dir; + + return 0; +} + static int subsystem_release(struct inode *inode, struct file *file) { - struct event_subsystem *system = inode->i_private; + struct ftrace_subsystem_dir *dir = file->private_data; - if (system) - put_system(system); + trace_array_put(dir->tr); + + /* + * If dir->subsystem is NULL, then this is a temporary + * descriptor that was made for a trace_array to enable + * all subsystems. + */ + if (dir->subsystem) + put_system(dir); + else + kfree(dir); return 0; } @@ -895,7 +1193,8 @@ static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; @@ -920,7 +1219,7 @@ static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; char *buf; int err; @@ -937,7 +1236,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } buf[cnt] = '\0'; - err = apply_subsystem_event_filter(system, buf); + err = apply_subsystem_event_filter(dir, buf); free_page((unsigned long) buf); if (err < 0) return err; @@ -971,6 +1270,10 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static int ftrace_event_avail_open(struct inode *inode, struct file *file); +static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_release(struct inode *inode, struct file *file); + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -986,18 +1289,18 @@ static const struct seq_operations show_set_event_seq_ops = { }; static const struct file_operations ftrace_avail_fops = { - .open = ftrace_event_seq_open, + .open = ftrace_event_avail_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_set_event_fops = { - .open = ftrace_event_seq_open, + .open = ftrace_event_set_open, .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, - .release = seq_release, + .release = ftrace_event_release, }; static const struct file_operations ftrace_enable_fops = { @@ -1015,7 +1318,6 @@ static const struct file_operations ftrace_event_format_fops = { }; static const struct file_operations ftrace_event_id_fops = { - .open = tracing_open_generic, .read = event_id_read, .llseek = default_llseek, }; @@ -1043,129 +1345,226 @@ static const struct file_operations ftrace_system_enable_fops = { .release = subsystem_release, }; +static const struct file_operations ftrace_tr_enable_fops = { + .open = system_tr_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, .llseek = default_llseek, }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) { - static struct dentry *d_tracer; - static struct dentry *d_events; + struct seq_file *m; + int ret; - if (d_events) - return d_events; + ret = seq_open(file, seq_ops); + if (ret < 0) + return ret; + m = file->private_data; + /* copy tr over to seq ops */ + m->private = inode->i_private; - d_tracer = tracing_init_dentry(); - if (!d_tracer) + return ret; +} + +static int ftrace_event_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return seq_release(inode, file); +} + +static int +ftrace_event_avail_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_event_seq_ops; + + return ftrace_event_open(inode, file, seq_ops); +} + +static int +ftrace_event_set_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_event_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_events(tr); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ + struct event_subsystem *system; + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) return NULL; - d_events = debugfs_create_dir("events", d_tracer); - if (!d_events) - pr_warning("Could not create debugfs " - "'events' directory\n"); + system->ref_count = 1; + + /* Only allocate if dynamic (kprobes and modules) */ + if (!core_kernel_data((unsigned long)name)) { + system->ref_count |= SYSTEM_FL_FREE_NAME; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) + goto out_free; + } else + system->name = name; + + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) + goto out_free; + + list_add(&system->list, &event_subsystems); - return d_events; + return system; + + out_free: + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); + kfree(system); + return NULL; } static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, + struct ftrace_event_file *file, struct dentry *parent) { + struct ftrace_subsystem_dir *dir; struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + system = dir->subsystem; if (strcmp(system->name, name) == 0) { - system->nr_events++; - return system->entry; + dir->nr_events++; + file->system = dir; + return dir->entry; } } - /* need to create new entry */ - system = kmalloc(sizeof(*system), GFP_KERNEL); - if (!system) { - pr_warning("No memory to create event subsystem %s\n", - name); - return d_events; + /* Now see if the system itself exists. */ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) + break; } + /* Reset system variable when not found */ + if (&system->list == &event_subsystems) + system = NULL; - system->entry = debugfs_create_dir(name, d_events); - if (!system->entry) { - pr_warning("Could not create event subsystem %s\n", - name); - kfree(system); - return d_events; - } + dir = kmalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + goto out_fail; - system->nr_events = 1; - system->ref_count = 1; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) { - debugfs_remove(system->entry); - kfree(system); - return d_events; + if (!system) { + system = create_new_subsystem(name); + if (!system) + goto out_free; + } else + __get_system(system); + + dir->entry = debugfs_create_dir(name, parent); + if (!dir->entry) { + pr_warning("Failed to create system directory %s\n", name); + __put_system(system); + goto out_free; } - list_add(&system->list, &event_subsystems); - - system->filter = NULL; + dir->tr = tr; + dir->ref_count = 1; + dir->nr_events = 1; + dir->subsystem = system; + file->system = dir; - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); - if (!system->filter) { - pr_warning("Could not allocate filter for subsystem " - "'%s'\n", name); - return system->entry; - } - - entry = debugfs_create_file("filter", 0644, system->entry, system, + entry = debugfs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); + pr_warning("Could not create debugfs '%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, system, + trace_create_file("enable", 0644, dir->entry, dir, &ftrace_system_enable_fops); - return system->entry; + list_add(&dir->list, &tr->systems); + + return dir->entry; + + out_free: + kfree(dir); + out_fail: + /* Only print this message if failed on memory allocation */ + if (!dir || !system) + pr_warning("No memory to create event subsystem %s\n", + name); + return NULL; } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +event_create_dir(struct dentry *parent, struct ftrace_event_file *file) { + struct ftrace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; struct list_head *head; + struct dentry *d_events; + const char *name; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->class->system, d_events); - - call->dir = debugfs_create_dir(call->name, d_events); - if (!call->dir) { - pr_warning("Could not create debugfs " - "'%s' directory\n", call->name); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { + d_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!d_events) + return -ENOMEM; + } else + d_events = parent; + + name = ftrace_event_name(call); + file->dir = debugfs_create_dir(name, d_events); + if (!file->dir) { + pr_warning("Could not create debugfs '%s' directory\n", + name); return -1; } - if (call->class->reg) - trace_create_file("enable", 0644, call->dir, call, - enable); + if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + trace_create_file("enable", 0644, file->dir, file, + &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, call->dir, call, - id); + trace_create_file("id", 0444, file->dir, + (void *)(long)call->event.type, + &ftrace_event_id_fops); #endif /* @@ -1177,222 +1576,286 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, ret = call->class->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; + " events/%s\n", name); + return -1; } } - trace_create_file("filter", 0644, call->dir, call, - filter); + trace_create_file("filter", 0644, file->dir, file, + &ftrace_event_filter_fops); - trace_create_file("format", 0444, call->dir, call, - format); + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); + + trace_create_file("format", 0444, file->dir, call, + &ftrace_event_format_fops); return 0; } -static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +static void remove_event_from_tracers(struct ftrace_event_call *call) { - struct dentry *d_events; - int ret; + struct ftrace_event_file *file; + struct trace_array *tr; + + do_for_each_event_file_safe(tr, file) { + if (file->event_call != call) + continue; + + remove_event_file_dir(file); + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); +} + +static void event_remove(struct ftrace_event_call *call) +{ + struct trace_array *tr; + struct ftrace_event_file *file; + + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + ftrace_event_enable_disable(file, 0); + destroy_preds(file); + /* + * The do_for_each_event_file() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); + + if (call->event.funcs) + __unregister_ftrace_event(&call->event); + remove_event_from_tracers(call); + list_del(&call->list); +} + +static int event_init(struct ftrace_event_call *call) +{ + int ret = 0; + const char *name; - /* The linker may leave blanks */ - if (!call->name) + name = ftrace_event_name(call); + if (WARN_ON(!name)) return -EINVAL; if (call->class->raw_init) { ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace events/%s\n", - call->name); - return ret; - } + if (ret < 0 && ret != -ENOSYS) + pr_warn("Could not initialize trace events/%s\n", + name); } - d_events = event_trace_events_dir(); - if (!d_events) - return -ENOENT; + return ret; +} + +static int +__register_event(struct ftrace_event_call *call, struct module *mod) +{ + int ret; + + ret = event_init(call); + if (ret < 0) + return ret; - ret = event_create_dir(call, d_events, id, enable, filter, format); - if (!ret) - list_add(&call->list, &ftrace_events); + list_add(&call->list, &ftrace_events); call->mod = mod; - return ret; + return 0; } -/* Add an additional event_call dynamically */ -int trace_add_event_call(struct ftrace_event_call *call) +static struct ftrace_event_file * +trace_create_new_event(struct ftrace_event_call *call, + struct trace_array *tr) { - int ret; - mutex_lock(&event_mutex); - ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - mutex_unlock(&event_mutex); - return ret; + struct ftrace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return NULL; + + file->event_call = call; + file->tr = tr; + atomic_set(&file->sm_ref, 0); + atomic_set(&file->tm_ref, 0); + INIT_LIST_HEAD(&file->triggers); + list_add(&file->list, &tr->events); + + return file; } -static void remove_subsystem_dir(const char *name) +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) { - struct event_subsystem *system; + struct ftrace_event_file *file; - if (strcmp(name, TRACE_SYSTEM) == 0) - return; + file = trace_create_new_event(call, tr); + if (!file) + return -ENOMEM; - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - debugfs_remove_recursive(system->entry); - list_del(&system->list); - __put_system(system); - } - break; - } - } + return event_create_dir(tr->event_dir, file); } /* - * Must be called under locking both of event_mutex and trace_event_mutex. + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. */ -static void __trace_remove_event_call(struct ftrace_event_call *call) +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr) { - ftrace_event_enable_disable(call, 0); - if (call->event.funcs) - __unregister_ftrace_event(&call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); - remove_subsystem_dir(call->class->system); + struct ftrace_event_file *file; + + file = trace_create_new_event(call, tr); + if (!file) + return -ENOMEM; + + return 0; } -/* Remove an event_call */ -void trace_remove_event_call(struct ftrace_event_call *call) +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call); + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) { + int ret; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); - down_write(&trace_event_mutex); - __trace_remove_event_call(call); - up_write(&trace_event_mutex); - mutex_unlock(&event_mutex); -} - -#define for_each_event(event, start, end) \ - for (event = start; \ - (unsigned long)event < (unsigned long)end; \ - event++) -#ifdef CONFIG_MODULES + ret = __register_event(call, NULL); + if (ret >= 0) + __add_event_to_tracers(call); -static LIST_HEAD(ftrace_module_file_list); + mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); + return ret; +} /* - * Modules must own their file_operations to keep up with - * reference counting. + * Must be called under locking of trace_types_lock, event_mutex and + * trace_event_sem. */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; - -static struct ftrace_module_file_ops * -trace_create_file_ops(struct module *mod) +static void __trace_remove_event_call(struct ftrace_event_call *call) { - struct ftrace_module_file_ops *file_ops; + event_remove(call); + trace_destroy_fields(call); + destroy_call_preds(call); +} - /* - * This is a bit of a PITA. To allow for correct reference - * counting, modules must "own" their file_operations. - * To do this, we allocate the file operations that will be - * used in the event directory. - */ +static int probe_remove_event_call(struct ftrace_event_call *call) +{ + struct trace_array *tr; + struct ftrace_event_file *file; - file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); - if (!file_ops) - return NULL; +#ifdef CONFIG_PERF_EVENTS + if (call->perf_refcount) + return -EBUSY; +#endif + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + /* + * We can't rely on ftrace_event_enable_disable(enable => 0) + * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress + * TRACE_REG_UNREGISTER. + */ + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return -EBUSY; + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); - file_ops->mod = mod; + __trace_remove_event_call(call); - file_ops->id = ftrace_event_id_fops; - file_ops->id.owner = mod; + return 0; +} - file_ops->enable = ftrace_enable_fops; - file_ops->enable.owner = mod; +/* Remove an event_call */ +int trace_remove_event_call(struct ftrace_event_call *call) +{ + int ret; - file_ops->filter = ftrace_event_filter_fops; - file_ops->filter.owner = mod; + mutex_lock(&trace_types_lock); + mutex_lock(&event_mutex); + down_write(&trace_event_sem); + ret = probe_remove_event_call(call); + up_write(&trace_event_sem); + mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); - file_ops->format = ftrace_event_format_fops; - file_ops->format.owner = mod; + return ret; +} - list_add(&file_ops->list, &ftrace_module_file_list); +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) - return file_ops; -} +#ifdef CONFIG_MODULES static void trace_module_add_events(struct module *mod) { - struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call **call, **start, **end; - start = mod->trace_events; - end = mod->trace_events + mod->num_trace_events; - - if (start == end) + if (!mod->num_trace_events) return; - file_ops = trace_create_file_ops(mod); - if (!file_ops) + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); return; + } + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; for_each_event(call, start, end) { - __trace_add_event_call(*call, mod, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + __register_event(*call, mod); + __add_event_to_tracers(*call); } } static void trace_module_remove_events(struct module *mod) { - struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; - bool found = false; + bool clear_trace = false; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { - found = true; + if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) + clear_trace = true; __trace_remove_event_call(call); } } - - /* Now free the file_operations */ - list_for_each_entry(file_ops, &ftrace_module_file_list, list) { - if (file_ops->mod == mod) - break; - } - if (&file_ops->list != &ftrace_module_file_list) { - list_del(&file_ops->list); - kfree(file_ops); - } + up_write(&trace_event_sem); /* * It is safest to reset the ring buffer if the module being unloaded - * registered any events. + * registered any events that were used. The only worry is if + * a new module gets loaded, and takes on the same id as the events + * of this module. When printing out the buffer, traced events left + * over from this module may be passed to the new module events and + * unexpected results may occur. */ - if (found) - tracing_reset_current_online_cpus(); - up_write(&trace_event_mutex); + if (clear_trace) + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, @@ -1400,6 +1863,7 @@ static int trace_module_notify(struct notifier_block *self, { struct module *mod = data; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); switch (val) { case MODULE_STATE_COMING: @@ -1410,21 +1874,386 @@ static int trace_module_notify(struct notifier_block *self, break; } mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); return 0; } -#else -static int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -#endif /* CONFIG_MODULES */ static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; +#endif /* CONFIG_MODULES */ + +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + ret = __trace_add_new_event(call, tr); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + ftrace_event_name(call)); + } +} + +struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct ftrace_event_file *file; + struct ftrace_event_call *call; + const char *name; + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + name = ftrace_event_name(call); + + if (!name || !call->class || !call->class->reg) + continue; + + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + + if (strcmp(event, name) == 0 && + strcmp(system, call->class->system) == 0) + return file; + } + return NULL; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct ftrace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *_data) +{ + struct event_probe_data *data = _data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "%s:%s:%s", + data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + data->file->event_call->class->system, + ftrace_event_name(data->file->event_call)); + + if (data->count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", data->count); + + return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + data->ref++; + return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(data->file, 0, 1); + module_put(data->file->event_call->mod); + kfree(data); + } + *pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enabled) +{ + struct trace_array *tr = top_trace_array(); + struct ftrace_event_file *file; + struct ftrace_probe_ops *ops; + struct event_probe_data *data; + const char *system; + const char *event; + char *number; + bool enable; + int ret; + + if (!tr) + return -ENODEV; + + /* hash funcs only work with set_ftrace_filter */ + if (!enabled || !param) + return -EINVAL; + + system = strsep(¶m, ":"); + if (!param) + return -EINVAL; + + event = strsep(¶m, ":"); + + mutex_lock(&event_mutex); + + ret = -EINVAL; + file = find_event_file(tr, system, event); + if (!file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; + else + ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + ret = 0; + goto out; + } + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + data->enable = enable; + data->count = -1; + data->file = file; + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &data->count); + if (ret) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(file->event_call->mod); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = __ftrace_event_enable_disable(file, 1, 1); + if (ret < 0) + goto out_put; + ret = register_ftrace_function_probe(glob, ops, data); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; + out: + mutex_unlock(&event_mutex); + return ret; + + out_disable: + __ftrace_event_enable_disable(file, 0, 1); + out_put: + module_put(file->event_call->mod); + out_free: + kfree(data); + goto out; +} + +static struct ftrace_func_command event_enable_cmd = { + .name = ENABLE_EVENT_STR, + .func = event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { + .name = DISABLE_EVENT_STR, + .func = event_enable_func, +}; + +static __init int register_event_cmds(void) +{ + int ret; + + ret = register_ftrace_command(&event_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_ftrace_command(&event_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_ftrace_command(&event_enable_cmd); + return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the debugfs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file; + int ret; + + + list_for_each_entry(file, &tr->events, list) { + ret = event_create_dir(tr->event_dir, file); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + ftrace_event_name(file->event_call)); + } +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + /* Early boot up should not have any modules loaded */ + if (WARN_ON_ONCE(call->mod)) + continue; + + ret = __trace_early_add_new_event(call, tr); + if (ret < 0) + pr_warning("Could not create early event %s\n", + ftrace_event_name(call)); + } +} + +/* Remove the event directory structure for a trace directory. */ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file, *next; + + list_for_each_entry_safe(file, next, &tr->events, list) + remove_event_file_dir(file); +} + +static void __add_event_to_tracers(struct ftrace_event_call *call) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __trace_add_new_event(call, tr); +} extern struct ftrace_event_call *__start_ftrace_events[]; extern struct ftrace_event_call *__stop_ftrace_events[]; @@ -1434,44 +2263,32 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = 1; - tracing_selftest_disabled = 1; + ring_buffer_expanded = true; + tracing_selftest_disabled = true; return 1; } __setup("trace_event=", setup_trace_event); -static __init int event_trace_init(void) +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { - struct ftrace_event_call **call; - struct dentry *d_tracer; - struct dentry *entry; struct dentry *d_events; - int ret; - char *buf = bootup_event_buf; - char *token; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return 0; - - entry = debugfs_create_file("available_events", 0444, d_tracer, - (void *)&show_event_seq_ops, - &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); + struct dentry *entry; - entry = debugfs_create_file("set_event", 0644, d_tracer, - (void *)&show_set_event_seq_ops, - &ftrace_set_event_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_event' entry\n"); + entry = debugfs_create_file("set_event", 0644, parent, + tr, &ftrace_set_event_fops); + if (!entry) { + pr_warning("Could not create debugfs 'set_event' entry\n"); + return -ENOMEM; + } - d_events = event_trace_events_dir(); - if (!d_events) - return 0; + d_events = debugfs_create_dir("events", parent); + if (!d_events) { + pr_warning("Could not create debugfs 'events' directory\n"); + return -ENOMEM; + } /* ring buffer internal formats */ trace_create_file("header_page", 0444, d_events, @@ -1483,18 +2300,128 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - NULL, &ftrace_system_enable_fops); + tr, &ftrace_tr_enable_fops); - if (trace_define_common_fields()) - pr_warning("tracing: Failed to allocate common fields"); + tr->event_dir = d_events; + + return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); - for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. + */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_early_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +int event_trace_del_tracer(struct trace_array *tr) +{ + mutex_lock(&event_mutex); + + /* Disable any event triggers and associated soft-disabled events */ + clear_event_triggers(tr); + + /* Disable any running events */ + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + + /* Access to events are within rcu_read_lock_sched() */ + synchronize_sched(); + + down_write(&trace_event_sem); + __trace_remove_event_dirs(tr); + debugfs_remove_recursive(tr->event_dir); + up_write(&trace_event_sem); + + tr->event_dir = NULL; + + mutex_unlock(&event_mutex); + + return 0; +} + +static __init int event_trace_memsetup(void) +{ + field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); + file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); + return 0; +} + +static __init int event_trace_enable(void) +{ + struct trace_array *tr = top_trace_array(); + struct ftrace_event_call **iter, *call; + char *buf = bootup_event_buf; + char *token; + int ret; + + if (!tr) + return -ENODEV; + + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { + + call = *iter; + ret = event_init(call); + if (!ret) + list_add(&call->list, &ftrace_events); } + /* + * We need the top trace array to have a working set of trace + * points at early init, before the debug files and directories + * are created. Create the file entries now, and attach them + * to the actual file dentries later. + */ + __trace_early_add_events(tr); + while (true) { token = strsep(&buf, ","); @@ -1503,17 +2430,57 @@ static __init int event_trace_init(void) if (!*token) continue; - ret = ftrace_set_clr_event(token, 1); + ret = ftrace_set_clr_event(tr, token, 1); if (ret) - pr_warning("Failed to enable trace event: %s\n", token); + pr_warn("Failed to enable trace event: %s\n", token); } + trace_printk_start_comm(); + + register_event_cmds(); + + register_trigger_cmds(); + + return 0; +} + +static __init int event_trace_init(void) +{ + struct trace_array *tr; + struct dentry *d_tracer; + struct dentry *entry; + int ret; + + tr = top_trace_array(); + if (!tr) + return -ENODEV; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("available_events", 0444, d_tracer, + tr, &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_events' entry\n"); + + if (trace_define_common_fields()) + pr_warning("tracing: Failed to allocate common fields"); + + ret = early_event_add_tracer(d_tracer, tr); + if (ret) + return ret; + +#ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) pr_warning("Failed to register trace events module notifier\n"); - +#endif return 0; } +early_initcall(event_trace_memsetup); +core_initcall(event_trace_enable); fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1572,13 +2539,22 @@ static __init void event_test_stuff(void) */ static __init void event_trace_self_tests(void) { + struct ftrace_subsystem_dir *dir; + struct ftrace_event_file *file; struct ftrace_event_call *call; struct event_subsystem *system; + struct trace_array *tr; int ret; + tr = top_trace_array(); + if (!tr) + return; + pr_info("Running tests on trace events:\n"); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) @@ -1596,21 +2572,21 @@ static __init void event_trace_self_tests(void) continue; #endif - pr_info("Testing event %s: ", call->name); + pr_info("Testing event %s: ", ftrace_event_name(call)); /* * If an event is already enabled, someone is using * it and the self test should not be on. */ - if (call->flags & TRACE_EVENT_FL_ENABLED) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) { pr_warning("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } - ftrace_event_enable_disable(call, 1); + ftrace_event_enable_disable(file, 1); event_test_stuff(); - ftrace_event_enable_disable(call, 0); + ftrace_event_enable_disable(file, 0); pr_cont("OK\n"); } @@ -1619,7 +2595,9 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on trace event systems:\n"); - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + + system = dir->subsystem; /* the ftrace system is special, skip it */ if (strcmp(system->name, "ftrace") == 0) @@ -1627,7 +2605,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling system %s\n", system->name); @@ -1636,10 +2614,12 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); - if (WARN_ON_ONCE(ret)) + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); + if (WARN_ON_ONCE(ret)) { pr_warning("error disabling system %s\n", system->name); + continue; + } pr_cont("OK\n"); } @@ -1649,7 +2629,7 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling all events\n"); return; @@ -1658,7 +2638,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); return; @@ -1672,7 +2652,8 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void -function_test_events_call(unsigned long ip, unsigned long parent_ip) +function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ring_buffer_event *event; struct ring_buffer *buffer; @@ -1701,7 +2682,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); + trace_buffer_unlock_commit(buffer, event, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); @@ -1711,6 +2692,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 24aee712745..8a8631926a0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -44,6 +44,7 @@ enum filter_op_ids OP_LE, OP_GT, OP_GE, + OP_BAND, OP_NONE, OP_OPEN_PAREN, }; @@ -54,6 +55,7 @@ struct filter_op { int precedence; }; +/* Order must be the same as enum filter_op_ids above */ static struct filter_op filter_ops[] = { { OP_OR, "||", 1 }, { OP_AND, "&&", 2 }, @@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = { { OP_LE, "<=", 5 }, { OP_GT, ">", 5 }, { OP_GE, ">=", 5 }, + { OP_BAND, "&", 6 }, { OP_NONE, "OP_NONE", 0 }, { OP_OPEN_PAREN, "(", 0 }, }; @@ -81,6 +84,7 @@ enum { FILT_ERR_TOO_MANY_PREDS, FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, + FILT_ERR_IP_FIELD_ONLY, }; static char *err_text[] = { @@ -96,6 +100,7 @@ static char *err_text[] = { "Too many terms in predicate expression", "Missing field name and/or value", "Meaningless filter expression", + "Only 'ip' field is supported for function trace", }; struct opstack_op { @@ -154,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ case OP_GE: \ match = (*addr >= val); \ break; \ + case OP_BAND: \ + match = (*addr & val); \ + break; \ default: \ break; \ } \ @@ -629,17 +637,23 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } -void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +static inline struct event_filter *event_filter(struct ftrace_event_file *file) { - struct event_filter *filter; + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + return file->event_call->filter; + else + return file->filter; +} + +/* caller must hold event_mutex */ +void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) +{ + struct event_filter *filter = event_filter(file); - mutex_lock(&event_mutex); - filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); - mutex_unlock(&event_mutex); + trace_seq_puts(s, "none\n"); } void print_subsystem_event_filter(struct event_subsystem *system, @@ -652,40 +666,13 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); + trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) -{ - struct ftrace_event_field *field; - - list_for_each_entry(field, head, link) { - if (!strcmp(field->name, name)) - return field; - } - - return NULL; -} - -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) -{ - struct ftrace_event_field *field; - struct list_head *head; - - field = __find_event_field(&ftrace_common_fields, name); - if (field) - return field; - - head = trace_get_fields(call); - return __find_event_field(head, name); -} - static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { - stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); if (!stack->preds) return -ENOMEM; stack->index = n_preds; @@ -775,7 +762,11 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { + int i; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + kfree(filter->preds[i].ops); kfree(filter->preds); filter->preds = NULL; } @@ -783,11 +774,21 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void filter_disable(struct ftrace_event_call *call) +static void call_filter_disable(struct ftrace_event_call *call) { call->flags &= ~TRACE_EVENT_FL_FILTERED; } +static void filter_disable(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call_filter_disable(call); + else + file->flags &= ~FTRACE_EVENT_FL_FILTERED; +} + static void __free_filter(struct event_filter *filter) { if (!filter) @@ -798,16 +799,35 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +void free_event_filter(struct event_filter *filter) +{ + __free_filter(filter); +} + +void destroy_call_preds(struct ftrace_event_call *call) +{ + __free_filter(call->filter); + call->filter = NULL; +} + +static void destroy_file_preds(struct ftrace_event_file *file) +{ + __free_filter(file->filter); + file->filter = NULL; +} + /* - * Called when destroying the ftrace_event_call. - * The call is being freed, so we do not need to worry about - * the call being currently used. This is for module code removing + * Called when destroying the ftrace_event_file. + * The file is being freed, so we do not need to worry about + * the file being currently used. This is for module code removing * the tracepoints from within it. */ -void destroy_preds(struct ftrace_event_call *call) +void destroy_preds(struct ftrace_event_file *file) { - __free_filter(call->filter); - call->filter = NULL; + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + destroy_call_preds(file->event_call); + else + destroy_file_preds(file); } static struct event_filter *__alloc_filter(void) @@ -826,8 +846,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) __free_preds(filter); - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); if (!filter->preds) return -ENOMEM; @@ -843,28 +862,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static inline void __remove_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + filter_disable(file); + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + remove_filter_string(call->filter); + else + remove_filter_string(file->filter); +} + +static void filter_free_subsystem_preds(struct event_subsystem *system, + struct trace_array *tr) { + struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; - filter_disable(call); - remove_filter_string(call->filter); + __remove_filter(file); + } +} + +static inline void __free_subsystem_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { + __free_filter(call->filter); + call->filter = NULL; + } else { + __free_filter(file->filter); + file->filter = NULL; } } -static void filter_free_subsystem_filters(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system, + struct trace_array *tr) { + struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; - __free_filter(call->filter); - call->filter = NULL; + __free_subsystem_filter(file); } } @@ -900,6 +947,11 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } +static bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + static bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || @@ -987,11 +1039,16 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; + } else if (is_function_field(field)) { + if (strcmp(field->name, "ip")) { + parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); + return -EINVAL; + } } else { if (field->is_signed) - ret = strict_strtoll(pred->regex.pattern, 0, &val); + ret = kstrtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->regex.pattern, 0, &val); + ret = kstrtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; @@ -1326,7 +1383,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, return NULL; } - field = find_event_field(call, operand1); + field = trace_find_event_field(call, operand1); if (!field) { parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return NULL; @@ -1334,10 +1391,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); - -#ifdef CONFIG_FTRACE_STARTUP_TEST pred.field = field; -#endif return init_pred(ps, field, &pred) ? NULL : &pred; } @@ -1486,7 +1540,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) children = count_leafs(preds, &preds[root->left]); children += count_leafs(preds, &preds[root->right]); - root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); if (!root->ops) return -ENOMEM; @@ -1628,15 +1682,85 @@ fail: return err; } +static inline void event_set_filtered_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + file->flags |= FTRACE_EVENT_FL_FILTERED; +} + +static inline void event_set_filter(struct ftrace_event_file *file, + struct event_filter *filter) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + rcu_assign_pointer(call->filter, filter); + else + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + RCU_INIT_POINTER(call->filter, NULL); + else + RCU_INIT_POINTER(file->filter, NULL); +} + +static inline void +event_set_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline void +event_clear_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline bool +event_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) + return true; + + if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && + (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) + return true; + + return false; +} + struct filter_list { struct list_head list; struct event_filter *filter; }; static int replace_system_preds(struct event_subsystem *system, + struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { + struct ftrace_event_file *file; struct ftrace_event_call *call; struct filter_list *filter_item; struct filter_list *tmp; @@ -1644,8 +1768,8 @@ static int replace_system_preds(struct event_subsystem *system, bool fail = true; int err; - list_for_each_entry(call, &ftrace_events, list) { - + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; @@ -1655,18 +1779,20 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + event_set_no_set_filter_flag(file); else - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + event_clear_no_set_filter_flag(file); } - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { struct event_filter *filter; + call = file->event_call; + if (strcmp(call->class->system, system->name) != 0) continue; - if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + if (event_no_set_filter_flag(file)) continue; filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); @@ -1687,17 +1813,17 @@ static int replace_system_preds(struct event_subsystem *system, err = replace_preds(call, filter, ps, filter_string, false); if (err) { - filter_disable(call); + filter_disable(file); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); append_filter_err(ps, filter); } else - call->flags |= TRACE_EVENT_FL_FILTERED; + event_set_filtered_flag(file); /* * Regardless of if this returned an error, we still * replace the filter for the call. */ - filter = call->filter; - rcu_assign_pointer(call->filter, filter_item->filter); + filter = event_filter(file); + event_set_filter(file, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1817,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call, return err; } +int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + return create_filter(call, filter_str, set_str, filterp); +} + /** * create_system_filter - create a filter for an event_subsystem * @system: event_subsystem to create a filter for @@ -1827,6 +1960,7 @@ static int create_filter(struct ftrace_event_call *call, * and always remembers @filter_str. */ static int create_system_filter(struct event_subsystem *system, + struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct event_filter *filter = NULL; @@ -1835,7 +1969,7 @@ static int create_system_filter(struct event_subsystem *system, err = create_filter_start(filter_str, true, &ps, &filter); if (!err) { - err = replace_system_preds(system, ps, filter_str); + err = replace_system_preds(system, tr, ps, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); @@ -1850,23 +1984,27 @@ static int create_system_filter(struct event_subsystem *system, return err; } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +/* caller must hold event_mutex */ +int apply_event_filter(struct ftrace_event_file *file, char *filter_string) { + struct ftrace_event_call *call = file->event_call; struct event_filter *filter; - int err = 0; - - mutex_lock(&event_mutex); + int err; if (!strcmp(strstrip(filter_string), "0")) { - filter_disable(call); - filter = call->filter; + filter_disable(file); + filter = event_filter(file); + if (!filter) - goto out_unlock; - RCU_INIT_POINTER(call->filter, NULL); + return 0; + + event_clear_filter(file); + /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); - goto out_unlock; + + return 0; } err = create_filter(call, filter_string, true, &filter); @@ -1878,14 +2016,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) * string */ if (filter) { - struct event_filter *tmp = call->filter; + struct event_filter *tmp; + tmp = event_filter(file); if (!err) - call->flags |= TRACE_EVENT_FL_FILTERED; + event_set_filtered_flag(file); else - filter_disable(call); + filter_disable(file); - rcu_assign_pointer(call->filter, filter); + event_set_filter(file, filter); if (tmp) { /* Make sure the call is done with the filter */ @@ -1893,39 +2032,39 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) __free_filter(tmp); } } -out_unlock: - mutex_unlock(&event_mutex); return err; } -int apply_subsystem_event_filter(struct event_subsystem *system, +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string) { + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; struct event_filter *filter; int err = 0; mutex_lock(&event_mutex); /* Make sure the system still has events */ - if (!system->nr_events) { + if (!dir->nr_events) { err = -ENODEV; goto out_unlock; } if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); + filter_free_subsystem_preds(system, tr); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ synchronize_sched(); - filter_free_subsystem_filters(system); + filter_free_subsystem_filters(system, tr); __free_filter(filter); goto out_unlock; } - err = create_system_filter(system, filter_string, &filter); + err = create_system_filter(system, tr, filter_string, &filter); if (filter) { /* * No event actually uses the system filter @@ -1950,6 +2089,148 @@ void ftrace_profile_free_filter(struct perf_event *event) __free_filter(filter); } +struct function_filter_data { + struct ftrace_ops *ops; + int first_filter; + int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ + char *str, *sep, **re; + + str = kstrndup(buf, len, GFP_KERNEL); + if (!str) + return NULL; + + /* + * The argv_split function takes white space + * as a separator, so convert ',' into spaces. + */ + while ((sep = strchr(str, ','))) + *sep = ' '; + + re = argv_split(GFP_KERNEL, str, count); + kfree(str); + return re; +} + +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, + int reset, char *re, int len) +{ + int ret; + + if (filter) + ret = ftrace_set_filter(ops, re, len, reset); + else + ret = ftrace_set_notrace(ops, re, len, reset); + + return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, + struct function_filter_data *data) +{ + int i, re_cnt, ret = -EINVAL; + int *reset; + char **re; + + reset = filter ? &data->first_filter : &data->first_notrace; + + /* + * The 'ip' field could have multiple filters set, separated + * either by space or comma. We first cut the filter and apply + * all pieces separatelly. + */ + re = ftrace_function_filter_re(buf, len, &re_cnt); + if (!re) + return -EINVAL; + + for (i = 0; i < re_cnt; i++) { + ret = ftrace_function_set_regexp(data->ops, filter, *reset, + re[i], strlen(re[i])); + if (ret) + break; + + if (*reset) + *reset = 0; + } + + argv_free(re); + return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ + struct ftrace_event_field *field = pred->field; + + if (leaf) { + /* + * Check the leaf predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; + + if (strcmp(field->name, "ip")) + return -EINVAL; + } else { + /* + * Check the non leaf predicate for function trace, verify: + * - only '||' is used + */ + if (pred->op != OP_OR) + return -EINVAL; + } + + return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, + struct filter_pred *pred, + int *err, void *data) +{ + /* Checking the node is valid for function trace. */ + if ((move != MOVE_DOWN) || + (pred->left != FILTER_PRED_INVALID)) { + *err = ftrace_function_check_pred(pred, 0); + } else { + *err = ftrace_function_check_pred(pred, 1); + if (*err) + return WALK_PRED_ABORT; + + *err = __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); + } + + return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; +} + +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + struct function_filter_data data = { + .first_filter = 1, + .first_notrace = 1, + .ops = &event->ftrace_ops, + }; + + return walk_pred_tree(filter->preds, filter->root, + ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { @@ -1970,9 +2251,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; err = create_filter(call, filter_str, false, &filter); - if (!err) - event->filter = filter; + if (err) + goto free_filter; + + if (ftrace_event_is_function(call)) + err = ftrace_function_set_filter(event, filter); else + event->filter = filter; + +free_filter: + if (err || ftrace_event_is_function(call)) __free_filter(filter); out_unlock: diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c new file mode 100644 index 00000000000..4747b476a03 --- /dev/null +++ b/kernel/trace/trace_events_trigger.c @@ -0,0 +1,1437 @@ +/* + * trace_events_trigger - trace event triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/slab.h> + +#include "trace.h" + +static LIST_HEAD(trigger_commands); +static DEFINE_MUTEX(trigger_cmd_mutex); + +static void +trigger_data_free(struct event_trigger_data *data) +{ + if (data->cmd_ops->set_filter) + data->cmd_ops->set_filter(NULL, data, NULL); + + synchronize_sched(); /* make sure current triggers exit before free */ + kfree(data); +} + +/** + * event_triggers_call - Call triggers associated with a trace event + * @file: The ftrace_event_file associated with the event + * @rec: The trace entry for the event, NULL for unconditional invocation + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command. If rec is + * non-NULL, it means that the trigger requires further processing and + * shouldn't be unconditionally invoked. If rec is non-NULL and the + * trigger has a filter associated with it, rec will checked against + * the filter and if the record matches the trigger will be invoked. + * If the trigger is a 'post_trigger', meaning it shouldn't be invoked + * in any case until the current event is written, the trigger + * function isn't invoked but the bit associated with the deferred + * trigger is set in the return value. + * + * Returns an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + * + * Return: an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + */ +enum event_trigger_type +event_triggers_call(struct ftrace_event_file *file, void *rec) +{ + struct event_trigger_data *data; + enum event_trigger_type tt = ETT_NONE; + struct event_filter *filter; + + if (list_empty(&file->triggers)) + return tt; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (!rec) { + data->ops->func(data); + continue; + } + filter = rcu_dereference_sched(data->filter); + if (filter && !filter_match_preds(filter, rec)) + continue; + if (data->cmd_ops->post_trigger) { + tt |= data->cmd_ops->trigger_type; + continue; + } + data->ops->func(data); + } + return tt; +} +EXPORT_SYMBOL_GPL(event_triggers_call); + +/** + * event_triggers_post_call - Call 'post_triggers' for a trace event + * @file: The ftrace_event_file associated with the event + * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command, if the + * corresponding bit is set in the tt enum passed into this function. + * See @event_triggers_call for details on how those bits are set. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + */ +void +event_triggers_post_call(struct ftrace_event_file *file, + enum event_trigger_type tt) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type & tt) + data->ops->func(data); + } +} +EXPORT_SYMBOL_GPL(event_triggers_post_call); + +#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL) + +static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) +{ + struct ftrace_event_file *event_file = event_file_data(m->private); + + if (t == SHOW_AVAILABLE_TRIGGERS) + return NULL; + + return seq_list_next(t, &event_file->triggers, pos); +} + +static void *trigger_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_file *event_file; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + event_file = event_file_data(m->private); + if (unlikely(!event_file)) + return ERR_PTR(-ENODEV); + + if (list_empty(&event_file->triggers)) + return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; + + return seq_list_start(&event_file->triggers, *pos); +} + +static void trigger_stop(struct seq_file *m, void *t) +{ + mutex_unlock(&event_mutex); +} + +static int trigger_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct event_command *p; + + if (v == SHOW_AVAILABLE_TRIGGERS) { + seq_puts(m, "# Available triggers:\n"); + seq_putc(m, '#'); + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_reverse(p, &trigger_commands, list) + seq_printf(m, " %s", p->name); + seq_putc(m, '\n'); + mutex_unlock(&trigger_cmd_mutex); + return 0; + } + + data = list_entry(v, struct event_trigger_data, list); + data->ops->print(m, data->ops, data); + + return 0; +} + +static const struct seq_operations event_triggers_seq_ops = { + .start = trigger_start, + .next = trigger_next, + .stop = trigger_stop, + .show = trigger_show, +}; + +static int event_trigger_regex_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&event_mutex); + + if (unlikely(!event_file_data(file))) { + mutex_unlock(&event_mutex); + return -ENODEV; + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &event_triggers_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = file; + } + } + + mutex_unlock(&event_mutex); + + return ret; +} + +static int trigger_process_regex(struct ftrace_event_file *file, char *buff) +{ + char *command, *next = buff; + struct event_command *p; + int ret = -EINVAL; + + command = strsep(&next, ": \t"); + command = (command[0] != '!') ? command : command + 1; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(p, file, buff, command, next); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +static ssize_t event_trigger_regex_write(struct file *file, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_event_file *event_file; + ssize_t ret; + char *buf; + + if (!cnt) + return 0; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long)buf); + return -EFAULT; + } + buf[cnt] = '\0'; + strim(buf); + + mutex_lock(&event_mutex); + event_file = event_file_data(file); + if (unlikely(!event_file)) { + mutex_unlock(&event_mutex); + free_page((unsigned long)buf); + return -ENODEV; + } + ret = trigger_process_regex(event_file, buf); + mutex_unlock(&event_mutex); + + free_page((unsigned long)buf); + if (ret < 0) + goto out; + + *ppos += cnt; + ret = cnt; + out: + return ret; +} + +static int event_trigger_regex_release(struct inode *inode, struct file *file) +{ + mutex_lock(&event_mutex); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + mutex_unlock(&event_mutex); + + return 0; +} + +static ssize_t +event_trigger_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_trigger_regex_write(filp, ubuf, cnt, ppos); +} + +static int +event_trigger_open(struct inode *inode, struct file *filp) +{ + return event_trigger_regex_open(inode, filp); +} + +static int +event_trigger_release(struct inode *inode, struct file *file) +{ + return event_trigger_regex_release(inode, file); +} + +const struct file_operations event_trigger_fops = { + .open = event_trigger_open, + .read = seq_read, + .write = event_trigger_write, + .llseek = tracing_lseek, + .release = event_trigger_release, +}; + +/* + * Currently we only register event commands from __init, so mark this + * __init too. + */ +static __init int register_event_command(struct event_command *cmd) +{ + struct event_command *p; + int ret = 0; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &trigger_commands); + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/* + * Currently we only unregister event commands from __init, so mark + * this __init too. + */ +static __init int unregister_event_command(struct event_command *cmd) +{ + struct event_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/** + * event_trigger_print - Generic event_trigger_ops @print implementation + * @name: The name of the event trigger + * @m: The seq_file being printed to + * @data: Trigger-specific data + * @filter_str: filter_str to print, if present + * + * Common implementation for event triggers to print themselves. + * + * Usually wrapped by a function that simply sets the @name of the + * trigger command and then invokes this. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_print(const char *name, struct seq_file *m, + void *data, char *filter_str) +{ + long count = (long)data; + + seq_printf(m, "%s", name); + + if (count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", count); + + if (filter_str) + seq_printf(m, " if %s\n", filter_str); + else + seq_puts(m, "\n"); + + return 0; +} + +/** + * event_trigger_init - Generic event_trigger_ops @init implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger initialization. + * + * Usually used directly as the @init method in event trigger + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + return 0; +} + +/** + * event_trigger_free - Generic event_trigger_ops @free implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger de-initialization. + * + * Usually used directly as the @free method in event trigger + * implementations. + */ +static void +event_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) + trigger_data_free(data); +} + +static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, + int trigger_enable) +{ + int ret = 0; + + if (trigger_enable) { + if (atomic_inc_return(&file->tm_ref) > 1) + return ret; + set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 1, 1); + } else { + if (atomic_dec_return(&file->tm_ref) > 0) + return ret; + clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 0, 1); + } + + return ret; +} + +/** + * clear_event_triggers - Clear all triggers associated with a trace array + * @tr: The trace array to clear + * + * For each trigger, the triggering event has its tm_ref decremented + * via trace_event_trigger_enable_disable(), and any associated event + * (in the case of enable/disable_event triggers) will have its sm_ref + * decremented via free()->trace_event_enable_disable(). That + * combination effectively reverses the soft-mode/trigger state added + * by trigger registration. + * + * Must be called with event_mutex held. + */ +void +clear_event_triggers(struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + struct event_trigger_data *data; + list_for_each_entry_rcu(data, &file->triggers, list) { + trace_event_trigger_enable_disable(file, 0); + if (data->ops->free) + data->ops->free(data->ops, data); + } + } +} + +/** + * update_cond_flag - Set or reset the TRIGGER_COND bit + * @file: The ftrace_event_file associated with the event + * + * If an event has triggers and any of those triggers has a filter or + * a post_trigger, trigger invocation needs to be deferred until after + * the current event has logged its data, and the event should have + * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be + * cleared. + */ +static void update_cond_flag(struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool set_cond = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->filter || data->cmd_ops->post_trigger) { + set_cond = true; + break; + } + } + + if (set_cond) + set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); +} + +/** + * register_trigger - Generic event_command @reg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data to associate with the trigger + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger registration. + * + * Usually used directly as the @reg method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +/** + * event_trigger_callback - Generic event_command @func implementation + * @cmd_ops: The command ops, used for trigger registration + * @file: The ftrace_event_file associated with the event + * @glob: The raw string used to register the trigger + * @cmd: The cmd portion of the string used to register the trigger + * @param: The params portion of the string used to register the trigger + * + * Common implementation for event command parsing and trigger + * instantiation. + * + * Usually used directly as the @func method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_callback(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + char *trigger = NULL; + char *number; + int ret; + + /* separate the trigger from the filter (t:n [if filter]) */ + if (param && isdigit(param[0])) + trigger = strsep(¶m, " \t"); + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + ret = 0; + out: + return ret; + + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + goto out; +} + +/** + * set_trigger_filter - Generic event_command @set_filter implementation + * @filter_str: The filter string for the trigger, NULL to remove filter + * @trigger_data: Trigger-specific data + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event command filter parsing and filter + * instantiation. + * + * Usually used directly as the @set_filter method in event command + * implementations. + * + * Also used to remove a filter (if filter_str = NULL). + * + * Return: 0 on success, errno otherwise + */ +static int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data = trigger_data; + struct event_filter *filter = NULL, *tmp; + int ret = -EINVAL; + char *s; + + if (!filter_str) /* clear the current filter */ + goto assign; + + s = strsep(&filter_str, " \t"); + + if (!strlen(s) || strcmp(s, "if") != 0) + goto out; + + if (!filter_str) + goto out; + + /* The filter is for the 'trigger' event, not the triggered event */ + ret = create_event_filter(file->event_call, filter_str, false, &filter); + if (ret) + goto out; + assign: + tmp = rcu_access_pointer(data->filter); + + rcu_assign_pointer(data->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + free_event_filter(tmp); + } + + kfree(data->filter_str); + data->filter_str = NULL; + + if (filter_str) { + data->filter_str = kstrdup(filter_str, GFP_KERNEL); + if (!data->filter_str) { + free_event_filter(rcu_access_pointer(data->filter)); + data->filter = NULL; + ret = -ENOMEM; + } + } + out: + return ret; +} + +static void +traceon_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + tracing_on(); +} + +static void +traceon_count_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_on(); +} + +static void +traceoff_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + tracing_off(); +} + +static void +traceoff_count_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_off(); +} + +static int +traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceon", m, (void *)data->count, + data->filter_str); +} + +static int +traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceoff", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops traceon_trigger_ops = { + .func = traceon_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceon_count_trigger_ops = { + .func = traceon_count_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_trigger_ops = { + .func = traceoff_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_count_trigger_ops = { + .func = traceoff_count_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +onoff_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_trigger_ops : + &traceon_trigger_ops; + else + ops = param ? &traceoff_count_trigger_ops : + &traceoff_trigger_ops; + + return ops; +} + +static struct event_command trigger_traceon_cmd = { + .name = "traceon", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_traceoff_cmd = { + .name = "traceoff", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static void +snapshot_trigger(struct event_trigger_data *data) +{ + tracing_snapshot(); +} + +static void +snapshot_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + snapshot_trigger(data); +} + +static int +register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + int ret = register_trigger(glob, ops, data, file); + + if (ret > 0 && tracing_alloc_snapshot() != 0) { + unregister_trigger(glob, ops, data, file); + ret = 0; + } + + return ret; +} + +static int +snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("snapshot", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops snapshot_trigger_ops = { + .func = snapshot_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops snapshot_count_trigger_ops = { + .func = snapshot_count_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +snapshot_get_trigger_ops(char *cmd, char *param) +{ + return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; +} + +static struct event_command trigger_snapshot_cmd = { + .name = "snapshot", + .trigger_type = ETT_SNAPSHOT, + .func = event_trigger_callback, + .reg = register_snapshot_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = snapshot_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_snapshot_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_snapshot_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ + +#ifdef CONFIG_STACKTRACE +/* + * Skip 3: + * stacktrace_trigger() + * event_triggers_post_call() + * ftrace_raw_event_xxx() + */ +#define STACK_SKIP 3 + +static void +stacktrace_trigger(struct event_trigger_data *data) +{ + trace_dump_stack(STACK_SKIP); +} + +static void +stacktrace_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + stacktrace_trigger(data); +} + +static int +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("stacktrace", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops stacktrace_trigger_ops = { + .func = stacktrace_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops stacktrace_count_trigger_ops = { + .func = stacktrace_count_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +stacktrace_get_trigger_ops(char *cmd, char *param) +{ + return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; +} + +static struct event_command trigger_stacktrace_cmd = { + .name = "stacktrace", + .trigger_type = ETT_STACKTRACE, + .post_trigger = true, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = stacktrace_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_stacktrace_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_stacktrace_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_stacktrace_cmd(void) { return 0; } +#endif /* CONFIG_STACKTRACE */ + +static __init void unregister_trigger_traceon_traceoff_cmds(void) +{ + unregister_event_command(&trigger_traceon_cmd); + unregister_event_command(&trigger_traceoff_cmd); +} + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct enable_trigger_data { + struct ftrace_event_file *file; + bool enable; +}; + +static void +event_enable_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (enable_data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); +} + +static void +event_enable_count_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_trigger(data); +} + +static int +event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + seq_printf(m, "%s:%s:%s", + enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->file->event_call->class->system, + ftrace_event_name(enable_data->file->event_call)); + + if (data->count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", data->count); + + if (data->filter_str) + seq_printf(m, " if %s\n", data->filter_str); + else + seq_puts(m, "\n"); + + return 0; +} + +static void +event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + trace_event_enable_disable(enable_data->file, 0, 1); + module_put(enable_data->file->event_call->mod); + trigger_data_free(data); + kfree(enable_data); + } +} + +static struct event_trigger_ops event_enable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_enable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static int +event_enable_trigger_func(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct ftrace_event_file *event_enable_file; + struct enable_trigger_data *enable_data; + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + struct trace_array *tr = file->tr; + const char *system; + const char *event; + char *trigger; + char *number; + bool enable; + int ret; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (s:e:n [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + system = strsep(&trigger, ":"); + if (!trigger) + return -EINVAL; + + event = strsep(&trigger, ":"); + + ret = -EINVAL; + event_enable_file = find_event_file(tr, system, event); + if (!event_enable_file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); + if (!enable_data) { + kfree(trigger_data); + goto out; + } + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + enable_data->enable = enable; + enable_data->file = event_enable_file; + trigger_data->private_data = enable_data; + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + kfree(enable_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(event_enable_file->event_call->mod); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = trace_event_enable_disable(event_enable_file, 1, 1); + if (ret < 0) + goto out_put; + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; + out: + return ret; + + out_disable: + trace_event_enable_disable(event_enable_file, 0, 1); + out_put: + module_put(event_enable_file->event_call->mod); + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + kfree(enable_data); + goto out; +} + +static int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct enable_trigger_data *test_enable_data; + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + test_enable_data = test->private_data; + if (test_enable_data && + (test_enable_data->file == enable_data->file)) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +static void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *test_enable_data = test->private_data; + struct enable_trigger_data *enable_data; + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + enable_data = data->private_data; + if (enable_data && + (enable_data->file == test_enable_data->file)) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +static struct event_trigger_ops * +event_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_trigger_ops : + &event_enable_trigger_ops; + else + ops = param ? &event_disable_count_trigger_ops : + &event_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_enable_cmd = { + .name = ENABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_disable_cmd = { + .name = DISABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_enable_cmd); + unregister_event_command(&trigger_disable_cmd); +} + +static __init int register_trigger_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_enable_disable_cmds(); + + return ret; +} + +static __init int register_trigger_traceon_traceoff_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_traceon_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_traceoff_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_traceon_traceoff_cmds(); + + return ret; +} + +__init int register_trigger_cmds(void) +{ + register_trigger_traceon_traceoff_cmds(); + register_trigger_snapshot_cmd(); + register_trigger_stacktrace_cmd(); + register_trigger_enable_disable_cmds(); + + return 0; +} diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae..d4ddde28a81 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,6 +18,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accesible via perf. + */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + /* not needed for this file */ #undef __field_struct #define __field_struct(type, item) @@ -44,21 +54,22 @@ #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -struct ____ftrace_##name { \ - tstruct \ -}; \ -static void __always_unused ____ftrace_check_##name(void) \ -{ \ - struct ____ftrace_##name *__entry = NULL; \ - \ - /* force compile-time check on F_printk() */ \ - printk(print); \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -77,22 +88,19 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ - mutex_unlock(&event_storage_mutex); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; \ } while (0); @@ -104,7 +112,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -112,17 +120,18 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __dynamic_array(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - 0, is_signed_type(type), FILTER_OTHER);\ + 0, is_signed_type(type), filter_type);\ if (ret) \ return ret; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -int \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +static int __init \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ int ret; \ + int filter_type = filter; \ \ tstruct; \ \ @@ -150,24 +159,39 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define __dynamic_array(type, item) #undef F_printk -#define F_printk(fmt, args...) #fmt ", " __stringify(args) +#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ + regfn) \ \ -struct ftrace_event_class event_class_ftrace_##call = { \ +struct ftrace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ + .reg = regfn, \ }; \ \ struct ftrace_event_call __used event_##call = { \ - .name = #call, \ - .event.type = etype, \ .class = &event_class_ftrace_##call, \ + { \ + .name = #call, \ + }, \ + .event.type = etype, \ .print_fmt = print, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ + FTRACE_ENTRY_REG(call, struct_name, etype, \ + PARAMS(tstruct), PARAMS(print), filter, NULL) + +int ftrace_event_is_function(struct ftrace_event_call *call) +{ + return call == &event_function; +} + #include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db0..57f0ec962d2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -7,115 +7,164 @@ * Based on code from the latency_tracer, that is: * * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace.h" -/* function tracing enabled */ -static int ftrace_function_enabled; +static void tracing_start_function_trace(struct trace_array *tr); +static void tracing_stop_function_trace(struct trace_array *tr); +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static struct tracer_flags func_flags; + +/* Our option */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static int allocate_ftrace_ops(struct trace_array *tr) +{ + struct ftrace_ops *ops; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; -static struct trace_array *func_trace; + /* Currently only the non stack verision is supported */ + ops->func = function_trace_call; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; -static void tracing_start_function_trace(void); -static void tracing_stop_function_trace(void); + tr->ops = ops; + ops->private = tr; + return 0; +} + + +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + int ret; + + /* + * The top level array uses the "global_ops", and the files are + * created on boot up. + */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; + + ftrace_create_filter_files(tr->ops, parent); + + return 0; +} + +void ftrace_destroy_function_files(struct trace_array *tr) +{ + ftrace_destroy_filter_files(tr->ops); + kfree(tr->ops); + tr->ops = NULL; +} static int function_trace_init(struct trace_array *tr) { - func_trace = tr; - tr->cpu = get_cpu(); + ftrace_func_t func; + + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. + */ + if (!tr->ops) + return -ENOMEM; + + /* Currently only the global instance can do stack tracing */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && + func_flags.val & TRACE_FUNC_OPT_STACK) + func = function_stack_trace_call; + else + func = function_trace_call; + + ftrace_init_array_ops(tr, func); + + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); - tracing_start_function_trace(); + tracing_start_function_trace(tr); return 0; } static void function_trace_reset(struct trace_array *tr) { - tracing_stop_function_trace(); + tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); + ftrace_reset_array_ops(tr); } static void function_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; - long disabled; + int bit; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; pc = preempt_count(); preempt_disable_notrace(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, pc); - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} + bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + if (bit < 0) + goto out; -static void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + if (!atomic_read(&data->disabled)) { + local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } + trace_clear_recursion(bit); - atomic_dec(&data->disabled); - local_irq_restore(flags); + out: + preempt_enable_notrace(); } static void -function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; long disabled; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; /* @@ -124,7 +173,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) */ local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { @@ -145,24 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ - .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, -}; - -/* Our two options */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, @@ -175,129 +206,159 @@ static struct tracer_flags func_flags = { .opts = func_opts }; -static void tracing_start_function_trace(void) +static void tracing_start_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - register_ftrace_function(&trace_stack_ops); - else - register_ftrace_function(&trace_ops); - - ftrace_function_enabled = 1; + tr->function_enabled = 0; + register_ftrace_function(tr->ops); + tr->function_enabled = 1; } -static void tracing_stop_function_trace(void) +static void tracing_stop_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - unregister_ftrace_function(&trace_stack_ops); - else - unregister_ftrace_function(&trace_ops); + tr->function_enabled = 0; + unregister_ftrace_function(tr->ops); } -static int func_set_flag(u32 old_flags, u32 bit, int set) +static int +func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - if (bit == TRACE_FUNC_OPT_STACK) { + switch (bit) { + case TRACE_FUNC_OPT_STACK: /* do nothing if already set */ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - return 0; + break; + + unregister_ftrace_function(tr->ops); if (set) { - unregister_ftrace_function(&trace_ops); - register_ftrace_function(&trace_stack_ops); + tr->ops->func = function_stack_trace_call; + register_ftrace_function(tr->ops); } else { - unregister_ftrace_function(&trace_stack_ops); - register_ftrace_function(&trace_ops); + tr->ops->func = function_trace_call; + register_ftrace_function(tr->ops); } - return 0; + break; + default: + return -EINVAL; } - return -EINVAL; + return 0; } -static struct tracer function_trace __read_mostly = +static struct tracer function_trace __tracer_data = { .name = "function", .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, + .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif }; #ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +static int update_count(void **data) { - long *count = (long *)data; - - if (tracing_is_on()) - return; + unsigned long *count = (long *)data; if (!*count) - return; + return 0; if (*count != -1) (*count)--; - tracing_on(); + return 1; } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) { - long *count = (long *)data; + if (tracing_is_on()) + return; + + if (update_count(data)) + tracing_on(); +} +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +{ if (!tracing_is_on()) return; - if (!*count) + if (update_count(data)) + tracing_off(); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (tracing_is_on()) return; - if (*count != -1) - (*count)--; + tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; tracing_off(); } -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data); +/* + * Skip 4: + * ftrace_stacktrace() + * function_trace_probe_call() + * ftrace_ops_list_func() + * ftrace_call() + */ +#define STACK_SKIP 4 -static struct ftrace_probe_ops traceon_probe_ops = { - .func = ftrace_traceon, - .print = ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ + trace_dump_stack(STACK_SKIP); +} -static struct ftrace_probe_ops traceoff_probe_ops = { - .func = ftrace_traceoff, - .print = ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; + + if (update_count(data)) + trace_dump_stack(STACK_SKIP); +} + +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ALL); +} + +/* Only dump the current CPU buffer. */ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ORIG); +} static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) +ftrace_probe_print(const char *name, struct seq_file *m, + unsigned long ip, void *data) { long count = (long)data; - seq_printf(m, "%ps:", (void *)ip); - - if (ops == &traceon_probe_ops) - seq_printf(m, "traceon"); - else - seq_printf(m, "traceoff"); + seq_printf(m, "%ps:%s", (void *)ip, name); if (count == -1) seq_printf(m, ":unlimited\n"); @@ -308,26 +369,85 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, } static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +ftrace_traceon_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) { - struct ftrace_probe_ops *ops; + return ftrace_probe_print("traceon", m, ip, data); +} - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("traceoff", m, ip, data); +} - unregister_ftrace_function_probe_func(glob, ops); +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("stacktrace", m, ip, data); +} - return 0; +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("dump", m, ip, data); } static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, - char *glob, char *cmd, char *param, int enable) +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("cpudump", m, ip, data); +} + +static struct ftrace_probe_ops traceon_count_probe_ops = { + .func = ftrace_traceon_count, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { + .func = ftrace_traceoff_count, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { + .func = ftrace_stacktrace_count, + .print = ftrace_stacktrace_print, +}; + +static struct ftrace_probe_ops dump_probe_ops = { + .func = ftrace_dump_probe, + .print = ftrace_dump_print, +}; + +static struct ftrace_probe_ops cpudump_probe_ops = { + .func = ftrace_cpudump_probe, + .print = ftrace_cpudump_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { + .func = ftrace_traceon, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { + .func = ftrace_traceoff, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { + .func = ftrace_stacktrace, + .print = ftrace_stacktrace_print, +}; + +static int +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, + struct ftrace_hash *hash, char *glob, + char *cmd, char *param, int enable) { - struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; @@ -336,14 +456,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, if (!enable) return -EINVAL; - if (glob[0] == '!') - return ftrace_trace_onoff_unreg(glob+1, cmd, param); - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } if (!param) goto out_reg; @@ -357,7 +473,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, * We use the callback data field (which is a pointer) * as our counter. */ - ret = strict_strtoul(number, 0, (unsigned long *)&count); + ret = kstrtoul(number, 0, (unsigned long *)&count); if (ret) return ret; @@ -367,6 +483,60 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, return ret < 0 ? ret : 0; } +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; + else + ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_dump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &dump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + +static int +ftrace_cpudump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &cpudump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -377,6 +547,21 @@ static struct ftrace_func_command ftrace_traceoff_cmd = { .func = ftrace_trace_onoff_callback, }; +static struct ftrace_func_command ftrace_stacktrace_cmd = { + .name = "stacktrace", + .func = ftrace_stacktrace_callback, +}; + +static struct ftrace_func_command ftrace_dump_cmd = { + .name = "dump", + .func = ftrace_dump_callback, +}; + +static struct ftrace_func_command ftrace_cpudump_cmd = { + .name = "cpudump", + .func = ftrace_cpudump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -387,7 +572,31 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) - unregister_ftrace_command(&ftrace_traceoff_cmd); + goto out_free_traceoff; + + ret = register_ftrace_command(&ftrace_stacktrace_cmd); + if (ret) + goto out_free_traceon; + + ret = register_ftrace_command(&ftrace_dump_cmd); + if (ret) + goto out_free_stacktrace; + + ret = register_ftrace_command(&ftrace_cpudump_cmd); + if (ret) + goto out_free_dump; + + return 0; + + out_free_dump: + unregister_ftrace_command(&ftrace_dump_cmd); + out_free_stacktrace: + unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: + unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; } #else @@ -402,5 +611,4 @@ static __init int init_function_trace(void) init_func_cmd_traceon(); return register_tracer(&function_trace); } -device_initcall(init_function_trace); - +core_initcall(init_function_trace); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c653d..4de3e57f723 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -38,14 +38,7 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -/* Flag options */ -#define TRACE_GRAPH_PRINT_OVERRUN 0x1 -#define TRACE_GRAPH_PRINT_CPU 0x2 -#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 -#define TRACE_GRAPH_PRINT_PROC 0x8 -#define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 +static unsigned int max_depth; static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -62,11 +55,13 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, /* Display interrupts */ { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, + /* Display function name after trailing } */ + { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - /* Don't display overruns and proc by default */ + /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts @@ -80,9 +75,9 @@ static struct trace_array *graph_array; * to fill in space into DURATION column. */ enum { - DURATION_FILL_FULL = -1, - DURATION_FILL_START = -2, - DURATION_FILL_END = -3, + FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, }; static enum print_line_t @@ -112,16 +107,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return -EBUSY; } + /* + * The curr_ret_stack is an index to ftrace return stack of + * current task. Its value should be in [0, FTRACE_RETFUNC_ + * DEPTH) when the function graph tracer is used. To support + * filtering out specific functions, it makes the index + * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) + * so when it sees a negative index the ftrace will ignore + * the record. And the index gets recovered when returning + * from the filtered function by adding the FTRACE_NOTRACE_ + * DEPTH and then it'll continue to record functions normally. + * + * The curr_ret_stack is initialized to -1 and get increased + * in this function. So it can be less than -1 only if it was + * filtered out via ftrace_graph_notrace_addr() which can be + * set from set_graph_notrace file in debugfs by user. + */ + if (current->curr_ret_stack < -1) + return -EBUSY; + calltime = trace_clock_local(); index = ++current->curr_ret_stack; + if (ftrace_graph_notrace_addr(func)) + current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; current->ret_stack[index].fp = frame_pointer; - *depth = index; + *depth = current->curr_ret_stack; return 0; } @@ -135,7 +151,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, index = current->curr_ret_stack; - if (unlikely(index < 0)) { + /* + * A negative index here means that it's just returned from a + * notrace'd function. Recover index to get an original + * return address. See ftrace_push_return_trace(). + * + * TODO: Need to check whether the stack gets corrupted. + */ + if (index < 0) + index += FTRACE_NOTRACE_DEPTH; + + if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { ftrace_graph_stop(); WARN_ON(1); /* Might as well panic, otherwise we have no where to go */ @@ -143,7 +169,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it @@ -154,6 +180,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, * * Currently, x86_32 with optimize for size (-Os) makes the latest * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. */ if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); @@ -186,9 +215,24 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); barrier(); current->curr_ret_stack--; + /* + * The curr_ret_stack can be less than -1 only if it was + * filtered out and it's about to return from the function. + * Recover the index and continue to trace normal functions. + */ + if (current->curr_ret_stack < -1) { + current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; + return ret; + } + + /* + * The trace should run after decrementing the ret counter + * in case an interrupt were to come in. We don't want to + * lose the interrupt if max_depth is set. + */ + ftrace_graph_return(&trace); if (unlikely(!ret)) { ftrace_graph_stop(); @@ -207,7 +251,7 @@ int __trace_graph_entry(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -219,8 +263,8 @@ int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) - ring_buffer_unlock_commit(buffer, event); + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); return 1; } @@ -247,13 +291,24 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. */ - if (!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) + if ((!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) || (trace->depth < 0) || + (max_depth && trace->depth >= max_depth)) return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -311,7 +366,7 @@ void __trace_graph_return(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -323,8 +378,8 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) - ring_buffer_unlock_commit(buffer, event); + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); } void trace_graph_return(struct ftrace_graph_ret *trace) @@ -338,7 +393,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -434,7 +489,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* First spaces to align center */ for (i = 0; i < spaces / 2; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -445,7 +500,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* Last spaces to align center */ for (i = 0; i < spaces - (spaces / 2); i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -491,7 +546,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) ------------------------------------------ */ - ret = trace_seq_printf(s, + ret = trace_seq_puts(s, " ------------------------------------------\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -504,7 +559,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " => "); + ret = trace_seq_puts(s, " => "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -512,7 +567,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, + ret = trace_seq_puts(s, "\n ------------------------------------------\n\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -538,7 +593,7 @@ get_return_for_leaf(struct trace_iterator *iter, next = &data->ret; } else { - ring_iter = iter->buffer_iter[iter->cpu]; + ring_iter = trace_buffer_iter(iter, iter->cpu); /* First peek to compare current entry and the next one */ if (ring_iter) @@ -548,9 +603,9 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); } @@ -633,30 +688,30 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } } /* No overhead */ - ret = print_graph_duration(DURATION_FILL_START, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); if (ret != TRACE_TYPE_HANDLED) return ret; if (type == TRACE_GRAPH_ENT) - ret = trace_seq_printf(s, "==========>"); + ret = trace_seq_puts(s, "==========>"); else - ret = trace_seq_printf(s, "<=========="); + ret = trace_seq_puts(s, "<=========="); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = print_graph_duration(DURATION_FILL_END, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); if (ret != TRACE_TYPE_HANDLED) return ret; - ret = trace_seq_printf(s, "\n"); + ret = trace_seq_putc(s, '\n'); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -693,13 +748,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) len += strlen(nsecs_str); } - ret = trace_seq_printf(s, " us "); + ret = trace_seq_puts(s, " us "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Print remaining spaces to fit the row's width */ for (i = len; i < 7; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -717,15 +772,15 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, return TRACE_TYPE_HANDLED; /* No real adata, just filling the column with spaces */ - switch (duration) { - case DURATION_FILL_FULL: - ret = trace_seq_printf(s, " | "); + switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { + case FLAGS_FILL_FULL: + ret = trace_seq_puts(s, " | "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_START: - ret = trace_seq_printf(s, " "); + case FLAGS_FILL_START: + ret = trace_seq_puts(s, " "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_END: - ret = trace_seq_printf(s, " |"); + case FLAGS_FILL_END: + ret = trace_seq_puts(s, " |"); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -733,10 +788,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { /* Duration exceeded 100 msecs */ if (duration > 100000ULL) - ret = trace_seq_printf(s, "! "); + ret = trace_seq_puts(s, "! "); /* Duration exceeded 10 msecs */ else if (duration > 10000ULL) - ret = trace_seq_printf(s, "+ "); + ret = trace_seq_puts(s, "+ "); } /* @@ -745,7 +800,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, * to fill out the space. */ if (ret == -1) - ret = trace_seq_printf(s, " "); + ret = trace_seq_puts(s, " "); /* Catching here any failure happenned above */ if (!ret) @@ -755,7 +810,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, if (ret != TRACE_TYPE_HANDLED) return ret; - ret = trace_seq_printf(s, "| "); + ret = trace_seq_puts(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -805,7 +860,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -840,13 +895,13 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); if (ret != TRACE_TYPE_HANDLED) return ret; /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -905,7 +960,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -1105,7 +1160,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -1114,10 +1169,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * If the return function does not have a matching entry, * then the entry was lost. Instead of just printing * the '}' and letting the user guess what function this - * belongs to, write out the function name. + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. */ - if (func_match) { - ret = trace_seq_printf(s, "}\n"); + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { + ret = trace_seq_puts(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } else { @@ -1160,20 +1216,20 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, return TRACE_TYPE_PARTIAL_LINE; /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); if (ret != TRACE_TYPE_HANDLED) return ret; /* Indentation */ if (depth > 0) for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* The comment */ - ret = trace_seq_printf(s, "/* "); + ret = trace_seq_puts(s, "/* "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1204,7 +1260,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, s->len--; } - ret = trace_seq_printf(s, " */\n"); + ret = trace_seq_puts(s, " */\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1414,7 +1470,8 @@ void graph_trace_close(struct trace_iterator *iter) } } -static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +static int +func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (bit == TRACE_GRAPH_PRINT_IRQS) ftrace_graph_skip_irqs = !set; @@ -1436,13 +1493,12 @@ static struct trace_event graph_trace_ret_event = { .funcs = &graph_functions }; -static struct tracer graph_trace __read_mostly = { +static struct tracer graph_trace __tracer_data = { .name = "function_graph", .open = graph_trace_open, .pipe_open = graph_trace_open, .close = graph_trace_close, .pipe_close = graph_trace_close, - .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, @@ -1454,6 +1510,59 @@ static struct tracer graph_trace __read_mostly = { #endif }; + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + max_depth = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ + int n; + + n = sprintf(buf, "%d\n", max_depth); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { + .open = tracing_open_generic, + .write = graph_depth_write, + .read = graph_depth_read, + .llseek = generic_file_llseek, +}; + +static __init int init_graph_debugfs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("max_graph_depth", 0644, d_tracer, + NULL, &graph_depth_fops); + + return 0; +} +fs_initcall(init_graph_debugfs); + static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); @@ -1471,4 +1580,4 @@ static __init int init_graph_trace(void) return register_tracer(&graph_trace); } -device_initcall(init_graph_trace); +core_initcall(init_graph_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 99d20e92036..9bb104f748d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -7,7 +7,7 @@ * From code in the latency_tracer, that is: * * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/kallsyms.h> #include <linux/debugfs.h> @@ -32,7 +32,8 @@ enum { static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; +static bool function_enabled; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr, if (!irqs_disabled_flags(*flags)) return 0; - *data = tr->data[cpu]; + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) @@ -136,7 +137,8 @@ static int func_prolog_dec(struct trace_array *tr, * irqsoff uses its own tracer function to keep the overhead down: */ static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; @@ -149,16 +151,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) atomic_dec(&data->disabled); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, -}; #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { int cpu; @@ -173,8 +170,8 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) for_each_possible_cpu(cpu) per_cpu(tracing_cpu, cpu) = 0; - tracing_max_latency = 0; - tracing_reset_online_cpus(irqsoff_trace); + tr->max_latency = 0; + tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); } @@ -264,7 +261,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } @@ -299,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -329,13 +327,13 @@ check_critical_timing(struct trace_array *tr, pc = preempt_count(); - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out; raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out_unlock; __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); @@ -348,7 +346,7 @@ check_critical_timing(struct trace_array *tr, data->critical_end = parent_ip; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + tr->max_latency = delta; update_max_tr_single(tr, current, cpu); } @@ -371,7 +369,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + if (!tracer_enabled || !tracing_is_enabled()) return; cpu = raw_smp_processor_id(); @@ -379,7 +377,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (per_cpu(tracing_cpu, cpu)) return; - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || atomic_read(&data->disabled)) return; @@ -414,10 +412,10 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) else return; - if (!tracer_enabled) + if (!tracer_enabled || !tracing_is_enabled()) return; - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) @@ -496,14 +494,14 @@ void trace_hardirqs_off(void) } EXPORT_SYMBOL(trace_hardirqs_off); -void trace_hardirqs_on_caller(unsigned long caller_addr) +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void trace_hardirqs_off_caller(unsigned long caller_addr) +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); @@ -527,15 +525,62 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int start_irqsoff_tracer(struct trace_array *tr, int graph) +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { - int ret = 0; + int ret; - if (!graph) - ret = register_ftrace_function(&trace_ops); - else + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); + else + ret = register_ftrace_function(tr->ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_irqsoff_function(struct trace_array *tr, int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(tr->ops); + + function_enabled = false; +} + +static void irqsoff_function_set(struct trace_array *tr, int set) +{ + if (set) + register_irqsoff_function(tr, is_graph(), 1); + else + unregister_irqsoff_function(tr, is_graph()); +} + +static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) +{ + struct tracer *tracer = tr->current_trace; + + if (mask & TRACE_ITER_FUNCTION) + irqsoff_function_set(tr, set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_irqsoff_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -549,33 +594,51 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); + unregister_irqsoff_function(tr, graph); } -static void __irqsoff_tracer_init(struct trace_array *tr) +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + if (irqsoff_busy) + return -EBUSY; + + save_flags = trace_flags; - tracing_max_latency = 0; + /* non overwrite screws up the latency tracers */ + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + + tr->max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); + + ftrace_init_array_ops(tr, irqsoff_tracer_call); - if (start_irqsoff_tracer(tr, is_graph())) + /* Only toplevel instance supports graph tracing */ + if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && + is_graph()))) printk(KERN_ERR "failed to start irqsoff tracer\n"); + + irqsoff_busy = true; + return 0; } static void irqsoff_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_irqsoff_tracer(tr, is_graph()); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + + irqsoff_busy = false; } static void irqsoff_tracer_start(struct trace_array *tr) @@ -593,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer irqsoff_tracer __read_mostly = { @@ -603,17 +665,19 @@ static struct tracer irqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .allow_instances = true, + .use_max_tr = true, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -625,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptoff_tracer __read_mostly = @@ -636,17 +699,19 @@ static struct tracer preemptoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .allow_instances = true, + .use_max_tr = true, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -660,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptirqsoff_tracer __read_mostly = @@ -671,17 +735,19 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = irqsoff_print_header, .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, - .use_max_tr = 1, + .allow_instances = true, + .use_max_tr = true, }; # define register_preemptirqsoff(trace) register_tracer(&trace) @@ -697,4 +763,4 @@ __init static int init_irqsoff_tracer(void) return 0; } -device_initcall(init_irqsoff_tracer); +core_initcall(init_irqsoff_tracer); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b..bd90e1b0608 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags; @@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -83,7 +83,7 @@ out: trace_flags = old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } for_each_tracing_cpu(cpu) @@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv) !cpu_online(cpu_file)) return KDB_BADINT; } else { - cpu_file = TRACE_PIPE_ALL_CPU; + cpu_file = RING_BUFFER_ALL_CPUS; } kdb_trap_printk++; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 00d527c945a..282f6e4e553 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -19,185 +19,134 @@ #include <linux/module.h> #include <linux/uaccess.h> -#include <linux/kprobes.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/smp.h> -#include <linux/debugfs.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/ctype.h> -#include <linux/ptrace.h> -#include <linux/perf_event.h> -#include <linux/stringify.h> -#include <linux/limits.h> -#include <asm/bitsperlong.h> - -#include "trace.h" -#include "trace_output.h" - -#define MAX_TRACE_ARGS 128 -#define MAX_ARGSTR_LEN 63 -#define MAX_EVENT_NAME_LEN 64 -#define MAX_STRING_SIZE PATH_MAX + +#include "trace_probe.h" + #define KPROBE_EVENT_SYSTEM "kprobes" -/* Reserved field names */ -#define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_RETIP "__probe_ret_ip" -#define FIELD_STRING_FUNC "__probe_func" - -const char *reserved_field_names[] = { - "common_type", - "common_flags", - "common_preempt_count", - "common_pid", - "common_tgid", - FIELD_STRING_IP, - FIELD_STRING_RETIP, - FIELD_STRING_FUNC, +/** + * Kprobe event core functions + */ +struct trace_kprobe { + struct list_head list; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long nhit; + const char *symbol; /* symbol name */ + struct trace_probe tp; }; -/* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, - void *); -#define PRINT_TYPE_FUNC_NAME(type) print_type_##type -#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type - -/* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ -static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent)\ -{ \ - return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ -} \ -static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) - -/* data_rloc: data relative location, compatible with u32 */ -#define make_data_rloc(len, roffs) \ - (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) -#define get_rloc_len(dl) ((u32)(dl) >> 16) -#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) - -static inline void *get_rloc_data(u32 *dl) +#define SIZEOF_TRACE_KPROBE(n) \ + (offsetof(struct trace_kprobe, tp.args) + \ + (sizeof(struct probe_arg) * (n))) + + +static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { - return (u8 *)dl + get_rloc_offs(*dl); + return tk->rp.handler != NULL; } -/* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) +static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk) { - return (u8 *)ent + get_rloc_offs(*dl); + return tk->symbol ? tk->symbol : "unknown"; } -/* - * Convert data_rloc to data_loc: - * data_rloc stores the offset from data_rloc itself, but data_loc - * stores the offset from event entry. - */ -#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) +static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk) +{ + return tk->rp.kp.offset; +} -/* For defining macros, define string/string_size types */ -typedef u32 string; -typedef u32 string_size; +static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) +{ + return !!(kprobe_gone(&tk->rp.kp)); +} -/* Print type function for string type */ -static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, - const char *name, - void *data, void *ent) +static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, + struct module *mod) { - int len = *(u32 *)data >> 16; + int len = strlen(mod->name); + const char *name = trace_kprobe_symbol(tk); + return strncmp(mod->name, name, len) == 0 && name[len] == ':'; +} - if (!len) - return trace_seq_printf(s, " %s=(fault)", name); - else - return trace_seq_printf(s, " %s=\"%s\"", name, - (const char *)get_loc_data(data, ent)); +static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +{ + return !!strchr(trace_kprobe_symbol(tk), ':'); } -static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; -/* Data fetch function type */ -typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); +static int register_kprobe_event(struct trace_kprobe *tk); +static int unregister_kprobe_event(struct trace_kprobe *tk); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); -struct fetch_param { - fetch_func_t fn; - void *data; +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; }; -static __kprobes void call_fetch(struct fetch_param *fprm, - struct pt_regs *regs, void *dest) +unsigned long update_symbol_cache(struct symbol_cache *sc) { - return fprm->fn(regs, fprm->data, dest); + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + + if (sc->addr) + sc->addr += sc->offset; + + return sc->addr; } -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) - -#define CHECK_FETCH_FUNCS(method, fn) \ - (((FETCH_FUNC_NAME(method, u8) == fn) || \ - (FETCH_FUNC_NAME(method, u16) == fn) || \ - (FETCH_FUNC_NAME(method, u32) == fn) || \ - (FETCH_FUNC_NAME(method, u64) == fn) || \ - (FETCH_FUNC_NAME(method, string) == fn) || \ - (FETCH_FUNC_NAME(method, string_size) == fn)) \ - && (fn != NULL)) - -/* Data fetch function templates */ -#define DEFINE_FETCH_reg(type) \ -static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_register(regs, \ - (unsigned int)((unsigned long)offset)); \ +void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + update_symbol_cache(sc); + + return sc; } -DEFINE_BASIC_FETCH_FUNCS(reg) -/* No string on the register */ -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL +/* + * Kprobes-specific fetch functions + */ #define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ (unsigned int)((unsigned long)offset)); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); + DEFINE_BASIC_FETCH_FUNCS(stack) /* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_retval(type) \ -static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ - void *dummy, void *dest) \ -{ \ - *(type *)dest = (type)regs_return_value(regs); \ -} -DEFINE_BASIC_FETCH_FUNCS(retval) -/* No string on the retval */ -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL #define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ void *addr, void *dest) \ { \ type retval; \ @@ -205,31 +154,37 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ else \ *(type *)dest = retval; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); + DEFINE_BASIC_FETCH_FUNCS(memory) /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) { long ret; int maxlen = get_rloc_len(*(u32 *)dest); u8 *dst = get_rloc_data(dest); u8 *src = addr; mm_segment_t old_fs = get_fs(); + if (!maxlen) return; + /* * Try to get string again, since the string can be changed while * probing. */ set_fs(KERNEL_DS); pagefault_disable(); + do ret = __copy_from_user_inatomic(dst++, src++, 1); while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + dst[-1] = '\0'; pagefault_enable(); set_fs(old_fs); @@ -237,24 +192,30 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, if (ret < 0) { /* Failed to fetch string */ ((u8 *)get_rloc_data(dest))[0] = '\0'; *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else + } else { *(u32 *)dest = make_data_rloc(src - (u8 *)addr, get_rloc_offs(*(u32 *)dest)); + } } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); + /* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) { + mm_segment_t old_fs; int ret, len = 0; u8 c; - mm_segment_t old_fs = get_fs(); + old_fs = get_fs(); set_fs(KERNEL_DS); pagefault_disable(); + do { ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); + pagefault_enable(); set_fs(old_fs); @@ -263,210 +224,33 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, else *(u32 *)dest = len; } - -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -static unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - if (sc->addr) - sc->addr += sc->offset; - return sc->addr; -} - -static void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - - update_symbol_cache(sc); - return sc; -} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); #define DEFINE_FETCH_symbol(type) \ -static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ - void *data, void *dest) \ +void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ { \ struct symbol_cache *sc = data; \ if (sc->addr) \ fetch_memory_##type(regs, (void *)sc->addr, dest); \ else \ *(type *)dest = 0; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); + DEFINE_BASIC_FETCH_FUNCS(symbol) DEFINE_FETCH_symbol(string) DEFINE_FETCH_symbol(string_size) -/* Dereference memory access function */ -struct deref_fetch_param { - struct fetch_param orig; - long offset; -}; - -#define DEFINE_FETCH_deref(type) \ -static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct deref_fetch_param *dprm = data; \ - unsigned long addr; \ - call_fetch(&dprm->orig, regs, &addr); \ - if (addr) { \ - addr += dprm->offset; \ - fetch_memory_##type(regs, (void *)addr, dest); \ - } else \ - *(type *)dest = 0; \ -} -DEFINE_BASIC_FETCH_FUNCS(deref) -DEFINE_FETCH_deref(string) -DEFINE_FETCH_deref(string_size) - -static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} - -/* Bitfield fetch function */ -struct bitfield_fetch_param { - struct fetch_param orig; - unsigned char hi_shift; - unsigned char low_shift; -}; - -#define DEFINE_FETCH_bitfield(type) \ -static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct bitfield_fetch_param *bprm = data; \ - type buf = 0; \ - call_fetch(&bprm->orig, regs, &buf); \ - if (buf) { \ - buf <<= bprm->hi_shift; \ - buf >>= bprm->low_shift; \ - } \ - *(type *)dest = buf; \ -} -DEFINE_BASIC_FETCH_FUNCS(bitfield) -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -static __kprobes void -update_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static __kprobes void -free_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} - -/* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t -#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) -#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) -#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) - -/* Fetch types */ -enum { - FETCH_MTD_reg = 0, - FETCH_MTD_stack, - FETCH_MTD_retval, - FETCH_MTD_memory, - FETCH_MTD_symbol, - FETCH_MTD_deref, - FETCH_MTD_bitfield, - FETCH_MTD_END, -}; - -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) - -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - {.name = _name, \ - .size = _size, \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ - .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ - } \ - } - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 +/* kprobes don't support file_offset fetch methods */ +#define fetch_file_offset_u8 NULL +#define fetch_file_offset_u16 NULL +#define fetch_file_offset_u32 NULL +#define fetch_file_offset_u64 NULL +#define fetch_file_offset_string NULL +#define fetch_file_offset_string_size NULL /* Fetch type information table */ -static const struct fetch_type { - const char *name; /* Name of type */ - size_t size; /* Byte size of type */ - int is_signed; /* Signed flag */ - print_type_func_t print; /* Print functions */ - const char *fmt; /* Fromat string */ - const char *fmttype; /* Name in format file */ - /* Fetch functions */ - fetch_func_t fetch[FETCH_MTD_END]; -} fetch_type_table[] = { +const struct fetch_type kprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -481,207 +265,49 @@ static const struct fetch_type { ASSIGN_FETCH_TYPE(s16, u16, 1), ASSIGN_FETCH_TYPE(s32, u32, 1), ASSIGN_FETCH_TYPE(s64, u64, 1), -}; - -static const struct fetch_type *find_fetch_type(const char *type) -{ - int i; - if (!type) - type = DEFAULT_FETCH_TYPE_STR; - - /* Special case: bitfield */ - if (*type == 'b') { - unsigned long bs; - type = strchr(type, '/'); - if (!type) - goto fail; - type++; - if (strict_strtoul(type, 0, &bs)) - goto fail; - switch (bs) { - case 8: - return find_fetch_type("u8"); - case 16: - return find_fetch_type("u16"); - case 32: - return find_fetch_type("u32"); - case 64: - return find_fetch_type("u64"); - default: - goto fail; - } - } - - for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) - if (strcmp(type, fetch_type_table[i].name) == 0) - return &fetch_type_table[i]; -fail: - return NULL; -} - -/* Special function : only accept unsigned long */ -static __kprobes void fetch_stack_address(struct pt_regs *regs, - void *dummy, void *dest) -{ - *(unsigned long *)dest = kernel_stack_pointer(regs); -} - -static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn) -{ - int i; - - if (type != &fetch_type_table[FETCH_TYPE_STRING]) - return NULL; /* Only string type needs size function */ - for (i = 0; i < FETCH_MTD_END; i++) - if (type->fetch[i] == orig_fn) - return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; - - WARN_ON(1); /* This should not happen */ - return NULL; -} - -/** - * Kprobe event core functions - */ - -struct probe_arg { - struct fetch_param fetch; - struct fetch_param fetch_size; - unsigned int offset; /* Offset from argument entry */ - const char *name; /* Name of this argument */ - const char *comm; /* Command of this argument */ - const struct fetch_type *type; /* Type of this argument */ + ASSIGN_FETCH_TYPE_END }; -/* Flags for trace_probe */ -#define TP_FLAG_TRACE 1 -#define TP_FLAG_PROFILE 2 -#define TP_FLAG_REGISTERED 4 - -struct trace_probe { - struct list_head list; - struct kretprobe rp; /* Use rp.kp for kprobe use */ - unsigned long nhit; - unsigned int flags; /* For TP_FLAG_* */ - const char *symbol; /* symbol name */ - struct ftrace_event_class class; - struct ftrace_event_call call; - ssize_t size; /* trace entry size */ - unsigned int nr_args; - struct probe_arg args[]; -}; - -#define SIZEOF_TRACE_PROBE(n) \ - (offsetof(struct trace_probe, args) + \ - (sizeof(struct probe_arg) * (n))) - - -static __kprobes int trace_probe_is_return(struct trace_probe *tp) -{ - return tp->rp.handler != NULL; -} - -static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) -{ - return tp->symbol ? tp->symbol : "unknown"; -} - -static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) -{ - return tp->rp.kp.offset; -} - -static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) -{ - return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); -} - -static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) -{ - return !!(tp->flags & TP_FLAG_REGISTERED); -} - -static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) -{ - return !!(kprobe_gone(&tp->rp.kp)); -} - -static __kprobes bool trace_probe_within_module(struct trace_probe *tp, - struct module *mod) -{ - int len = strlen(mod->name); - const char *name = trace_probe_symbol(tp); - return strncmp(mod->name, name, len) == 0 && name[len] == ':'; -} - -static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) -{ - return !!strchr(trace_probe_symbol(tp), ':'); -} - -static int register_probe_event(struct trace_probe *tp); -static void unregister_probe_event(struct trace_probe *tp); - -static DEFINE_MUTEX(probe_lock); -static LIST_HEAD(probe_list); - -static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); -static int kretprobe_dispatcher(struct kretprobe_instance *ri, - struct pt_regs *regs); - -/* Check the name is good for event/group/fields */ -static int is_good_name(const char *name) -{ - if (!isalpha(*name) && *name != '_') - return 0; - while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return 0; - } - return 1; -} - /* * Allocate new trace_probe and initialize it (including kprobes). */ -static struct trace_probe *alloc_trace_probe(const char *group, +static struct trace_kprobe *alloc_trace_kprobe(const char *group, const char *event, void *addr, const char *symbol, unsigned long offs, - int nargs, int is_return) + int nargs, bool is_return) { - struct trace_probe *tp; + struct trace_kprobe *tk; int ret = -ENOMEM; - tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); - if (!tp) + tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + if (!tk) return ERR_PTR(ret); if (symbol) { - tp->symbol = kstrdup(symbol, GFP_KERNEL); - if (!tp->symbol) + tk->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tk->symbol) goto error; - tp->rp.kp.symbol_name = tp->symbol; - tp->rp.kp.offset = offs; + tk->rp.kp.symbol_name = tk->symbol; + tk->rp.kp.offset = offs; } else - tp->rp.kp.addr = addr; + tk->rp.kp.addr = addr; if (is_return) - tp->rp.handler = kretprobe_dispatcher; + tk->rp.handler = kretprobe_dispatcher; else - tp->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.kp.pre_handler = kprobe_dispatcher; if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; } - tp->call.class = &tp->class; - tp->call.name = kstrdup(event, GFP_KERNEL); - if (!tp->call.name) + tk->tp.call.class = &tk->tp.class; + tk->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tk->tp.call.name) goto error; if (!group || !is_good_name(group)) { @@ -689,130 +315,166 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; } - tp->class.system = kstrdup(group, GFP_KERNEL); - if (!tp->class.system) + tk->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tk->tp.class.system) goto error; - INIT_LIST_HEAD(&tp->list); - return tp; + INIT_LIST_HEAD(&tk->list); + INIT_LIST_HEAD(&tk->tp.files); + return tk; error: - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); return ERR_PTR(ret); } -static void update_probe_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - update_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - update_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - update_symbol_cache(arg->fetch.data); -} - -static void free_probe_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - free_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - free_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - free_symbol_cache(arg->fetch.data); - kfree(arg->name); - kfree(arg->comm); -} - -static void free_trace_probe(struct trace_probe *tp) +static void free_trace_kprobe(struct trace_kprobe *tk) { int i; - for (i = 0; i < tp->nr_args; i++) - free_probe_arg(&tp->args[i]); + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_free_probe_arg(&tk->tp.args[i]); - kfree(tp->call.class->system); - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); + kfree(tk->tp.call.class->system); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); } -static struct trace_probe *find_trace_probe(const char *event, - const char *group) +static struct trace_kprobe *find_trace_kprobe(const char *event, + const char *group) { - struct trace_probe *tp; + struct trace_kprobe *tk; - list_for_each_entry(tp, &probe_list, list) - if (strcmp(tp->call.name, event) == 0 && - strcmp(tp->call.class->system, group) == 0) - return tp; + list_for_each_entry(tk, &probe_list, list) + if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 && + strcmp(tk->tp.call.class->system, group) == 0) + return tk; return NULL; } -/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static int enable_trace_probe(struct trace_probe *tp, int flag) +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int +enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) { int ret = 0; - tp->flags |= flag; - if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && - !trace_probe_has_gone(tp)) { - if (trace_probe_is_return(tp)) - ret = enable_kretprobe(&tp->rp); + if (file) { + struct event_file_link *link; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + ret = -ENOMEM; + goto out; + } + + link->file = file; + list_add_tail_rcu(&link->list, &tk->tp.files); + + tk->tp.flags |= TP_FLAG_TRACE; + } else + tk->tp.flags |= TP_FLAG_PROFILE; + + if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); else - ret = enable_kprobe(&tp->rp.kp); + ret = enable_kprobe(&tk->rp.kp); } - + out: return ret; } -/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ -static void disable_trace_probe(struct trace_probe *tp, int flag) +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int +disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) { - tp->flags &= ~flag; - if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); + struct event_file_link *link = NULL; + int wait = 0; + int ret = 0; + + if (file) { + link = find_event_file_link(&tk->tp, file); + if (!link) { + ret = -EINVAL; + goto out; + } + + list_del_rcu(&link->list); + wait = 1; + if (!list_empty(&tk->tp.files)) + goto out; + + tk->tp.flags &= ~TP_FLAG_TRACE; + } else + tk->tp.flags &= ~TP_FLAG_PROFILE; + + if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); else - disable_kprobe(&tp->rp.kp); + disable_kprobe(&tk->rp.kp); + wait = 1; } + out: + if (wait) { + /* + * Synchronize with kprobe_trace_func/kretprobe_trace_func + * to ensure disabled (all running handlers are finished). + * This is not only for kfree(), but also the caller, + * trace_remove_event_call() supposes it for releasing + * event_call related objects, which will be accessed in + * the kprobe_trace_func/kretprobe_trace_func. + */ + synchronize_sched(); + kfree(link); /* Ignored if link == NULL */ + } + + return ret; } /* Internal register function - just handle k*probes and flags */ -static int __register_trace_probe(struct trace_probe *tp) +static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; - if (trace_probe_is_registered(tp)) + if (trace_probe_is_registered(&tk->tp)) return -EINVAL; - for (i = 0; i < tp->nr_args; i++) - update_probe_arg(&tp->args[i]); + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_update_arg(&tk->tp.args[i]); /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(tp)) - tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + if (trace_probe_is_enabled(&tk->tp)) + tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; else - tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + tk->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (trace_probe_is_return(tp)) - ret = register_kretprobe(&tp->rp); + if (trace_kprobe_is_return(tk)) + ret = register_kretprobe(&tk->rp); else - ret = register_kprobe(&tp->rp.kp); + ret = register_kprobe(&tk->rp.kp); if (ret == 0) - tp->flags |= TP_FLAG_REGISTERED; + tk->tp.flags |= TP_FLAG_REGISTERED; else { pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_probe_symbol(tp), trace_probe_offset(tp), ret); - if (ret == -ENOENT && trace_probe_is_on_module(tp)) { + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { pr_warning("This probe might be able to register after" "target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { pr_warning("Probing address(0x%p) is not an " "instruction boundary.\n", - tp->rp.kp.addr); + tk->rp.kp.addr); ret = -EINVAL; } } @@ -821,64 +483,68 @@ static int __register_trace_probe(struct trace_probe *tp) } /* Internal unregister function - just handle k*probes and flags */ -static void __unregister_trace_probe(struct trace_probe *tp) +static void __unregister_trace_kprobe(struct trace_kprobe *tk) { - if (trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - unregister_kretprobe(&tp->rp); + if (trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + unregister_kretprobe(&tk->rp); else - unregister_kprobe(&tp->rp.kp); - tp->flags &= ~TP_FLAG_REGISTERED; + unregister_kprobe(&tk->rp.kp); + tk->tp.flags &= ~TP_FLAG_REGISTERED; /* Cleanup kprobe for reuse */ - if (tp->rp.kp.symbol_name) - tp->rp.kp.addr = NULL; + if (tk->rp.kp.symbol_name) + tk->rp.kp.addr = NULL; } } /* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static int unregister_trace_probe(struct trace_probe *tp) +static int unregister_trace_kprobe(struct trace_kprobe *tk) { /* Enabled event can not be unregistered */ - if (trace_probe_is_enabled(tp)) + if (trace_probe_is_enabled(&tk->tp)) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_kprobe_event(tk)) return -EBUSY; - __unregister_trace_probe(tp); - list_del(&tp->list); - unregister_probe_event(tp); + __unregister_trace_kprobe(tk); + list_del(&tk->list); return 0; } /* Register a trace_probe and probe_event */ -static int register_trace_probe(struct trace_probe *tp) +static int register_trace_kprobe(struct trace_kprobe *tk) { - struct trace_probe *old_tp; + struct trace_kprobe *old_tk; int ret; mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tp = find_trace_probe(tp->call.name, tp->call.class->system); - if (old_tp) { - ret = unregister_trace_probe(old_tp); + old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call), + tk->tp.call.class->system); + if (old_tk) { + ret = unregister_trace_kprobe(old_tk); if (ret < 0) goto end; - free_trace_probe(old_tp); + free_trace_kprobe(old_tk); } /* Register new event */ - ret = register_probe_event(tp); + ret = register_kprobe_event(tk); if (ret) { pr_warning("Failed to register probe event(%d)\n", ret); goto end; } /* Register k*probe */ - ret = __register_trace_probe(tp); + ret = __register_trace_kprobe(tk); if (ret < 0) - unregister_probe_event(tp); + unregister_kprobe_event(tk); else - list_add_tail(&tp->list, &probe_list); + list_add_tail(&tk->list, &probe_list); end: mutex_unlock(&probe_lock); @@ -886,11 +552,11 @@ end: } /* Module notifier call back, checking event on the module */ -static int trace_probe_module_callback(struct notifier_block *nb, +static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct module *mod = data; - struct trace_probe *tp; + struct trace_kprobe *tk; int ret; if (val != MODULE_STATE_COMING) @@ -898,15 +564,16 @@ static int trace_probe_module_callback(struct notifier_block *nb, /* Update probes on coming module */ mutex_lock(&probe_lock); - list_for_each_entry(tp, &probe_list, list) { - if (trace_probe_within_module(tp, mod)) { + list_for_each_entry(tk, &probe_list, list) { + if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ - __unregister_trace_probe(tp); - ret = __register_trace_probe(tp); + __unregister_trace_kprobe(tk); + ret = __register_trace_kprobe(tk); if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - tp->call.name, mod->name, ret); + ftrace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -914,233 +581,12 @@ static int trace_probe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; } -static struct notifier_block trace_probe_module_nb = { - .notifier_call = trace_probe_module_callback, +static struct notifier_block trace_kprobe_module_nb = { + .notifier_call = trace_kprobe_module_callback, .priority = 1 /* Invoked after kprobe module callback */ }; -/* Split symbol and offset. */ -static int split_symbol_offset(char *symbol, unsigned long *offset) -{ - char *tmp; - int ret; - - if (!offset) - return -EINVAL; - - tmp = strchr(symbol, '+'); - if (tmp) { - /* skip sign because strict_strtol doesn't accept '+' */ - ret = strict_strtoul(tmp + 1, 0, offset); - if (ret) - return ret; - *tmp = '\0'; - } else - *offset = 0; - return 0; -} - -#define PARAM_MAX_ARGS 16 -#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) - -static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, int is_return) -{ - int ret = 0; - unsigned long param; - - if (strcmp(arg, "retval") == 0) { - if (is_return) - f->fn = t->fetch[FETCH_MTD_retval]; - else - ret = -EINVAL; - } else if (strncmp(arg, "stack", 5) == 0) { - if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) - f->fn = fetch_stack_address; - else - ret = -EINVAL; - } else if (isdigit(arg[5])) { - ret = strict_strtoul(arg + 5, 10, ¶m); - if (ret || param > PARAM_MAX_STACK) - ret = -EINVAL; - else { - f->fn = t->fetch[FETCH_MTD_stack]; - f->data = (void *)param; - } - } else - ret = -EINVAL; - } else - ret = -EINVAL; - return ret; -} - -/* Recursive argument parser */ -static int __parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, int is_return) -{ - int ret = 0; - unsigned long param; - long offset; - char *tmp; - - switch (arg[0]) { - case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return); - break; - case '%': /* named register */ - ret = regs_query_register_offset(arg + 1); - if (ret >= 0) { - f->fn = t->fetch[FETCH_MTD_reg]; - f->data = (void *)(unsigned long)ret; - ret = 0; - } - break; - case '@': /* memory or symbol */ - if (isdigit(arg[1])) { - ret = strict_strtoul(arg + 1, 0, ¶m); - if (ret) - break; - f->fn = t->fetch[FETCH_MTD_memory]; - f->data = (void *)param; - } else { - ret = split_symbol_offset(arg + 1, &offset); - if (ret) - break; - f->data = alloc_symbol_cache(arg + 1, offset); - if (f->data) - f->fn = t->fetch[FETCH_MTD_symbol]; - } - break; - case '+': /* deref memory */ - arg++; /* Skip '+', because strict_strtol() rejects it. */ - case '-': - tmp = strchr(arg, '('); - if (!tmp) - break; - *tmp = '\0'; - ret = strict_strtol(arg, 0, &offset); - if (ret) - break; - arg = tmp + 1; - tmp = strrchr(arg, ')'); - if (tmp) { - struct deref_fetch_param *dprm; - const struct fetch_type *t2 = find_fetch_type(NULL); - *tmp = '\0'; - dprm = kzalloc(sizeof(struct deref_fetch_param), - GFP_KERNEL); - if (!dprm) - return -ENOMEM; - dprm->offset = offset; - ret = __parse_probe_arg(arg, t2, &dprm->orig, - is_return); - if (ret) - kfree(dprm); - else { - f->fn = t->fetch[FETCH_MTD_deref]; - f->data = (void *)dprm; - } - } - break; - } - if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ - pr_info("%s type has no corresponding fetch method.\n", - t->name); - ret = -EINVAL; - } - return ret; -} - -#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) - -/* Bitfield type needs to be parsed into a fetch function */ -static int __parse_bitfield_probe_arg(const char *bf, - const struct fetch_type *t, - struct fetch_param *f) -{ - struct bitfield_fetch_param *bprm; - unsigned long bw, bo; - char *tail; - - if (*bf != 'b') - return 0; - - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - return -ENOMEM; - bprm->orig = *f; - f->fn = t->fetch[FETCH_MTD_bitfield]; - f->data = (void *)bprm; - - bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ - if (bw == 0 || *tail != '@') - return -EINVAL; - - bf = tail + 1; - bo = simple_strtoul(bf, &tail, 0); - if (tail == bf || *tail != '/') - return -EINVAL; - - bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); - bprm->low_shift = bprm->hi_shift + bo; - return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; -} - -/* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct trace_probe *tp, - struct probe_arg *parg, int is_return) -{ - const char *t; - int ret; - - if (strlen(arg) > MAX_ARGSTR_LEN) { - pr_info("Argument is too long.: %s\n", arg); - return -ENOSPC; - } - parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - pr_info("Failed to allocate memory for command '%s'.\n", arg); - return -ENOMEM; - } - t = strchr(parg->comm, ':'); - if (t) { - arg[t - parg->comm] = '\0'; - t++; - } - parg->type = find_fetch_type(t); - if (!parg->type) { - pr_info("Unsupported type: %s\n", t); - return -EINVAL; - } - parg->offset = tp->size; - tp->size += parg->type->size; - ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); - if (ret >= 0 && t != NULL) - ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); - if (ret >= 0) { - parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn); - parg->fetch_size.data = parg->fetch.data; - } - return ret; -} - -/* Return 1 if name is reserved or already used by another argument */ -static int conflict_field_name(const char *name, - struct probe_arg *args, int narg) -{ - int i; - for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) - if (strcmp(reserved_field_names[i], name) == 0) - return 1; - for (i = 0; i < narg; i++) - if (strcmp(args[i].name, name) == 0) - return 1; - return 0; -} - -static int create_trace_probe(int argc, char **argv) +static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: @@ -1160,9 +606,9 @@ static int create_trace_probe(int argc, char **argv) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_probe *tp; + struct trace_kprobe *tk; int i, ret = 0; - int is_return = 0, is_delete = 0; + bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; char *arg; unsigned long offset = 0; @@ -1171,11 +617,11 @@ static int create_trace_probe(int argc, char **argv) /* argc must be >= 1 */ if (argv[0][0] == 'p') - is_return = 0; + is_return = false; else if (argv[0][0] == 'r') - is_return = 1; + is_return = true; else if (argv[0][0] == '-') - is_delete = 1; + is_delete = true; else { pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); @@ -1207,16 +653,16 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } mutex_lock(&probe_lock); - tp = find_trace_probe(event, group); - if (!tp) { + tk = find_trace_kprobe(event, group); + if (!tk) { mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ - ret = unregister_trace_probe(tp); + ret = unregister_trace_kprobe(tk); if (ret == 0) - free_trace_probe(tp); + free_trace_kprobe(tk); mutex_unlock(&probe_lock); return ret; } @@ -1231,7 +677,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); + ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; @@ -1240,7 +686,7 @@ static int create_trace_probe(int argc, char **argv) /* a symbol specified */ symbol = argv[1]; /* TODO: support .init module functions */ - ret = split_symbol_offset(symbol, &offset); + ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret) { pr_info("Failed to parse symbol.\n"); return ret; @@ -1263,46 +709,49 @@ static int create_trace_probe(int argc, char **argv) is_return ? 'r' : 'p', addr); event = buf; } - tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, is_return); - if (IS_ERR(tp)) { + if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tp)); - return PTR_ERR(tp); + (int)PTR_ERR(tk)); + return PTR_ERR(tk); } /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + /* Increment count for freeing args in error case */ - tp->nr_args++; + tk->tp.nr_args++; /* Parse argument name */ arg = strchr(argv[i], '='); if (arg) { *arg++ = '\0'; - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + parg->name = kstrdup(argv[i], GFP_KERNEL); } else { arg = argv[i]; /* If argument name is omitted, set "argN" */ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - tp->args[i].name = kstrdup(buf, GFP_KERNEL); + parg->name = kstrdup(buf, GFP_KERNEL); } - if (!tp->args[i].name) { + if (!parg->name) { pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - if (!is_good_name(tp->args[i].name)) { + if (!is_good_name(parg->name)) { pr_info("Invalid argument[%d] name: %s\n", - i, tp->args[i].name); + i, parg->name); ret = -EINVAL; goto error; } - if (conflict_field_name(tp->args[i].name, tp->args, i)) { + if (traceprobe_conflict_field_name(parg->name, + tk->tp.args, i)) { pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; @@ -1310,40 +759,43 @@ static int create_trace_probe(int argc, char **argv) } /* Parse fetch argument */ - ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); + ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, + is_return, true); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; } } - ret = register_trace_probe(tp); + ret = register_trace_kprobe(tk); if (ret) goto error; return 0; error: - free_trace_probe(tp); + free_trace_kprobe(tk); return ret; } -static int release_all_trace_probes(void) +static int release_all_trace_kprobes(void) { - struct trace_probe *tp; + struct trace_kprobe *tk; int ret = 0; mutex_lock(&probe_lock); /* Ensure no probe is in use. */ - list_for_each_entry(tp, &probe_list, list) - if (trace_probe_is_enabled(tp)) { + list_for_each_entry(tk, &probe_list, list) + if (trace_probe_is_enabled(&tk->tp)) { ret = -EBUSY; goto end; } /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { - tp = list_entry(probe_list.next, struct trace_probe, list); - unregister_trace_probe(tp); - free_trace_probe(tp); + tk = list_entry(probe_list.next, struct trace_kprobe, list); + ret = unregister_trace_kprobe(tk); + if (ret) + goto end; + free_trace_kprobe(tk); } end: @@ -1371,22 +823,23 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { - struct trace_probe *tp = v; + struct trace_kprobe *tk = v; int i; - seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); + seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tk->tp.call.class->system, + ftrace_event_name(&tk->tp.call)); - if (!tp->symbol) - seq_printf(m, " 0x%p", tp->rp.kp.addr); - else if (tp->rp.kp.offset) - seq_printf(m, " %s+%u", trace_probe_symbol(tp), - tp->rp.kp.offset); + if (!tk->symbol) + seq_printf(m, " 0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + seq_printf(m, " %s+%u", trace_kprobe_symbol(tk), + tk->rp.kp.offset); else - seq_printf(m, " %s", trace_probe_symbol(tp)); + seq_printf(m, " %s", trace_kprobe_symbol(tk)); - for (i = 0; i < tp->nr_args; i++) - seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); + for (i = 0; i < tk->tp.nr_args; i++) + seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); seq_printf(m, "\n"); return 0; @@ -1404,7 +857,7 @@ static int probes_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_trace_probes(); + ret = release_all_trace_kprobes(); if (ret < 0) return ret; } @@ -1412,70 +865,11 @@ static int probes_open(struct inode *inode, struct file *file) return seq_open(file, &probes_seq_op); } -static int command_trace_probe(const char *buf) -{ - char **argv; - int argc = 0, ret = 0; - - argv = argv_split(GFP_KERNEL, buf, &argc); - if (!argv) - return -ENOMEM; - - if (argc) - ret = create_trace_probe(argc, argv); - - argv_free(argv); - return ret; -} - -#define WRITE_BUFSIZE 4096 - static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char *kbuf, *tmp; - int ret; - size_t done; - size_t size; - - kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - - ret = done = 0; - while (done < count) { - size = count - done; - if (size >= WRITE_BUFSIZE) - size = WRITE_BUFSIZE - 1; - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } - kbuf[size] = '\0'; - tmp = strchr(kbuf, '\n'); - if (tmp) { - *tmp = '\0'; - size = tmp - kbuf + 1; - } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); - ret = -EINVAL; - goto out; - } - done += size; - /* Remove comments */ - tmp = strchr(kbuf, '#'); - if (tmp) - *tmp = '\0'; - - ret = command_trace_probe(kbuf); - if (ret) - goto out; - } - ret = done; -out: - kfree(kbuf); - return ret; + return traceprobe_probes_write(file, buffer, count, ppos, + create_trace_kprobe); } static const struct file_operations kprobe_events_ops = { @@ -1490,10 +884,11 @@ static const struct file_operations kprobe_events_ops = { /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { - struct trace_probe *tp = v; + struct trace_kprobe *tk = v; - seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, - tp->rp.kp.nmissed); + seq_printf(m, " %-44s %15lu %15lu\n", + ftrace_event_name(&tk->tp.call), tk->nhit, + tk->rp.kp.nmissed); return 0; } @@ -1518,122 +913,105 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; -/* Sum up total data length for dynamic arraies (strings) */ -static __kprobes int __get_data_size(struct trace_probe *tp, - struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, - struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - /* Kprobe handler */ -static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +static nokprobe_inline void +__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, + struct ftrace_event_file *ftrace_file) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; - tp->nhit++; + WARN_ON(call != ftrace_file->event_call); + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; local_save_flags(irq_flags); pc = preempt_count(); - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, + size, irq_flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - entry->ip = (unsigned long)kp->addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + entry->ip = (unsigned long)tk->rp.kp.addr; + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); +} + +static void +kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) +{ + struct event_file_link *link; - if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kprobe_trace_func(tk, regs, link->file); } +NOKPROBE_SYMBOL(kprobe_trace_func); /* Kretprobe handler */ -static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, - struct pt_regs *regs) +static nokprobe_inline void +__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs, + struct ftrace_event_file *ftrace_file) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; + + WARN_ON(call != ftrace_file->event_call); + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; local_save_flags(irq_flags); pc = preempt_count(); - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, + size, irq_flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - entry->func = (unsigned long)tp->rp.kp.addr; + entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); +} + +static void +kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct event_file_link *link; - if (!filter_current_check_discard(buffer, call, entry, event)) - trace_nowake_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kretprobe_trace_func(tk, ri, regs, link->file); } +NOKPROBE_SYMBOL(kretprobe_trace_func); /* Event entry printers */ -enum print_line_t +static enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -1646,7 +1024,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", tp->call.name)) + if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) goto partial; if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1669,7 +1047,7 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -enum print_line_t +static enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -1682,7 +1060,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", tp->call.name)) + if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) goto partial; if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1711,31 +1089,23 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -#undef DEFINE_FIELD -#define DEFINE_FIELD(type, item, name, is_signed) \ - do { \ - ret = trace_define_field(event_call, #type, name, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed, \ - FILTER_OTHER); \ - if (ret) \ - return ret; \ - } while (0) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, FILTER_OTHER); if (ret) return ret; @@ -1747,17 +1117,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kretprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, FILTER_OTHER); if (ret) return ret; @@ -1765,182 +1137,135 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) -{ - int i; - int pos = 0; - - const char *fmt, *arg; - - if (!trace_probe_is_return(tp)) { - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; - } else { - fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; - } - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - - for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); - } - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - - for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_print_fmt(struct trace_probe *tp) -{ - int len; - char *print_fmt; - - /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0); - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1); - tp->call.print_fmt = print_fmt; - - return 0; -} - #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void kprobe_perf_func(struct kprobe *kp, - struct pt_regs *regs) +static void +kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) return; - entry->ip = (unsigned long)kp->addr; + entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } +NOKPROBE_SYMBOL(kprobe_perf_func); /* Kretprobe profile handler */ -static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, - struct pt_regs *regs) +static void +kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) return; - entry->func = (unsigned long)tp->rp.kp.addr; + entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } +NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ -static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + * + * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe + * lockless, but we can't race with this __init function. + */ +static int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { - struct trace_probe *tp = (struct trace_probe *)event->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event->data; + struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_probe(tp, TP_FLAG_TRACE); + return enable_trace_kprobe(tk, file); case TRACE_REG_UNREGISTER: - disable_trace_probe(tp, TP_FLAG_TRACE); - return 0; + return disable_trace_kprobe(tk, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_probe(tp, TP_FLAG_PROFILE); + return enable_trace_kprobe(tk, NULL); case TRACE_REG_PERF_UNREGISTER: - disable_trace_probe(tp, TP_FLAG_PROFILE); + return disable_trace_kprobe(tk, NULL); + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } return 0; } -static __kprobes -int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - if (tp->flags & TP_FLAG_TRACE) - kprobe_trace_func(kp, regs); + tk->nhit++; + + if (tk->tp.flags & TP_FLAG_TRACE) + kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kprobe_perf_func(kp, regs); + if (tk->tp.flags & TP_FLAG_PROFILE) + kprobe_perf_func(tk, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } +NOKPROBE_SYMBOL(kprobe_dispatcher); -static __kprobes -int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +static int +kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); + + tk->nhit++; - if (tp->flags & TP_FLAG_TRACE) - kretprobe_trace_func(ri, regs); + if (tk->tp.flags & TP_FLAG_TRACE) + kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kretprobe_perf_func(ri, regs); + if (tk->tp.flags & TP_FLAG_PROFILE) + kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } +NOKPROBE_SYMBOL(kretprobe_dispatcher); static struct trace_event_functions kretprobe_funcs = { .trace = print_kretprobe_event @@ -1950,21 +1275,21 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_probe_event(struct trace_probe *tp) +static int register_kprobe_event(struct trace_kprobe *tk) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; int ret; /* Initialize ftrace_event_call */ INIT_LIST_HEAD(&call->class->fields); - if (trace_probe_is_return(tp)) { + if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; } else { call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } - if (set_print_fmt(tp) < 0) + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_ftrace_event(&call->event); if (!ret) { @@ -1973,21 +1298,26 @@ static int register_probe_event(struct trace_probe *tp) } call->flags = 0; call->class->reg = kprobe_register; - call->data = tp; + call->data = tk; ret = trace_add_event_call(call); if (ret) { - pr_info("Failed to register kprobe event: %s\n", call->name); + pr_info("Failed to register kprobe event: %s\n", + ftrace_event_name(call)); kfree(call->print_fmt); unregister_ftrace_event(&call->event); } return ret; } -static void unregister_probe_event(struct trace_probe *tp) +static int unregister_kprobe_event(struct trace_kprobe *tk) { + int ret; + /* tp->event is unregistered in trace_remove_event_call() */ - trace_remove_event_call(&tp->call); - kfree(tp->call.print_fmt); + ret = trace_remove_event_call(&tk->tp.call); + if (!ret) + kfree(tk->tp.call.print_fmt); + return ret; } /* Make a debugfs interface for controlling probe points */ @@ -1996,7 +1326,7 @@ static __init int init_kprobe_trace(void) struct dentry *d_tracer; struct dentry *entry; - if (register_module_notifier(&trace_probe_module_nb)) + if (register_module_notifier(&trace_kprobe_module_nb)) return -EINVAL; d_tracer = tracing_init_dentry(); @@ -2035,44 +1365,77 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, return a1 + a2 + a3 + a4 + a5 + a6; } +static struct ftrace_event_file * +find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) + if (file->event_call == &tk->tp.call) + return file; + + return NULL; +} + +/* + * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this + * stage, we can do this lockless. + */ static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; int (*target)(int, int, int, int, int, int); - struct trace_probe *tp; + struct trace_kprobe *tk; + struct ftrace_event_file *file; + + if (tracing_is_disabled()) + return -ENODEV; target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); - ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " - "$stack $stack0 +0($stack)"); + ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " + "$stack $stack0 +0($stack)", + create_trace_kprobe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on probing function entry.\n"); + pr_warn("error on probing function entry.\n"); warn++; } else { /* Enable trace point */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting new probe.\n"); + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting new probe.\n"); warn++; - } else - enable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_kprobe(tk, file); + } } - ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " - "$retval"); + ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " + "$retval", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on probing function return.\n"); + pr_warn("error on probing function return.\n"); warn++; } else { /* Enable trace point */ - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting new probe.\n"); + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting 2nd new probe.\n"); warn++; - } else - enable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_kprobe(tk, file); + } } if (warn) @@ -2081,34 +1444,46 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); /* Disable trace points before removing it */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting test probe.\n"); + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting test probe.\n"); warn++; - } else - disable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_kprobe(tk, file); + } - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { - pr_warning("error on getting 2nd test probe.\n"); + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting 2nd test probe.\n"); warn++; - } else - disable_trace_probe(tp, TP_FLAG_TRACE); + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_kprobe(tk, file); + } - ret = command_trace_probe("-:testprobe"); + ret = traceprobe_command("-:testprobe", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on deleting a probe.\n"); + pr_warn("error on deleting a probe.\n"); warn++; } - ret = command_trace_probe("-:testprobe2"); + ret = traceprobe_command("-:testprobe2", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { - pr_warning("error on deleting a probe.\n"); + pr_warn("error on deleting a probe.\n"); warn++; } end: - release_all_trace_probes(); + release_all_trace_kprobes(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fd3c8aae55e..0abd9b86347 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr) overrun_detected = false; prev_overruns = 0; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } static int mmio_trace_init(struct trace_array *tr) @@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) if (drv) ret += trace_seq_printf(s, " %s\n", drv->name); else - ret += trace_seq_printf(s, " \n"); + ret += trace_seq_puts(s, " \n"); return ret; } @@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter) struct header_iter *hiter; struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION 20070824\n"); + trace_seq_puts(s, "VERSION 20070824\n"); hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); if (!hiter) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { unsigned long cnt = atomic_xchg(&dropped_count, 0); - unsigned long over = ring_buffer_overruns(iter->tr->buffer); + unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer); if (over > prev_overruns) cnt += over - prev_overruns; @@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) (rw->value >> 0) & 0xff, rw->pc, 0); break; default: - ret = trace_seq_printf(s, "rw what?\n"); + ret = trace_seq_puts(s, "rw what?\n"); break; } if (ret) @@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) secs, usec_rem, m->map_id, 0UL, 0); break; default: - ret = trace_seq_printf(s, "map what?\n"); + ret = trace_seq_puts(s, "map what?\n"); break; } if (ret) @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct mmiotrace_rw *rw) { struct ftrace_event_call *call = &event_mmiotrace_rw; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); @@ -323,14 +323,14 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = tr->data[smp_processor_id()]; + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_rw(tr, data, rw); } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct mmiotrace_map *map) { struct ftrace_event_call *call = &event_mmiotrace_map; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); @@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map) struct trace_array_cpu *data; preempt_disable(); - data = tr->data[smp_processor_id()]; + data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_map(tr, data, map); preempt_enable(); } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 394f94417e2..fcf0a9e4891 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr) * If you don't implement it, then the flag setting will be * automatically accepted. */ -static int nop_set_flag(u32 old_flags, u32 bit, int set) +static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* * Note that you don't need to update nop_flags.val yourself. @@ -91,11 +91,11 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif .flags = &nop_flags, - .set_flag = nop_set_flag + .set_flag = nop_set_flag, + .allow_instances = true, }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff355594..f3dad80c20b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) return ret; } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bputs_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_puts(s, field->str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -62,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_puts(s, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -110,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) EXPORT_SYMBOL_GPL(trace_seq_printf); /** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * + * Writes a ASCII representation of a bitmask string into @s. + */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string @@ -264,7 +308,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) return ret; } -int trace_seq_path(struct trace_seq *s, struct path *path) +int trace_seq_path(struct trace_seq *s, const struct path *path) { unsigned char *p; @@ -300,7 +344,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long mask; const char *str; const char *ret = p->buffer + p->len; - int i; + int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -310,14 +354,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, str = flag_array[i].name; flags &= ~mask; - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); + else + first = 0; trace_seq_puts(p, str); } /* check for left over flags */ if (flags) { - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); trace_seq_printf(p, "0x%lx", flags); } @@ -344,7 +390,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%lx", val); trace_seq_putc(p, 0); @@ -370,7 +416,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); @@ -381,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); #endif const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = p->buffer + p->len; + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + +const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { int i; @@ -395,6 +454,63 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +int ftrace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *trace_event) +{ + struct ftrace_event_call *event; + struct trace_seq *s = &iter->seq; + struct trace_seq *p = &iter->tmp_seq; + struct trace_entry *entry; + int ret; + + event = container_of(trace_event, struct ftrace_event_call, event); + entry = iter->ent; + + if (entry->type != event->event.type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + + trace_seq_init(p); + ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} +EXPORT_SYMBOL(ftrace_raw_output_prep); + +static int ftrace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) +{ + struct trace_seq *s = &iter->seq; + int ret; + + ret = trace_seq_printf(s, "%s: ", name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_vprintf(s, fmt, ap); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = ftrace_output_raw(iter, name, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_output_call); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { @@ -514,14 +630,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ret) ret = trace_seq_puts(s, "??"); if (ret) - ret = trace_seq_puts(s, "\n"); + ret = trace_seq_putc(s, '\n'); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); - ret = trace_seq_puts(s, "\n"); + ret = trace_seq_putc(s, '\n'); } if (mm) @@ -535,7 +651,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) int ret; if (!ip) - return trace_seq_printf(s, "0"); + return trace_seq_putc(s, '0'); if (sym_flags & TRACE_ITER_SYM_OFFSET) ret = seq_print_sym_offset(s, "%s", ip); @@ -574,8 +690,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; - need_resched = - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + + switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | + TRACE_FLAG_PREEMPT_RESCHED)) { + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'N'; + break; + case TRACE_FLAG_NEED_RESCHED: + need_resched = 'n'; + break; + case TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'p'; + break; + default: + need_resched = '.'; + break; + } + hardsoft_irq = (hardirq && softirq) ? 'H' : hardirq ? 'h' : @@ -608,24 +739,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) return trace_print_lat_fmt(s, entry); } -static unsigned long preempt_mark_thresh = 100; +static unsigned long preempt_mark_thresh_us = 100; static int -lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, - unsigned long rel_usecs) +lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { - return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, - rel_usecs > preempt_mark_thresh ? '!' : - rel_usecs > 1 ? '+' : ' '); + unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; + unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; + unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; + unsigned long long rel_ts = next_ts - iter->ts; + struct trace_seq *s = &iter->seq; + + if (in_ns) { + abs_ts = ns2usecs(abs_ts); + rel_ts = ns2usecs(rel_ts); + } + + if (verbose && in_ns) { + unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); + unsigned long abs_msec = (unsigned long)abs_ts; + unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); + unsigned long rel_msec = (unsigned long)rel_ts; + + return trace_seq_printf( + s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", + ns2usecs(iter->ts), + abs_msec, abs_usec, + rel_msec, rel_usec); + } else if (verbose && !in_ns) { + return trace_seq_printf( + s, "[%016llx] %lld (+%lld): ", + iter->ts, abs_ts, rel_ts); + } else if (!verbose && in_ns) { + return trace_seq_printf( + s, " %4lldus%c: ", + abs_ts, + rel_ts > preempt_mark_thresh_us ? '!' : + rel_ts > 1 ? '+' : ' '); + } else { /* !verbose && !in_ns */ + return trace_seq_printf(s, " %4lld: ", abs_ts); + } } int trace_print_context(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, USEC_PER_SEC); - unsigned long secs = (unsigned long)t; + unsigned long long t; + unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; int ret; @@ -642,46 +803,49 @@ int trace_print_context(struct trace_iterator *iter) return 0; } - return trace_seq_printf(s, " %5lu.%06lu: ", - secs, usec_rem); + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(iter->ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); + } else + return trace_seq_printf(s, " %12llu: ", iter->ts); } int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; int ret; + /* trace_find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, &next_ts); unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); - unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); - unsigned long rel_usecs; + + /* Restore the original ent_size */ + iter->ent_size = ent_size; if (!next_entry) next_ts = iter->ts; - rel_usecs = ns2usecs(next_ts - iter->ts); if (verbose) { char comm[TASK_COMM_LEN]; trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" - " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx, - ns2usecs(iter->ts), - abs_usecs / USEC_PER_MSEC, - abs_usecs % USEC_PER_MSEC, - rel_usecs / USEC_PER_MSEC, - rel_usecs % USEC_PER_MSEC); + ret = trace_seq_printf( + s, "%16s %5d %3d %d %08x %08lx ", + comm, entry->pid, iter->cpu, entry->flags, + entry->preempt_count, iter->idx); } else { ret = lat_print_generic(s, entry, iter->cpu); - if (ret) - ret = lat_print_timestamp(s, abs_usecs, rel_usecs); } + if (ret) + ret = lat_print_timestamp(iter, next_ts); + return ret; } @@ -704,12 +868,11 @@ static int task_state_char(unsigned long state) struct trace_event *ftrace_find_event(int type) { struct trace_event *event; - struct hlist_node *n; unsigned key; key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry(event, n, &event_hash[key], node) { + hlist_for_each_entry(event, &event_hash[key], node) { if (event->type == type) return event; } @@ -749,12 +912,12 @@ static int trace_search_list(struct list_head **list) void trace_event_read_lock(void) { - down_read(&trace_event_mutex); + down_read(&trace_event_sem); } void trace_event_read_unlock(void) { - up_read(&trace_event_mutex); + up_read(&trace_event_sem); } /** @@ -777,7 +940,7 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); if (WARN_ON(!event)) goto out; @@ -832,14 +995,14 @@ int register_ftrace_event(struct trace_event *event) ret = event->type; out: - up_write(&trace_event_mutex); + up_write(&trace_event_sem); return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); /* - * Used by module code with the trace_event_mutex held for write. + * Used by module code with the trace_event_sem held for write. */ int __unregister_ftrace_event(struct trace_event *event) { @@ -854,9 +1017,9 @@ int __unregister_ftrace_event(struct trace_event *event) */ int unregister_ftrace_event(struct trace_event *event) { - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __unregister_ftrace_event(event); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); return 0; } @@ -888,14 +1051,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, goto partial; if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - if (!trace_seq_printf(s, " <-")) + if (!trace_seq_puts(s, " <-")) goto partial; if (!seq_print_ip_sym(s, field->parent_ip, flags)) goto partial; } - if (!trace_seq_printf(s, "\n")) + if (!trace_seq_putc(s, '\n')) goto partial; return TRACE_TYPE_HANDLED; @@ -1134,7 +1297,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, if (!seq_print_ip_sym(s, *p, flags)) goto partial; - if (!trace_seq_puts(s, "\n")) + if (!trace_seq_putc(s, '\n')) goto partial; } @@ -1183,6 +1346,64 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bputs_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { + .trace = trace_bputs_print, + .raw = trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { + .type = TRACE_BPUTS, + .funcs = &trace_bputs_funcs, +}; + /* TRACE_BPRINT */ static enum print_line_t trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1295,6 +1516,7 @@ static struct trace_event *events[] __initdata = { &trace_wake_event, &trace_stack_event, &trace_user_stack_event, + &trace_bputs_event, &trace_bprint_event, &trace_print_event, NULL @@ -1318,4 +1540,4 @@ __init static int init_events(void) return 0; } -device_initcall(init_events); +early_initcall(init_events); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492..127a9d8c835 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -5,6 +5,8 @@ #include "trace.h" extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); @@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); -extern struct rw_semaphore trace_event_mutex; +extern struct rw_semaphore trace_event_sem; #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f..2900817ba65 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) const char **iter; char *fmt; + /* allocate the trace_printk per cpu buffers */ + if (start != end) + trace_printk_init_buffers(); + mutex_lock(&btrace_mutex); for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); @@ -240,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos) { const char **fmt = v; int start_index; + int last_index; start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; if (*pos < start_index) return __start___trace_bprintk_fmt + *pos; + /* + * The __tracepoint_str section is treated the same as the + * __trace_printk_fmt section. The difference is that the + * __trace_printk_fmt section should only be used by trace_printk() + * in a debugging environment, as if anything exists in that section + * the trace_prink() helper buffers are allocated, which would just + * waste space in a production environment. + * + * The __tracepoint_str sections on the other hand are used by + * tracepoints which need to map pointers to their strings to + * the ASCII text for userspace. + */ + last_index = start_index; + start_index = __stop___tracepoint_str - __start___tracepoint_str; + + if (*pos < last_index + start_index) + return __start___tracepoint_str + (*pos - last_index); + return find_next_mod_format(start_index, v, fmt, pos); } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c new file mode 100644 index 00000000000..d4b9fc22cd2 --- /dev/null +++ b/kernel/trace/trace_probe.c @@ -0,0 +1,726 @@ +/* + * Common code for probe-based Dynamic events. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author: Srikar Dronamraju + */ + +#include "trace_probe.h" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + FIELD_STRING_IP, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent) \ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ +} \ +const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") + +/* Print type function for string type */ +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, + void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + return trace_seq_printf(s, " %s=(fault)", name); + else + return trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); +} +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); + +const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); +DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +#define DEFINE_FETCH_retval(type) \ +void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); +DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; + long offset; + fetch_func_t fetch; + fetch_func_t fetch_size; +}; + +#define DEFINE_FETCH_deref(type) \ +void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + dprm->fetch(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); +DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) + +void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + struct deref_fetch_param *dprm = data; + unsigned long addr; + + call_fetch(&dprm->orig, regs, &addr); + if (addr && dprm->fetch_size) { + addr += dprm->offset; + dprm->fetch_size(regs, (void *)addr, dest); + } else + *(string_size *)dest = 0; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); + +static void update_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} +NOKPROBE_SYMBOL(update_deref_fetch_param); + +static void free_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} +NOKPROBE_SYMBOL(free_deref_fetch_param); + +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static void +update_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + +static void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + + kfree(data); +} + +static const struct fetch_type *find_fetch_type(const char *type, + const struct fetch_type *ftbl) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + + type = strchr(type, '/'); + if (!type) + goto fail; + + type++; + if (kstrtoul(type, 0, &bs)) + goto fail; + + switch (bs) { + case 8: + return find_fetch_type("u8", ftbl); + case 16: + return find_fetch_type("u16", ftbl); + case 32: + return find_fetch_type("u32", ftbl); + case 64: + return find_fetch_type("u64", ftbl); + default: + goto fail; + } + } + + for (i = 0; ftbl[i].name; i++) { + if (strcmp(type, ftbl[i].name) == 0) + return &ftbl[i]; + } + +fail: + return NULL; +} + +/* Special function : only accept unsigned long */ +static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} +NOKPROBE_SYMBOL(fetch_kernel_stack_address); + +static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) +{ + *(unsigned long *)dest = user_stack_pointer(regs); +} +NOKPROBE_SYMBOL(fetch_user_stack_address); + +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn, + const struct fetch_type *ftbl) +{ + int i; + + if (type != &ftbl[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + + return NULL; +} + +/* Split symbol and offset. */ +int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (tmp) { + /* skip sign because kstrtoul doesn't accept '+' */ + ret = kstrtoul(tmp + 1, 0, offset); + if (ret) + return ret; + + *tmp = '\0'; + } else + *offset = 0; + + return 0; +} + +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, bool is_return, + bool is_kprobe) +{ + int ret = 0; + unsigned long param; + + if (strcmp(arg, "retval") == 0) { + if (is_return) + f->fn = t->fetch[FETCH_MTD_retval]; + else + ret = -EINVAL; + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) + return -EINVAL; + + if (is_kprobe) + f->fn = fetch_kernel_stack_address; + else + f->fn = fetch_user_stack_address; + } else if (isdigit(arg[5])) { + ret = kstrtoul(arg + 5, 10, ¶m); + if (ret || (is_kprobe && param > PARAM_MAX_STACK)) + ret = -EINVAL; + else { + f->fn = t->fetch[FETCH_MTD_stack]; + f->data = (void *)param; + } + } else + ret = -EINVAL; + } else + ret = -EINVAL; + + return ret; +} + +/* Recursive argument parser */ +static int parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, bool is_return, bool is_kprobe) +{ + const struct fetch_type *ftbl; + unsigned long param; + long offset; + char *tmp; + int ret = 0; + + ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; + BUG_ON(ftbl == NULL); + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); + break; + + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + f->fn = t->fetch[FETCH_MTD_reg]; + f->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + + case '@': /* memory, file-offset or symbol */ + if (isdigit(arg[1])) { + ret = kstrtoul(arg + 1, 0, ¶m); + if (ret) + break; + + f->fn = t->fetch[FETCH_MTD_memory]; + f->data = (void *)param; + } else if (arg[1] == '+') { + /* kprobes don't support file offsets */ + if (is_kprobe) + return -EINVAL; + + ret = kstrtol(arg + 2, 0, &offset); + if (ret) + break; + + f->fn = t->fetch[FETCH_MTD_file_offset]; + f->data = (void *)offset; + } else { + /* uprobes don't support symbols */ + if (!is_kprobe) + return -EINVAL; + + ret = traceprobe_split_symbol_offset(arg + 1, &offset); + if (ret) + break; + + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->fetch[FETCH_MTD_symbol]; + } + break; + + case '+': /* deref memory */ + arg++; /* Skip '+', because kstrtol() rejects it. */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) + break; + + *tmp = '\0'; + ret = kstrtol(arg, 0, &offset); + + if (ret) + break; + + arg = tmp + 1; + tmp = strrchr(arg, ')'); + + if (tmp) { + struct deref_fetch_param *dprm; + const struct fetch_type *t2; + + t2 = find_fetch_type(NULL, ftbl); + *tmp = '\0'; + dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); + + if (!dprm) + return -ENOMEM; + + dprm->offset = offset; + dprm->fetch = t->fetch[FETCH_MTD_memory]; + dprm->fetch_size = get_fetch_size_function(t, + dprm->fetch, ftbl); + ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, + is_kprobe); + if (ret) + kfree(dprm); + else { + f->fn = t->fetch[FETCH_MTD_deref]; + f->data = (void *)dprm; + } + } + break; + } + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", t->name); + ret = -EINVAL; + } + + return ret; +} + +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; +} + +/* String length checking wrapper */ +int traceprobe_parse_probe_arg(char *arg, ssize_t *size, + struct probe_arg *parg, bool is_return, bool is_kprobe) +{ + const struct fetch_type *ftbl; + const char *t; + int ret; + + ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; + BUG_ON(ftbl == NULL); + + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument is too long.: %s\n", arg); + return -ENOSPC; + } + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t, ftbl); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = *size; + *size += parg->type->size; + ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); + + if (ret >= 0 && t != NULL) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); + + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn, + ftbl); + parg->fetch_size.data = parg->fetch.data; + } + + return ret; +} + +/* Return 1 if name is reserved or already used by another argument */ +int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + + return 0; +} + +void traceprobe_update_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + update_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + update_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + update_symbol_cache(arg->fetch.data); +} + +void traceprobe_free_probe_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + free_symbol_cache(arg->fetch.data); + + kfree(arg->name); + kfree(arg->comm); +} + +int traceprobe_command(const char *buf, int (*createfn)(int, char **)) +{ + char **argv; + int argc, ret; + + argc = 0; + ret = 0; + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = createfn(argc, argv); + + argv_free(argv); + + return ret; +} + +#define WRITE_BUFSIZE 4096 + +ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos, + int (*createfn)(int, char **)) +{ + char *kbuf, *tmp; + int ret = 0; + size_t done = 0; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + while (done < count) { + size = count - done; + + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + + if (tmp) + *tmp = '\0'; + + ret = traceprobe_command(kbuf, createfn); + if (ret) + goto out; + } + ret = done; + +out: + kfree(kbuf); + + return ret; +} + +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, + bool is_return) +{ + int i; + int pos = 0; + + const char *fmt, *arg; + + if (!is_return) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tp->nr_args; i++) { + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +int set_print_fmt(struct trace_probe *tp, bool is_return) +{ + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0, is_return); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1, is_return); + tp->call.print_fmt = print_fmt; + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h new file mode 100644 index 00000000000..4f815fbce16 --- /dev/null +++ b/kernel/trace/trace_probe.h @@ -0,0 +1,400 @@ +/* + * Common header file for probe-based Dynamic events. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This code was copied from kernel/trace/trace_kprobe.h written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author: Srikar Dronamraju + */ + +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> +#include <linux/kprobes.h> +#include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> +#include <asm/bitsperlong.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 +#define TP_FLAG_REGISTERED 4 + + +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from data_rloc itself, but data_loc + * stores the offset from event entry. + */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) + +static nokprobe_inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); + +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_bitfield, + FETCH_MTD_file_offset, + FETCH_MTD_END, +}; + +/* Fetch type information table */ +struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ + /* Fetch functions */ + fetch_func_t fetch[FETCH_MTD_END]; +}; + +struct fetch_param { + fetch_func_t fn; + void *data; +}; + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent); \ +extern const char PRINT_TYPE_FMT_NAME(type)[] + +DECLARE_BASIC_PRINT_TYPE_FUNC(u8); +DECLARE_BASIC_PRINT_TYPE_FUNC(u16); +DECLARE_BASIC_PRINT_TYPE_FUNC(u32); +DECLARE_BASIC_PRINT_TYPE_FUNC(u64); +DECLARE_BASIC_PRINT_TYPE_FUNC(s8); +DECLARE_BASIC_PRINT_TYPE_FUNC(s16); +DECLARE_BASIC_PRINT_TYPE_FUNC(s32); +DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(string); + +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type + +/* Declare macro for basic types */ +#define DECLARE_FETCH_FUNC(method, type) \ +extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ + void *data, void *dest) + +#define DECLARE_BASIC_FETCH_FUNCS(method) \ +DECLARE_FETCH_FUNC(method, u8); \ +DECLARE_FETCH_FUNC(method, u16); \ +DECLARE_FETCH_FUNC(method, u32); \ +DECLARE_FETCH_FUNC(method, u64) + +DECLARE_BASIC_FETCH_FUNCS(reg); +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(retval); +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(symbol); +DECLARE_FETCH_FUNC(symbol, string); +DECLARE_FETCH_FUNC(symbol, string_size); + +DECLARE_BASIC_FETCH_FUNCS(deref); +DECLARE_FETCH_FUNC(deref, string); +DECLARE_FETCH_FUNC(deref, string_size); + +DECLARE_BASIC_FETCH_FUNCS(bitfield); +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ +ASSIGN_FETCH_FUNC(file_offset, ftype), \ + } \ + } + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define ASSIGN_FETCH_TYPE_END {} + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + +/* + * Fetch type information table. + * It's declared as a weak symbol due to conditional compilation. + */ +extern __weak const struct fetch_type kprobes_fetch_type_table[]; +extern __weak const struct fetch_type uprobes_fetch_type_table[]; + +#ifdef CONFIG_KPROBE_EVENT +struct symbol_cache; +unsigned long update_symbol_cache(struct symbol_cache *sc); +void free_symbol_cache(struct symbol_cache *sc); +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +#else +/* uprobes do not support symbol fetch methods */ +#define fetch_symbol_u8 NULL +#define fetch_symbol_u16 NULL +#define fetch_symbol_u32 NULL +#define fetch_symbol_u64 NULL +#define fetch_symbol_string NULL +#define fetch_symbol_string_size NULL + +struct symbol_cache { +}; +static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) +{ + return 0; +} + +static inline void __used free_symbol_cache(struct symbol_cache *sc) +{ +} + +static inline struct symbol_cache * __used +alloc_symbol_cache(const char *sym, long offset) +{ + return NULL; +} +#endif /* CONFIG_KPROBE_EVENT */ + +struct probe_arg { + struct fetch_param fetch; + struct fetch_param fetch_size; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ +}; + +struct trace_probe { + unsigned int flags; /* For TP_FLAG_* */ + struct ftrace_event_class class; + struct ftrace_event_call call; + struct list_head files; + ssize_t size; /* trace entry size */ + unsigned int nr_args; + struct probe_arg args[]; +}; + +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + +static inline bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static inline bool trace_probe_is_registered(struct trace_probe *tp) +{ + return !!(tp->flags & TP_FLAG_REGISTERED); +} + +static nokprobe_inline void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) +{ + return fprm->fn(regs, fprm->data, dest); +} + +/* Check the name is good for event/group/fields */ +static inline int is_good_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return 0; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return 0; + } + return 1; +} + +static inline struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +{ + struct event_file_link *link; + + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; + + return NULL; +} + +extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, + struct probe_arg *parg, bool is_return, bool is_kprobe); + +extern int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg); + +extern void traceprobe_update_arg(struct probe_arg *arg); +extern void traceprobe_free_probe_arg(struct probe_arg *arg); + +extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); + +extern ssize_t traceprobe_probes_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos, + int (*createfn)(int, char**)); + +extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); + +/* Sum up total data length for dynamic arraies (strings) */ +static nokprobe_inline int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static nokprobe_inline void +store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + +extern int set_print_fmt(struct trace_probe *tp, bool is_return); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 7e62c0a1845..3f34dc9b40f 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags, int pc) { struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, sizeof(*entry), flags, pc); @@ -101,10 +101,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - if (!filter_check_discard(call, entry, buffer, event)) - ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr->buffer, flags, 6, pc); - ftrace_trace_userstack(tr->buffer, flags, pc); + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); } static void @@ -125,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ff791ea48b5..19bd8928ce9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -7,7 +7,7 @@ * Based on code from the latency_tracer, that is: * * Copyright (C) 2004-2006 Ingo Molnar - * Copyright (C) 2004 William Lee Irwin III + * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/module.h> #include <linux/fs.h> @@ -15,8 +15,9 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <trace/events/sched.h> - #include "trace.h" static struct trace_array *wakeup_trace; @@ -27,6 +28,8 @@ static int wakeup_cpu; static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; +static int wakeup_dl; +static int tracing_dl = 0; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -36,7 +39,8 @@ static void __wakeup_reset(struct trace_array *tr); static int wakeup_graph_entry(struct ftrace_graph_ent *trace); static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; +static bool function_enabled; #define TRACE_DISPLAY_GRAPH 1 @@ -89,7 +93,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (cpu != wakeup_current_cpu) goto out_enable; - *data = tr->data[cpu]; + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; @@ -108,7 +112,8 @@ out_enable: * wakeup uses its own tracer function to keep the overhead down: */ static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; @@ -125,23 +130,64 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) atomic_dec(&data->disabled); preempt_enable_notrace(); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, -}; #endif /* CONFIG_FUNCTION_TRACER */ -static int start_func_tracer(int graph) +static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; - if (!graph) - ret = register_ftrace_function(&trace_ops); - else + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); + else + ret = register_ftrace_function(tr->ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_wakeup_function(struct trace_array *tr, int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(tr->ops); + + function_enabled = false; +} + +static void wakeup_function_set(struct trace_array *tr, int set) +{ + if (set) + register_wakeup_function(tr, is_graph(), 1); + else + unregister_wakeup_function(tr, is_graph()); +} + +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +{ + struct tracer *tracer = tr->current_trace; + + if (mask & TRACE_ITER_FUNCTION) + wakeup_function_set(tr, set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_wakeup_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -151,18 +197,16 @@ static int start_func_tracer(int graph) return ret; } -static void stop_func_tracer(int graph) +static void stop_func_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); + unregister_wakeup_function(tr, graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (!(bit & TRACE_DISPLAY_GRAPH)) @@ -171,12 +215,12 @@ static int wakeup_set_flag(u32 old_flags, u32 bit, int set) if (!(is_graph() ^ set)) return 0; - stop_func_tracer(!set); + stop_func_tracer(tr, !set); wakeup_reset(wakeup_trace); - tracing_max_latency = 0; + tr->max_latency = 0; - return start_func_tracer(set); + return start_func_tracer(tr, set); } static int wakeup_graph_entry(struct ftrace_graph_ent *trace) @@ -264,7 +308,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } @@ -299,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -352,7 +397,7 @@ probe_wakeup_sched_switch(void *ignore, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -364,7 +409,7 @@ probe_wakeup_sched_switch(void *ignore, goto out_unlock; /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -373,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore, T1 = ftrace_now(cpu); delta = T1-T0; - if (!report_latency(delta)) + if (!report_latency(wakeup_trace, delta)) goto out_unlock; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + wakeup_trace->max_latency = delta; update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); } @@ -386,13 +431,14 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) { wakeup_cpu = -1; wakeup_prio = -1; + tracing_dl = 0; if (wakeup_task) put_task_struct(wakeup_task); @@ -404,7 +450,7 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); local_irq_save(flags); arch_spin_lock(&wakeup_lock); @@ -428,13 +474,21 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); - if ((wakeup_rt && !rt_task(p)) || - p->prio >= wakeup_prio || - p->prio >= current->prio) + /* + * Semantic is like this: + * - wakeup tracer handles all tasks in the system, independently + * from their scheduling class; + * - wakeup_rt tracer handles tasks belonging to sched_dl and + * sched_rt class; + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || + (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; pc = preempt_count(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -442,7 +496,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) arch_spin_lock(&wakeup_lock); /* check for races. */ - if (!tracer_enabled || p->prio >= wakeup_prio) + if (!tracer_enabled || tracing_dl || + (!dl_task(p) && p->prio >= wakeup_prio)) goto out_locked; /* reset the trace */ @@ -452,12 +507,21 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; + /* + * Once you start tracing a -deadline task, don't bother tracing + * another task until the first one wakes up. + */ + if (dl_task(p)) + tracing_dl = 1; + else + tracing_dl = 0; + wakeup_task = p; get_task_struct(wakeup_task); local_save_flags(flags); - data = wakeup_trace->data[wakeup_cpu]; + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -471,7 +535,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) @@ -517,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(is_graph())) + if (start_func_tracer(tr, is_graph())) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -530,44 +594,75 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(is_graph()); + stop_func_tracer(tr, is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } +static bool wakeup_busy; + static int __wakeup_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; - tracing_max_latency = 0; + /* non overwrite screws up the latency tracers */ + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + + tr->max_latency = 0; wakeup_trace = tr; + ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); + + wakeup_busy = true; return 0; } static int wakeup_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); } static int wakeup_rt_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); } +static int wakeup_dl_tracer_init(struct trace_array *tr) +{ + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = 1; + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + static void wakeup_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + wakeup_busy = false; } static void wakeup_tracer_start(struct trace_array *tr) @@ -588,17 +683,19 @@ static struct tracer wakeup_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .print_max = 1, + .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif .open = wakeup_trace_open, .close = wakeup_trace_close, - .use_max_tr = 1, + .allow_instances = true, + .use_max_tr = true, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -608,18 +705,40 @@ static struct tracer wakeup_rt_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, - .print_max = 1, + .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif .open = wakeup_trace_open, .close = wakeup_trace_close, - .use_max_tr = 1, + .allow_instances = true, + .use_max_tr = true, +}; + +static struct tracer wakeup_dl_tracer __read_mostly = +{ + .name = "wakeup_dl", + .init = wakeup_dl_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = true, }; __init static int init_wakeup_tracer(void) @@ -634,6 +753,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_dl_tracer); + if (ret) + return ret; + return 0; } -device_initcall(init_wakeup_tracer); +core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 288541f977f..5ef60499dc8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry) return 0; } -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) { struct ring_buffer_event *event; struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { + while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* @@ -58,16 +58,16 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) * Test the trace buffer to see if all the elements * are still sane. */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) { unsigned long flags, cnt = 0; int cpu, ret = 0; /* Don't allow flipping of max traces now */ local_irq_save(flags); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&buf->tr->max_lock); - cnt = ring_buffer_entries(tr->buffer); + cnt = ring_buffer_entries(buf->buffer); /* * The trace_test_buffer_cpu runs a while loop to consume all data. @@ -78,12 +78,12 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) */ tracing_off(); for_each_possible_cpu(cpu) { - ret = trace_test_buffer_cpu(tr, cpu); + ret = trace_test_buffer_cpu(buf, cpu); if (ret) break; } tracing_on(); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&buf->tr->max_lock); local_irq_restore(flags); if (count) @@ -103,54 +103,62 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe1_cnt++; } static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe2_cnt++; } static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe3_cnt++; } static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_global_cnt++; } static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_dyn_cnt++; } static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, -}; - -static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) @@ -172,7 +180,7 @@ static void reset_counts(void) trace_selftest_test_dyn_cnt = 0; } -static int trace_selftest_ops(int cnt) +static int trace_selftest_ops(struct trace_array *tr, int cnt) { int save_ftrace_enabled = ftrace_enabled; struct ftrace_ops *dyn_ops; @@ -207,7 +215,11 @@ static int trace_selftest_ops(int cnt) register_ftrace_function(&test_probe1); register_ftrace_function(&test_probe2); register_ftrace_function(&test_probe3); - register_ftrace_function(&test_global); + /* First time we are running with main function */ + if (cnt > 1) { + ftrace_init_array_ops(tr, trace_selftest_test_global_func); + register_ftrace_function(tr->ops); + } DYN_FTRACE_TEST_NAME(); @@ -219,8 +231,10 @@ static int trace_selftest_ops(int cnt) goto out; if (trace_selftest_test_probe3_cnt != 1) goto out; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } DYN_FTRACE_TEST_NAME2(); @@ -256,8 +270,10 @@ static int trace_selftest_ops(int cnt) goto out_free; if (trace_selftest_test_probe3_cnt != 3) goto out_free; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } if (trace_selftest_test_dyn_cnt == 0) goto out_free; @@ -282,7 +298,9 @@ static int trace_selftest_ops(int cnt) unregister_ftrace_function(&test_probe1); unregister_ftrace_function(&test_probe2); unregister_ftrace_function(&test_probe3); - unregister_ftrace_function(&test_global); + if (cnt > 1) + unregister_ftrace_function(tr->ops); + ftrace_reset_array_ops(tr); /* Make sure everything is off */ reset_counts(); @@ -302,12 +320,11 @@ static int trace_selftest_ops(int cnt) } /* Test dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, - struct trace_array *tr, - int (*func)(void)) +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) { int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; unsigned long count; char *func_name; int ret; @@ -318,7 +335,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* enable tracing, and record the filter function */ ftrace_enabled = 1; - tracer_enabled = 1; /* passed in by parameter to fool gcc from optimizing */ func(); @@ -344,7 +360,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* we should have nothing in the buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); if (ret) goto out; @@ -365,7 +381,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); tracing_start(); /* we should only have one item */ @@ -377,45 +393,277 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, } /* Test the ops with global tracing running */ - ret = trace_selftest_ops(1); + ret = trace_selftest_ops(tr, 1); trace->reset(tr); out: ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; /* Enable tracing on all functions again */ ftrace_set_global_filter(NULL, 0, 1); /* Test the ops with global tracing off */ if (!ret) - ret = trace_selftest_ops(2); + ret = trace_selftest_ops(tr, 2); + + return ret; +} + +static int trace_selftest_recursion_cnt; +static void trace_selftest_test_recursion_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * This function is registered without the recursion safe flag. + * The ftrace infrastructure should provide the recursion + * protection. If not, this will crash the kernel! + */ + if (trace_selftest_recursion_cnt++ > 10) + return; + DYN_FTRACE_TEST_NAME(); +} + +static void trace_selftest_test_recursion_safe_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * We said we would provide our own recursion. By calling + * this function again, we should recurse back into this function + * and count again. But this only happens if the arch supports + * all of ftrace features and nothing else is using the function + * tracing utility. + */ + if (trace_selftest_recursion_cnt++) + return; + DYN_FTRACE_TEST_NAME(); +} + +static struct ftrace_ops test_rec_probe = { + .func = trace_selftest_test_recursion_func, +}; + +static struct ftrace_ops test_recsafe_probe = { + .func = trace_selftest_test_recursion_safe_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static int +trace_selftest_function_recursion(void) +{ + int save_ftrace_enabled = ftrace_enabled; + char *func_name; + int len; + int ret; + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion: "); + + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_rec_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_rec_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 1) { + pr_cont("*callback not called once (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + trace_selftest_recursion_cnt = 1; + + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion safe: "); + + ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_recsafe_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_recsafe_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called expected 2 times (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; return ret; } #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +# define trace_selftest_function_recursion() ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ +static enum { + TRACE_SELFTEST_REGS_START, + TRACE_SELFTEST_REGS_FOUND, + TRACE_SELFTEST_REGS_NOT_FOUND, +} trace_selftest_regs_stat; + +static void trace_selftest_test_regs_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + if (pt_regs) + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; + else + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; +} + +static struct ftrace_ops test_regs_probe = { + .func = trace_selftest_test_regs_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, +}; + +static int +trace_selftest_function_regs(void) +{ + int save_ftrace_enabled = ftrace_enabled; + char *func_name; + int len; + int ret; + int supported = 0; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + supported = 1; +#endif + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace regs%s: ", + !supported ? "(no arch support)" : ""); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); + /* + * If DYNAMIC_FTRACE is not set, then we just trace all functions. + * This test really doesn't care. + */ + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_regs_probe); + /* + * Now if the arch does not support passing regs, then this should + * have failed. + */ + if (!supported) { + if (!ret) { + pr_cont("*registered save-regs without arch support* "); + goto out; + } + test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; + ret = register_ftrace_function(&test_regs_probe); + } + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_regs_probe); + + ret = -1; + + switch (trace_selftest_regs_stat) { + case TRACE_SELFTEST_REGS_START: + pr_cont("*callback never called* "); + goto out; + + case TRACE_SELFTEST_REGS_FOUND: + if (supported) + break; + pr_cont("*callback received regs without arch support* "); + goto out; + + case TRACE_SELFTEST_REGS_NOT_FOUND: + if (!supported) + break; + pr_cont("*callback received NULL regs* "); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace * buffer to see if all is in order. */ -int +__init int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { int save_ftrace_enabled = ftrace_enabled; - int save_tracer_enabled = tracer_enabled; unsigned long count; int ret; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + /* make sure msleep has been recorded */ msleep(1); /* start the tracing */ ftrace_enabled = 1; - tracer_enabled = 1; ret = tracer_init(trace, tr); if (ret) { @@ -430,7 +678,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -442,10 +690,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ret = trace_selftest_startup_dynamic_tracing(trace, tr, DYN_FTRACE_TEST_NAME); + if (ret) + goto out; + + ret = trace_selftest_function_recursion(); + if (ret) + goto out; + ret = trace_selftest_function_regs(); out: ftrace_enabled = save_ftrace_enabled; - tracer_enabled = save_tracer_enabled; /* kill ftrace totally if we failed */ if (ret) @@ -461,8 +715,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -472,8 +724,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) - __ftrace_dump(false, DUMP_ALL); + if (ftrace_dump_on_oops) { + ftrace_dump(DUMP_ALL); + /* ftrace_dump() disables tracing */ + tracing_on(); + } return 0; } @@ -484,18 +739,25 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) * Pretty much the same than for the function tracer from which the selftest * has been borrowed. */ -int +__init int trace_selftest_startup_function_graph(struct tracer *trace, struct trace_array *tr) { int ret; unsigned long count; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + /* * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs */ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); @@ -518,7 +780,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -545,7 +807,7 @@ out: int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -557,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable interrupts for a bit */ local_irq_disable(); udelay(100); @@ -573,9 +835,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -584,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -594,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -619,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption for a bit */ preempt_disable(); udelay(100); @@ -635,9 +897,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -646,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -656,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -681,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption and interrupts for a bit */ preempt_disable(); @@ -701,11 +963,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (ret) goto out; @@ -716,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* do the test by disabling interrupts first this time */ - tracing_max_latency = 0; + tr->max_latency = 0; tracing_start(); trace->start(tr); @@ -731,11 +993,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -747,7 +1009,7 @@ out: tracing_start(); out_no_start: trace->reset(tr); - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -765,11 +1027,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { - /* Make this a RT thread, doesn't need to be too high */ - static const struct sched_param param = { .sched_priority = 5 }; + /* Make this a -deadline thread */ + static const struct sched_attr attr = { + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL + }; struct completion *x = data; - sched_setscheduler(current, SCHED_FIFO, ¶m); + sched_setattr(current, &attr); /* Make it know we have a new prio */ complete(x); @@ -778,11 +1045,13 @@ static int trace_wakeup_test_thread(void *data) set_current_state(TASK_INTERRUPTIBLE); schedule(); + complete(x); + /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* - * This is an RT task, do short sleeps to let - * others run. + * This will likely be the system top priority + * task, do short sleeps to let others run. */ msleep(100); } @@ -793,23 +1062,23 @@ static int trace_wakeup_test_thread(void *data) int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; struct task_struct *p; - struct completion isrt; + struct completion is_ready; unsigned long count; int ret; - init_completion(&isrt); + init_completion(&is_ready); - /* create a high prio thread */ - p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + /* create a -deadline thread */ + p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } - /* make sure the thread is running at an RT prio */ - wait_for_completion(&isrt); + /* make sure the thread is running at -deadline policy */ + wait_for_completion(&is_ready); /* start the tracing */ ret = tracer_init(trace, tr); @@ -819,39 +1088,37 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; - /* sleep to let the RT thread sleep too */ - msleep(100); + while (p->on_rq) { + /* + * Sleep to make sure the -deadline thread is asleep too. + * On virtual machines we can't rely on timings, + * but we want to make sure this test still works. + */ + msleep(100); + } - /* - * Yes this is slightly racy. It is possible that for some - * strange reason that the RT thread we created, did not - * call schedule for 100ms after doing the completion, - * and we do a wakeup on a task that already is awake. - * But that is extremely unlikely, and the worst thing that - * happens in such a case, is that we disable tracing. - * Honestly, if this race does happen something is horrible - * wrong with the system. - */ + init_completion(&is_ready); wake_up_process(p); - /* give a little time to let the thread wake up */ - msleep(100); + /* Wait for the task to wake up */ + wait_for_completion(&is_ready); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); + printk("ret = %d\n", ret); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); - tracing_max_latency = save_max; + tr->max_latency = save_max; /* kill the thread */ kthread_stop(p); @@ -884,7 +1151,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -916,7 +1183,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d4545f49242..8a4e5cb66a4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,7 @@ #include <linux/sysctl.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/magic.h> #include <asm/setup.h> @@ -20,61 +21,114 @@ #define STACK_TRACE_ENTRIES 500 +#ifdef CC_USING_FENTRY +# define fentry 1 +#else +# define fentry 0 +#endif + static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. + */ static struct stack_trace max_stack_trace = { - .max_entries = STACK_TRACE_ENTRIES, - .entries = stack_dump_trace, + .max_entries = STACK_TRACE_ENTRIES - 1, + .entries = &stack_dump_trace[1], }; static unsigned long max_stack_size; static arch_spinlock_t max_stack_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -static inline void check_stack(void) +static inline void print_max_stack(void) +{ + long i; + int size; + + pr_emerg(" Depth Size Location (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries - 1); + + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + size, (void *)stack_dump_trace[i]); + } +} + +static inline void +check_stack(unsigned long ip, unsigned long *stack) { - unsigned long this_size, flags; - unsigned long *p, *top, *start; + unsigned long this_size, flags; unsigned long *p, *top, *start; + static int tracer_frame; + int frame_size = ACCESS_ONCE(tracer_frame); int i; - this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); + this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; + /* Remove the frame of the tracer */ + this_size -= frame_size; if (this_size <= max_stack_size) return; /* we do not handle interrupt stacks yet */ - if (!object_is_on_stack(&this_size)) + if (!object_is_on_stack(stack)) return; local_irq_save(flags); arch_spin_lock(&max_stack_lock); + /* In case another CPU set the tracer_frame on us */ + if (unlikely(!frame_size)) + this_size -= tracer_frame; + /* a race could have already updated it */ if (this_size <= max_stack_size) goto out; max_stack_size = this_size; - max_stack_trace.nr_entries = 0; - max_stack_trace.skip = 3; + max_stack_trace.nr_entries = 0; + + if (using_ftrace_ops_list_func()) + max_stack_trace.skip = 4; + else + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); /* + * Add the passed in ip from the function tracer. + * Searching for this on the stack will skip over + * most of the overhead from the stack tracer itself. + */ + stack_dump_trace[0] = ip; + max_stack_trace.nr_entries++; + + /* * Now find where in the stack these are. */ i = 0; - start = &this_size; + start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -98,6 +152,18 @@ static inline void check_stack(void) found = 1; /* Start the search from here */ start = p + 1; + /* + * We do not want to show the overhead + * of the stack tracer stack in the + * max stack. If we haven't figured + * out what that is, then figure it out + * now. + */ + if (unlikely(!tracer_frame) && i == 1) { + tracer_frame = (p - stack) * + sizeof(unsigned long); + max_stack_size -= tracer_frame; + } } } @@ -105,19 +171,24 @@ static inline void check_stack(void) i++; } + if ((current != &init_task && + *(end_of_stack(current)) != STACK_END_MAGIC)) { + print_max_stack(); + BUG(); + } + out: arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); } static void -stack_trace_call(unsigned long ip, unsigned long parent_ip) +stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { + unsigned long stack; int cpu; - if (unlikely(!ftrace_enabled || stack_trace_disabled)) - return; - preempt_disable_notrace(); cpu = raw_smp_processor_id(); @@ -125,7 +196,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) if (per_cpu(trace_active, cpu)++ != 0) goto out; - check_stack(); + /* + * When fentry is used, the traced function does not get + * its stack frame set up, and we lose the parent. + * The ip is pretty useless because the function tracer + * was called before that function set up its stack frame. + * In this case, we use the parent ip. + * + * By adding the return address of either the parent ip + * or the current ip we can disregard most of the stack usage + * caused by the stack tracer itself. + * + * The function tracer always reports the address of where the + * mcount call was, but the stack will hold the return address. + */ + if (fentry) + ip = parent_ip; + else + ip += MCOUNT_INSN_SIZE; + + check_stack(ip, &stack); out: per_cpu(trace_active, cpu)--; @@ -136,6 +226,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t @@ -324,7 +415,7 @@ static const struct file_operations stack_trace_filter_fops = { .open = stack_trace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -373,6 +464,8 @@ static __init int stack_trace_init(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; trace_create_file("stack_max_size", 0644, d_tracer, &max_stack_size, &stack_max_size_fops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e7..7af67360b33 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; -/* - * Iterate through the rbtree using a post order traversal path - * to release the next node. - * It won't necessary release one at each iteration - * but it will at least advance closer to the next one - * to be released. - */ -static struct rb_node *release_next(struct tracer_stat *ts, - struct rb_node *node) +static void __reset_stat_session(struct stat_session *session) { - struct stat_node *snode; - struct rb_node *parent = rb_parent(node); - - if (node->rb_left) - return node->rb_left; - else if (node->rb_right) - return node->rb_right; - else { - if (!parent) - ; - else if (parent->rb_left == node) - parent->rb_left = NULL; - else - parent->rb_right = NULL; + struct stat_node *snode, *n; - snode = container_of(node, struct stat_node, node); - if (ts->stat_release) - ts->stat_release(snode->stat); + rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { + if (session->ts->stat_release) + session->ts->stat_release(snode->stat); kfree(snode); - - return parent; } -} - -static void __reset_stat_session(struct stat_session *session) -{ - struct rb_node *node = session->stat_root.rb_node; - - while (node) - node = release_next(session->ts, node); session->stat_root = RB_ROOT; } @@ -307,6 +276,8 @@ static int tracing_stat_init(void) struct dentry *d_tracing; d_tracing = tracing_init_dentry(); + if (!d_tracing) + return 0; stat_dir = debugfs_create_dir("trace_stat", d_tracing); if (!stat_dir) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb654542c1a..759d5e00451 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,6 @@ #include <trace/syscall.h> #include <trace/events/syscalls.h> +#include <linux/syscalls.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ @@ -11,18 +12,11 @@ #include "trace.h" static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type); - -static int syscall_enter_define_fields(struct ftrace_event_call *call); -static int syscall_exit_define_fields(struct ftrace_event_call *call); + enum trace_reg type, void *data); static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) @@ -32,30 +26,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call) return &entry->enter_fields; } -struct trace_event_functions enter_syscall_print_funcs = { - .trace = print_syscall_enter, -}; - -struct trace_event_functions exit_syscall_print_funcs = { - .trace = print_syscall_exit, -}; - -struct ftrace_event_class event_class_syscall_enter = { - .system = "syscalls", - .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, - .get_fields = syscall_get_enter_fields, - .raw_init = init_syscall_trace, -}; - -struct ftrace_event_class event_class_syscall_exit = { - .system = "syscalls", - .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, - .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), - .raw_init = init_syscall_trace, -}; - extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; @@ -67,13 +37,45 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name /* * Only compare after the "sys" prefix. Archs that use * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted + * with ".SyS" or ".sys" instead of "sys", leading to an unwanted * mismatch. */ return !strcmp(sym + 3, name + 3); } #endif +#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +/* + * Some architectures that allow for 32bit applications + * to run on a 64bit kernel, do not map the syscalls for + * the 32bit tasks the same as they do for 64bit tasks. + * + * *cough*x86*cough* + * + * In such a case, instead of reporting the wrong syscalls, + * simply ignore them. + * + * For an arch to ignore the compat syscalls it needs to + * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * define the function arch_trace_is_compat_syscall() to let + * the tracing system know that it should ignore it. + */ +static int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + if (unlikely(arch_trace_is_compat_syscall(regs))) + return -1; + + return syscall_get_nr(task, regs); +} +#else +static inline int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + return syscall_get_nr(task, regs); +} +#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -104,7 +106,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } -enum print_line_t +static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -157,7 +159,7 @@ end: return TRACE_TYPE_HANDLED; } -enum print_line_t +static enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -173,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, entry = syscall_nr_to_meta(syscall); if (!entry) { - trace_seq_printf(s, "\n"); + trace_seq_putc(s, '\n'); return TRACE_TYPE_HANDLED; } @@ -198,8 +200,8 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -static -int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) +static int __init +__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { int i; int pos = 0; @@ -226,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) return pos; } -static int set_syscall_print_fmt(struct ftrace_event_call *call) +static int __init set_syscall_print_fmt(struct ftrace_event_call *call) { char *print_fmt; int len; @@ -251,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call) return 0; } -static void free_syscall_print_fmt(struct ftrace_event_call *call) +static void __init free_syscall_print_fmt(struct ftrace_event_call *call) { struct syscall_metadata *entry = call->data; @@ -259,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -static int syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -282,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -static int syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct ftrace_event_call *call) { struct syscall_trace_exit trace; int ret; @@ -297,19 +299,29 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { + struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; - int size; + unsigned long irq_flags; + int pc; int syscall_nr; + int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, enabled_enter_syscalls)) + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ + ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -318,8 +330,12 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(&buffer, - sys_data->enter_event->event.type, size, 0, 0); + local_save_flags(irq_flags); + pc = preempt_count(); + + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, + sys_data->enter_event->event.type, size, irq_flags, pc); if (!event) return; @@ -327,31 +343,45 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - if (!filter_current_check_discard(buffer, sys_data->enter_event, - entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); } -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { + struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; int syscall_nr; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, enabled_exit_syscalls)) + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ + ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; - event = trace_current_buffer_lock_reserve(&buffer, - sys_data->exit_event->event.type, sizeof(*entry), 0, 0); + local_save_flags(irq_flags); + pc = preempt_count(); + + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, + sys_data->exit_event->event.type, sizeof(*entry), + irq_flags, pc); if (!event) return; @@ -359,13 +389,14 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - if (!filter_current_check_discard(buffer, sys_data->exit_event, - entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); } -int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int ret = 0; int num; @@ -373,33 +404,37 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); - if (!sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); + if (!tr->sys_refcount_enter) + ret = register_trace_sys_enter(ftrace_syscall_enter, tr); if (!ret) { - set_bit(num, enabled_enter_syscalls); - sys_refcount_enter++; + rcu_assign_pointer(tr->enter_syscall_files[num], file); + tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); return ret; } -void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); - sys_refcount_enter--; - clear_bit(num, enabled_enter_syscalls); - if (!sys_refcount_enter) - unregister_trace_sys_enter(ftrace_syscall_enter, NULL); + tr->sys_refcount_enter--; + rcu_assign_pointer(tr->enter_syscall_files[num], NULL); + if (!tr->sys_refcount_enter) + unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int ret = 0; int num; @@ -407,32 +442,34 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); - if (!sys_refcount_exit) - ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); + if (!tr->sys_refcount_exit) + ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - set_bit(num, enabled_exit_syscalls); - sys_refcount_exit++; + rcu_assign_pointer(tr->exit_syscall_files[num], file); + tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); return ret; } -void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); - sys_refcount_exit--; - clear_bit(num, enabled_exit_syscalls); - if (!sys_refcount_exit) - unregister_trace_sys_exit(ftrace_syscall_exit, NULL); + tr->sys_refcount_exit--; + rcu_assign_pointer(tr->exit_syscall_files[num], NULL); + if (!tr->sys_refcount_exit) + unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); } -int init_syscall_trace(struct ftrace_event_call *call) +static int __init init_syscall_trace(struct ftrace_event_call *call) { int id; int num; @@ -457,19 +494,43 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct ftrace_event_class __refdata event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct ftrace_event_class __refdata event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, +}; + unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; } -int __init init_ftrace_syscalls(void) +static int __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; int i; - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); + syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), + GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); return -ENOMEM; @@ -487,7 +548,7 @@ int __init init_ftrace_syscalls(void) return 0; } -core_initcall(init_ftrace_syscalls); +early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS @@ -505,7 +566,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -513,15 +576,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* get the size after alignment with the u32 buffer size field */ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) - return; - rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, sys_data->enter_event->event.type, regs, &rctx); if (!rec) @@ -530,12 +593,10 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - - head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -int perf_sysenter_enable(struct ftrace_event_call *call) +static int perf_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -556,7 +617,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call) return ret; } -void perf_sysenter_disable(struct ftrace_event_call *call) +static void perf_sysenter_disable(struct ftrace_event_call *call) { int num; @@ -579,7 +640,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int rctx; int size; - syscall_nr = syscall_get_nr(current, regs); + syscall_nr = trace_get_syscall_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; @@ -587,18 +650,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) + return; + /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - /* - * Impossible, but be paranoid with the future - * How to put this check outside runtime? - */ - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "exit event has grown above perf buffer size")) - return; - rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, sys_data->exit_event->event.type, regs, &rctx); if (!rec) @@ -606,12 +665,10 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - - head = this_cpu_ptr(sys_data->exit_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } -int perf_sysexit_enable(struct ftrace_event_call *call) +static int perf_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -632,7 +689,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call) return ret; } -void perf_sysexit_disable(struct ftrace_event_call *call) +static void perf_sysexit_disable(struct ftrace_event_call *call) { int num; @@ -649,13 +706,15 @@ void perf_sysexit_disable(struct ftrace_event_call *call) #endif /* CONFIG_PERF_EVENTS */ static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: - return reg_event_syscall_enter(event); + return reg_event_syscall_enter(file, event); case TRACE_REG_UNREGISTER: - unreg_event_syscall_enter(event); + unreg_event_syscall_enter(file, event); return 0; #ifdef CONFIG_PERF_EVENTS @@ -664,19 +723,26 @@ static int syscall_enter_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysenter_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; } static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: - return reg_event_syscall_exit(event); + return reg_event_syscall_exit(file, event); case TRACE_REG_UNREGISTER: - unreg_event_syscall_exit(event); + unreg_event_syscall_exit(file, event); return 0; #ifdef CONFIG_PERF_EVENTS @@ -685,6 +751,11 @@ static int syscall_exit_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysexit_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; #endif } return 0; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c new file mode 100644 index 00000000000..3c9b97e6b1f --- /dev/null +++ b/kernel/trace/trace_uprobe.c @@ -0,0 +1,1340 @@ +/* + * uprobes-based tracing events + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (C) IBM Corporation, 2010-2012 + * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include <linux/namei.h> +#include <linux/string.h> + +#include "trace_probe.h" + +#define UPROBE_EVENT_SYSTEM "uprobes" + +struct uprobe_trace_entry_head { + struct trace_entry ent; + unsigned long vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return) \ + (sizeof(struct uprobe_trace_entry_head) + \ + sizeof(unsigned long) * (is_return ? 2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return) \ + ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) + +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + +/* + * uprobe event core functions + */ +struct trace_uprobe { + struct list_head list; + struct trace_uprobe_filter filter; + struct uprobe_consumer consumer; + struct inode *inode; + char *filename; + unsigned long offset; + unsigned long nhit; + struct trace_probe tp; +}; + +#define SIZEOF_TRACE_UPROBE(n) \ + (offsetof(struct trace_uprobe, tp.args) + \ + (sizeof(struct probe_arg) * (n))) + +static int register_uprobe_event(struct trace_uprobe *tu); +static int unregister_uprobe_event(struct trace_uprobe *tu); + +static DEFINE_MUTEX(uprobe_lock); +static LIST_HEAD(uprobe_list); + +struct uprobe_dispatch_data { + struct trace_uprobe *tu; + unsigned long bp_addr; +}; + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs); + +#ifdef CONFIG_STACK_GROWSUP +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr - (n * sizeof(long)); +} +#else +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr + (n * sizeof(long)); +} +#endif + +static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long ret; + unsigned long addr = user_stack_pointer(regs); + + addr = adjust_stack_addr(addr, n); + + if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) + return 0; + + return ret; +} + +/* + * Uprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)get_user_stack_nth(regs, \ + ((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ + void *addr, void *dest) \ +{ \ + type retval; \ + void __user *vaddr = (void __force __user *) addr; \ + \ + if (copy_from_user(&retval, vaddr, sizeof(type))) \ + *(type *)dest = 0; \ + else \ + *(type *) dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + u32 rloc = *(u32 *)dest; + int maxlen = get_rloc_len(rloc); + u8 *dst = get_rloc_data(dest); + void __user *src = (void __force __user *) addr; + + if (!maxlen) + return; + + ret = strncpy_from_user(dst, src, maxlen); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); + } else { + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + } +} + +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int len; + void __user *vaddr = (void __force __user *) addr; + + len = strnlen_user(vaddr, MAX_STRING_SIZE); + + if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +static unsigned long translate_user_vaddr(void *file_offset) +{ + unsigned long base_addr; + struct uprobe_dispatch_data *udd; + + udd = (void *) current->utask->vaddr; + + base_addr = udd->bp_addr - udd->tu->offset; + return base_addr + (unsigned long)file_offset; +} + +#define DEFINE_FETCH_file_offset(type) \ +static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ + void *offset, void *dest)\ +{ \ + void *vaddr = (void *)translate_user_vaddr(offset); \ + \ + FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ +} +DEFINE_BASIC_FETCH_FUNCS(file_offset) +DEFINE_FETCH_file_offset(string) +DEFINE_FETCH_file_offset(string_size) + +/* Fetch type information table */ +const struct fetch_type uprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ + return tu->consumer.ret_handler != NULL; +} + +/* + * Allocate new trace_uprobe and initialize it (including uprobes). + */ +static struct trace_uprobe * +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) +{ + struct trace_uprobe *tu; + + if (!event || !is_good_name(event)) + return ERR_PTR(-EINVAL); + + if (!group || !is_good_name(group)) + return ERR_PTR(-EINVAL); + + tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); + if (!tu) + return ERR_PTR(-ENOMEM); + + tu->tp.call.class = &tu->tp.class; + tu->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tu->tp.call.name) + goto error; + + tu->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tu->tp.class.system) + goto error; + + INIT_LIST_HEAD(&tu->list); + INIT_LIST_HEAD(&tu->tp.files); + tu->consumer.handler = uprobe_dispatcher; + if (is_ret) + tu->consumer.ret_handler = uretprobe_dispatcher; + init_trace_uprobe_filter(&tu->filter); + tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; + return tu; + +error: + kfree(tu->tp.call.name); + kfree(tu); + + return ERR_PTR(-ENOMEM); +} + +static void free_trace_uprobe(struct trace_uprobe *tu) +{ + int i; + + for (i = 0; i < tu->tp.nr_args; i++) + traceprobe_free_probe_arg(&tu->tp.args[i]); + + iput(tu->inode); + kfree(tu->tp.call.class->system); + kfree(tu->tp.call.name); + kfree(tu->filename); + kfree(tu); +} + +static struct trace_uprobe *find_probe_event(const char *event, const char *group) +{ + struct trace_uprobe *tu; + + list_for_each_entry(tu, &uprobe_list, list) + if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 && + strcmp(tu->tp.call.class->system, group) == 0) + return tu; + + return NULL; +} + +/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ +static int unregister_trace_uprobe(struct trace_uprobe *tu) +{ + int ret; + + ret = unregister_uprobe_event(tu); + if (ret) + return ret; + + list_del(&tu->list); + free_trace_uprobe(tu); + return 0; +} + +/* Register a trace_uprobe and probe_event */ +static int register_trace_uprobe(struct trace_uprobe *tu) +{ + struct trace_uprobe *old_tu; + int ret; + + mutex_lock(&uprobe_lock); + + /* register as an event */ + old_tu = find_probe_event(ftrace_event_name(&tu->tp.call), + tu->tp.call.class->system); + if (old_tu) { + /* delete old event */ + ret = unregister_trace_uprobe(old_tu); + if (ret) + goto end; + } + + ret = register_uprobe_event(tu); + if (ret) { + pr_warning("Failed to register probe event(%d)\n", ret); + goto end; + } + + list_add_tail(&tu->list, &uprobe_list); + +end: + mutex_unlock(&uprobe_lock); + + return ret; +} + +/* + * Argument syntax: + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] + * + * - Remove uprobe: -:[GRP/]EVENT + */ +static int create_trace_uprobe(int argc, char **argv) +{ + struct trace_uprobe *tu; + struct inode *inode; + char *arg, *event, *group, *filename; + char buf[MAX_EVENT_NAME_LEN]; + struct path path; + unsigned long offset; + bool is_delete, is_return; + int i, ret; + + inode = NULL; + ret = 0; + is_delete = false; + is_return = false; + event = NULL; + group = NULL; + + /* argc must be >= 1 */ + if (argv[0][0] == '-') + is_delete = true; + else if (argv[0][0] == 'r') + is_return = true; + else if (argv[0][0] != 'p') { + pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + arg = strchr(event, '/'); + + if (arg) { + group = event; + event = arg + 1; + event[-1] = '\0'; + + if (strlen(group) == 0) { + pr_info("Group name is not specified\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specified\n"); + return -EINVAL; + } + } + if (!group) + group = UPROBE_EVENT_SYSTEM; + + if (is_delete) { + int ret; + + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + mutex_lock(&uprobe_lock); + tu = find_probe_event(event, group); + + if (!tu) { + mutex_unlock(&uprobe_lock); + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + ret = unregister_trace_uprobe(tu); + mutex_unlock(&uprobe_lock); + return ret; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + if (isdigit(argv[1][0])) { + pr_info("probe point must be have a filename.\n"); + return -EINVAL; + } + arg = strchr(argv[1], ':'); + if (!arg) { + ret = -EINVAL; + goto fail_address_parse; + } + + *arg++ = '\0'; + filename = argv[1]; + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_address_parse; + + inode = igrab(path.dentry->d_inode); + path_put(&path); + + if (!inode || !S_ISREG(inode->i_mode)) { + ret = -EINVAL; + goto fail_address_parse; + } + + ret = kstrtoul(arg, 0, &offset); + if (ret) + goto fail_address_parse; + + argc -= 2; + argv += 2; + + /* setup a probe */ + if (!event) { + char *tail; + char *ptr; + + tail = kstrdup(kbasename(filename), GFP_KERNEL); + if (!tail) { + ret = -ENOMEM; + goto fail_address_parse; + } + + ptr = strpbrk(tail, ".-_"); + if (ptr) + *ptr = '\0'; + + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); + event = buf; + kfree(tail); + } + + tu = alloc_trace_uprobe(group, event, argc, is_return); + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); + ret = PTR_ERR(tu); + goto fail_address_parse; + } + tu->offset = offset; + tu->inode = inode; + tu->filename = kstrdup(filename, GFP_KERNEL); + + if (!tu->filename) { + pr_info("Failed to allocate filename.\n"); + ret = -ENOMEM; + goto error; + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + /* Increment count for freeing args in error case */ + tu->tp.nr_args++; + + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) { + *arg++ = '\0'; + parg->name = kstrdup(argv[i], GFP_KERNEL); + } else { + arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + parg->name = kstrdup(buf, GFP_KERNEL); + } + + if (!parg->name) { + pr_info("Failed to allocate argument[%d] name.\n", i); + ret = -ENOMEM; + goto error; + } + + if (!is_good_name(parg->name)) { + pr_info("Invalid argument[%d] name: %s\n", i, parg->name); + ret = -EINVAL; + goto error; + } + + if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { + pr_info("Argument[%d] name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + /* Parse fetch argument */ + ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, + is_return, false); + if (ret) { + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + goto error; + } + } + + ret = register_trace_uprobe(tu); + if (ret) + goto error; + return 0; + +error: + free_trace_uprobe(tu); + return ret; + +fail_address_parse: + if (inode) + iput(inode); + + pr_info("Failed to parse address or file.\n"); + + return ret; +} + +static int cleanup_all_probes(void) +{ + struct trace_uprobe *tu; + int ret = 0; + + mutex_lock(&uprobe_lock); + while (!list_empty(&uprobe_list)) { + tu = list_entry(uprobe_list.next, struct trace_uprobe, list); + ret = unregister_trace_uprobe(tu); + if (ret) + break; + } + mutex_unlock(&uprobe_lock); + return ret; +} + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&uprobe_lock); + return seq_list_start(&uprobe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &uprobe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&uprobe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_uprobe *tu = v; + char c = is_ret_probe(tu) ? 'r' : 'p'; + int i; + + seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, + ftrace_event_name(&tu->tp.call)); + seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + + for (i = 0; i < tu->tp.nr_args; i++) + seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); + + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = cleanup_all_probes(); + if (ret) + return ret; + } + + return seq_open(file, &probes_seq_op); +} + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); +} + +static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_uprobe *tu = v; + + seq_printf(m, " %s %-44s %15lu\n", tu->filename, + ftrace_event_name(&tu->tp.call), tu->nhit); + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations uprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct uprobe_cpu_buffer { + struct mutex mutex; + void *buf; +}; +static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; +static int uprobe_buffer_refcnt; + +static int uprobe_buffer_init(void) +{ + int cpu, err_cpu; + + uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); + if (uprobe_cpu_buffer == NULL) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct page *p = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL, 0); + if (p == NULL) { + err_cpu = cpu; + goto err; + } + per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p); + mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); + } + + return 0; + +err: + for_each_possible_cpu(cpu) { + if (cpu == err_cpu) + break; + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); + } + + free_percpu(uprobe_cpu_buffer); + return -ENOMEM; +} + +static int uprobe_buffer_enable(void) +{ + int ret = 0; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (uprobe_buffer_refcnt++ == 0) { + ret = uprobe_buffer_init(); + if (ret < 0) + uprobe_buffer_refcnt--; + } + + return ret; +} + +static void uprobe_buffer_disable(void) +{ + int cpu; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (--uprobe_buffer_refcnt == 0) { + for_each_possible_cpu(cpu) + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, + cpu)->buf); + + free_percpu(uprobe_cpu_buffer); + uprobe_cpu_buffer = NULL; + } +} + +static struct uprobe_cpu_buffer *uprobe_buffer_get(void) +{ + struct uprobe_cpu_buffer *ucb; + int cpu; + + cpu = raw_smp_processor_id(); + ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); + + /* + * Use per-cpu buffers for fastest access, but we might migrate + * so the mutex makes sure we have sole access to it. + */ + mutex_lock(&ucb->mutex); + + return ucb; +} + +static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) +{ + mutex_unlock(&ucb->mutex); +} + +static void __uprobe_trace_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize, + struct ftrace_event_file *ftrace_file) +{ + struct uprobe_trace_entry_head *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + void *data; + int size, esize; + struct ftrace_event_call *call = &tu->tp.call; + + WARN_ON(call != ftrace_file->event_call); + + if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + size = esize + tu->tp.size + dsize; + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, size, 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + + memcpy(data, ucb->buf, tu->tp.size + dsize); + + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); +} + +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct event_file_link *link; + + if (is_ret_probe(tu)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + rcu_read_unlock(); + + return 0; +} + +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct event_file_link *link; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + rcu_read_unlock(); +} + +/* Event entry printers */ +static enum print_line_t +print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) +{ + struct uprobe_trace_entry_head *entry; + struct trace_seq *s = &iter->seq; + struct trace_uprobe *tu; + u8 *data; + int i; + + entry = (struct uprobe_trace_entry_head *)iter->ent; + tu = container_of(event, struct trace_uprobe, tp.call.event); + + if (is_ret_probe(tu)) { + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[1], entry->vaddr[0])) + goto partial; + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + if (!trace_seq_printf(s, "%s: (0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[0])) + goto partial; + data = DATAOF_TRACE_ENTRY(entry, false); + } + + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + if (!parg->type->print(s, parg->name, data + parg->offset, entry)) + goto partial; + } + + if (trace_seq_puts(s, "\n")) + return TRACE_TYPE_HANDLED; + +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); + +static int +probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, + filter_func_t filter) +{ + bool enabled = trace_probe_is_enabled(&tu->tp); + struct event_file_link *link = NULL; + int ret; + + if (file) { + if (tu->tp.flags & TP_FLAG_PROFILE) + return -EINTR; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + link->file = file; + list_add_tail_rcu(&link->list, &tu->tp.files); + + tu->tp.flags |= TP_FLAG_TRACE; + } else { + if (tu->tp.flags & TP_FLAG_TRACE) + return -EINTR; + + tu->tp.flags |= TP_FLAG_PROFILE; + } + + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + + if (enabled) + return 0; + + ret = uprobe_buffer_enable(); + if (ret) + goto err_flags; + + tu->consumer.filter = filter; + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (ret) + goto err_buffer; + + return 0; + + err_buffer: + uprobe_buffer_disable(); + + err_flags: + if (file) { + list_del(&link->list); + kfree(link); + tu->tp.flags &= ~TP_FLAG_TRACE; + } else { + tu->tp.flags &= ~TP_FLAG_PROFILE; + } + return ret; +} + +static void +probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) +{ + if (!trace_probe_is_enabled(&tu->tp)) + return; + + if (file) { + struct event_file_link *link; + + link = find_event_file_link(&tu->tp, file); + if (!link) + return; + + list_del_rcu(&link->list); + /* synchronize with u{,ret}probe_trace_func */ + synchronize_sched(); + kfree(link); + + if (!list_empty(&tu->tp.files)) + return; + } + + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; + + uprobe_buffer_disable(); +} + +static int uprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i, size; + struct uprobe_trace_entry_head field; + struct trace_uprobe *tu = event_call->data; + + if (is_ret_probe(tu)) { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); + size = SIZEOF_TRACE_ENTRY(true); + } else { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); + size = SIZEOF_TRACE_ENTRY(false); + } + /* Set argument names as fields */ + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, size + parg->offset, + parg->type->size, parg->type->is_signed, + FILTER_OTHER); + + if (ret) + return ret; + } + return 0; +} + +#ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.tp_target->mm == mm) + return true; + } + + return false; +} + +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ + return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ + bool done; + + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) { + list_del(&event->hw.tp_list); + done = tu->filter.nr_systemwide || + (event->hw.tp_target->flags & PF_EXITING) || + uprobe_filter_event(tu, event); + } else { + tu->filter.nr_systemwide--; + done = tu->filter.nr_systemwide; + } + write_unlock(&tu->filter.rwlock); + + if (!done) + return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + + return 0; +} + +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ + bool done; + int err; + + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. + */ + done = tu->filter.nr_systemwide || + event->parent || event->attr.enable_on_exec || + uprobe_filter_event(tu, event); + list_add(&event->hw.tp_list, &tu->filter.perf_events); + } else { + done = tu->filter.nr_systemwide; + tu->filter.nr_systemwide++; + } + write_unlock(&tu->filter.rwlock); + + err = 0; + if (!done) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) + uprobe_perf_close(tu, event); + } + return err; +} + +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + read_lock(&tu->filter.rwlock); + ret = __uprobe_perf_filter(&tu->filter, mm); + read_unlock(&tu->filter.rwlock); + + return ret; +} + +static void __uprobe_perf_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct ftrace_event_call *call = &tu->tp.call; + struct uprobe_trace_entry_head *entry; + struct hlist_head *head; + void *data; + int size, esize; + int rctx; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + size = esize + tu->tp.size + dsize; + size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) + return; + + preempt_disable(); + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + goto out; + + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + if (!entry) + goto out; + + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + + memcpy(data, ucb->buf, tu->tp.size + dsize); + + if (size - esize > tu->tp.size + dsize) { + int len = tu->tp.size + dsize; + + memset(data + len, 0, size - esize - len); + } + + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + out: + preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + + if (!is_ret_probe(tu)) + __uprobe_perf_func(tu, 0, regs, ucb, dsize); + return 0; +} + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + __uprobe_perf_func(tu, func, regs, ucb, dsize); +} +#endif /* CONFIG_PERF_EVENTS */ + +static int +trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, + void *data) +{ + struct trace_uprobe *tu = event->data; + struct ftrace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return probe_event_enable(tu, file, NULL); + + case TRACE_REG_UNREGISTER: + probe_event_disable(tu, file); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_event_enable(tu, NULL, uprobe_perf_filter); + + case TRACE_REG_PERF_UNREGISTER: + probe_event_disable(tu, NULL); + return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(tu, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(tu, data); + +#endif + default: + return 0; + } + return 0; +} + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + int ret = 0; + + + tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; + + udd.tu = tu; + udd.bp_addr = instruction_pointer(regs); + + current->utask->vaddr = (unsigned long) &udd; + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + + if (tu->tp.flags & TP_FLAG_TRACE) + ret |= uprobe_trace_func(tu, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS + if (tu->tp.flags & TP_FLAG_PROFILE) + ret |= uprobe_perf_func(tu, regs, ucb, dsize); +#endif + uprobe_buffer_put(ucb); + return ret; +} + +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + tu = container_of(con, struct trace_uprobe, consumer); + + udd.tu = tu; + udd.bp_addr = func; + + current->utask->vaddr = (unsigned long) &udd; + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + + if (tu->tp.flags & TP_FLAG_TRACE) + uretprobe_trace_func(tu, func, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS + if (tu->tp.flags & TP_FLAG_PROFILE) + uretprobe_perf_func(tu, func, regs, ucb, dsize); +#endif + uprobe_buffer_put(ucb); + return 0; +} + +static struct trace_event_functions uprobe_funcs = { + .trace = print_uprobe_event +}; + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + struct ftrace_event_call *call = &tu->tp.call; + int ret; + + /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &uprobe_funcs; + call->class->define_fields = uprobe_event_define_fields; + + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) + return -ENOMEM; + + ret = register_ftrace_event(&call->event); + if (!ret) { + kfree(call->print_fmt); + return -ENODEV; + } + call->flags = 0; + call->class->reg = trace_uprobe_register; + call->data = tu; + ret = trace_add_event_call(call); + + if (ret) { + pr_info("Failed to register uprobe event: %s\n", + ftrace_event_name(call)); + kfree(call->print_fmt); + unregister_ftrace_event(&call->event); + } + + return ret; +} + +static int unregister_uprobe_event(struct trace_uprobe *tu) +{ + int ret; + + /* tu->event is unregistered in trace_remove_event_call() */ + ret = trace_remove_event_call(&tu->tp.call); + if (ret) + return ret; + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; + return 0; +} + +/* Make a trace interface for controling probe points */ +static __init int init_uprobe_trace(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("uprobe_events", 0644, d_tracer, + NULL, &uprobe_events_ops); + /* Profile interface */ + trace_create_file("uprobe_profile", 0444, d_tracer, + NULL, &uprobe_profile_ops); + return 0; +} + +fs_initcall(init_uprobe_trace); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a472..00000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Workqueue statistical tracer. - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - - -#include <trace/events/workqueue.h> -#include <linux/list.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/kref.h> -#include "trace_stat.h" -#include "trace.h" - - -/* A cpu workqueue thread */ -struct cpu_workqueue_stats { - struct list_head list; - struct kref kref; - int cpu; - pid_t pid; -/* Can be inserted from interrupt or user context, need to be atomic */ - atomic_t inserted; -/* - * Don't need to be atomic, works are serialized in a single workqueue thread - * on a single CPU. - */ - unsigned int executed; -}; - -/* List of workqueue threads on one cpu */ -struct workqueue_global_stats { - struct list_head list; - spinlock_t lock; -}; - -/* Don't need a global lock because allocated before the workqueues, and - * never freed. - */ -static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); -#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) - -static void cpu_workqueue_stat_free(struct kref *kref) -{ - kfree(container_of(kref, struct cpu_workqueue_stats, kref)); -} - -/* Insertion of a work */ -static void -probe_workqueue_insertion(void *ignore, - struct task_struct *wq_thread, - struct work_struct *work) -{ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { - if (node->pid == wq_thread->pid) { - atomic_inc(&node->inserted); - goto found; - } - } - pr_debug("trace_workqueue: entry not found\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Execution of a work */ -static void -probe_workqueue_execution(void *ignore, - struct task_struct *wq_thread, - struct work_struct *work) -{ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { - if (node->pid == wq_thread->pid) { - node->executed++; - goto found; - } - } - pr_debug("trace_workqueue: entry not found\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(void *ignore, - struct task_struct *wq_thread, int cpu) -{ - struct cpu_workqueue_stats *cws; - unsigned long flags; - - WARN_ON(cpu < 0); - - /* Workqueues are sometimes created in atomic context */ - cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); - if (!cws) { - pr_warning("trace_workqueue: not enough memory\n"); - return; - } - INIT_LIST_HEAD(&cws->list); - kref_init(&cws->kref); - cws->cpu = cpu; - cws->pid = wq_thread->pid; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Destruction of a cpu workqueue thread */ -static void -probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) -{ - /* Workqueue only execute on one cpu */ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { - if (node->pid == wq_thread->pid) { - list_del(&node->list); - kref_put(&node->kref, cpu_workqueue_stat_free); - goto found; - } - } - - pr_debug("trace_workqueue: don't find workqueue to destroy\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -} - -static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) -{ - unsigned long flags; - struct cpu_workqueue_stats *ret = NULL; - - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - - if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { - ret = list_entry(workqueue_cpu_stat(cpu)->list.next, - struct cpu_workqueue_stats, list); - kref_get(&ret->kref); - } - - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - - return ret; -} - -static void *workqueue_stat_start(struct tracer_stat *trace) -{ - int cpu; - void *ret = NULL; - - for_each_possible_cpu(cpu) { - ret = workqueue_stat_start_cpu(cpu); - if (ret) - return ret; - } - return NULL; -} - -static void *workqueue_stat_next(void *prev, int idx) -{ - struct cpu_workqueue_stats *prev_cws = prev; - struct cpu_workqueue_stats *ret; - int cpu = prev_cws->cpu; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - do { - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - return NULL; - } while (!(ret = workqueue_stat_start_cpu(cpu))); - return ret; - } else { - ret = list_entry(prev_cws->list.next, - struct cpu_workqueue_stats, list); - kref_get(&ret->kref); - } - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - - return ret; -} - -static int workqueue_stat_show(struct seq_file *s, void *p) -{ - struct cpu_workqueue_stats *cws = p; - struct pid *pid; - struct task_struct *tsk; - - pid = find_get_pid(cws->pid); - if (pid) { - tsk = get_pid_task(pid, PIDTYPE_PID); - if (tsk) { - seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, - atomic_read(&cws->inserted), cws->executed, - tsk->comm); - put_task_struct(tsk); - } - put_pid(pid); - } - - return 0; -} - -static void workqueue_stat_release(void *stat) -{ - struct cpu_workqueue_stats *node = stat; - - kref_put(&node->kref, cpu_workqueue_stat_free); -} - -static int workqueue_stat_headers(struct seq_file *s) -{ - seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); - seq_printf(s, "# | | | |\n"); - return 0; -} - -struct tracer_stat workqueue_stats __read_mostly = { - .name = "workqueues", - .stat_start = workqueue_stat_start, - .stat_next = workqueue_stat_next, - .stat_show = workqueue_stat_show, - .stat_release = workqueue_stat_release, - .stat_headers = workqueue_stat_headers -}; - - -int __init stat_workqueue_init(void) -{ - if (register_stat_tracer(&workqueue_stats)) { - pr_warning("Unable to register workqueue stat tracer\n"); - return 1; - } - - return 0; -} -fs_initcall(stat_workqueue_init); - -/* - * Workqueues are created very early, just after pre-smp initcalls. - * So we must register our tracepoints at this stage. - */ -int __init trace_workqueue_early_init(void) -{ - int ret, cpu; - - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - - ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); - if (ret) - goto out; - - ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); - if (ret) - goto no_insertion; - - ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); - if (ret) - goto no_execution; - - ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); - if (ret) - goto no_creation; - - return 0; - -no_creation: - unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); -no_execution: - unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); -no_insertion: - unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); -out: - pr_warning("trace_workqueue: unable to trace workqueues\n"); - - return 1; -} -early_initcall(trace_workqueue_early_init); |
