diff options
| author | Paul Mundt <lethal@linux-sh.org> | 2011-01-13 15:06:28 +0900 |
|---|---|---|
| committer | Paul Mundt <lethal@linux-sh.org> | 2011-01-13 15:06:28 +0900 |
| commit | f43dc23d5ea91fca257be02138a255f02d98e806 (patch) | |
| tree | b29722f6e965316e90ac97abf79923ced250dc21 /kernel/trace | |
| parent | f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff) | |
| parent | 4162cf64973df51fc885825bc9ca4d055891c49f (diff) | |
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework
Conflicts:
arch/sh/kernel/cpu/sh2/setup-sh7619.c
arch/sh/kernel/cpu/sh2a/setup-mxg.c
arch/sh/kernel/cpu/sh2a/setup-sh7201.c
arch/sh/kernel/cpu/sh2a/setup-sh7203.c
arch/sh/kernel/cpu/sh2a/setup-sh7206.c
arch/sh/kernel/cpu/sh3/setup-sh7705.c
arch/sh/kernel/cpu/sh3/setup-sh770x.c
arch/sh/kernel/cpu/sh3/setup-sh7710.c
arch/sh/kernel/cpu/sh3/setup-sh7720.c
arch/sh/kernel/cpu/sh4/setup-sh4-202.c
arch/sh/kernel/cpu/sh4/setup-sh7750.c
arch/sh/kernel/cpu/sh4/setup-sh7760.c
arch/sh/kernel/cpu/sh4a/setup-sh7343.c
arch/sh/kernel/cpu/sh4a/setup-sh7366.c
arch/sh/kernel/cpu/sh4a/setup-sh7722.c
arch/sh/kernel/cpu/sh4a/setup-sh7723.c
arch/sh/kernel/cpu/sh4a/setup-sh7724.c
arch/sh/kernel/cpu/sh4a/setup-sh7763.c
arch/sh/kernel/cpu/sh4a/setup-sh7770.c
arch/sh/kernel/cpu/sh4a/setup-sh7780.c
arch/sh/kernel/cpu/sh4a/setup-sh7785.c
arch/sh/kernel/cpu/sh4a/setup-sh7786.c
arch/sh/kernel/cpu/sh4a/setup-shx3.c
arch/sh/kernel/cpu/sh5/setup-sh5.c
drivers/serial/sh-sci.c
drivers/serial/sh-sci.h
include/linux/serial_sci.h
Diffstat (limited to 'kernel/trace')
40 files changed, 9416 insertions, 5378 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1551f47e766..14674dce77a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -11,38 +11,48 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool + help + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACER bool + help + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_TRACER bool + help + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool help - An arch may pass in a unique value (frame pointer) to both the - entering and exiting of a function. On exit, the value is compared - and if it does not match, then it will panic the kernel. + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - This gets selected when the arch tests the function_trace_stop - variable at the mcount call site. Otherwise, this variable - is tested by the called function. + See Documentation/trace/ftrace-design.txt config HAVE_DYNAMIC_FTRACE bool + help + See Documentation/trace/ftrace-design.txt config HAVE_FTRACE_MCOUNT_RECORD bool + help + See Documentation/trace/ftrace-design.txt -config HAVE_HW_BRANCH_TRACER +config HAVE_SYSCALL_TRACEPOINTS bool + help + See Documentation/trace/ftrace-design.txt -config HAVE_FTRACE_SYSCALLS +config HAVE_C_RECORDMCOUNT bool + help + C version of recordmcount available? config TRACER_MAX_TRACE bool @@ -59,16 +69,36 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool +config EVENT_POWER_TRACING_DEPRECATED + depends on EVENT_TRACING + bool "Deprecated power event trace API, to be removed" + default y + help + Provides old power event types: + C-state/idle accounting events: + power:power_start + power:power_end + and old cpufreq accounting event: + power:power_frequency + This is for userspace compatibility + and will vanish after 5 kernel iterations, + namely 2.6.41. + config CONTEXT_SWITCH_TRACER - select MARKERS bool +config RING_BUFFER_ALLOW_SWAP + bool + help + Allow the use of ring_buffer_swap_cpu. + Adds a very slight overhead to tracing when enabled. + # All tracer options should select GENERIC_TRACER. For those options that are # enabled by all tracers (context switch and event tracer) they select TRACING. # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options options. +# hiding of the automatic options. config TRACING bool @@ -104,21 +134,21 @@ menuconfig FTRACE bool "Tracers" default y if DEBUG_KERNEL help - Enable the kernel tracing infrastructure. + Enable the kernel tracing infrastructure. if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER + select FRAME_POINTER if !ARM_UNWIND && !S390 select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation - instruction to the beginning of every kernel function, which NOP + instruction at the beginning of every kernel function, which NOP sequence is then dynamically patched into a tracer call when tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very @@ -135,7 +165,7 @@ config FUNCTION_GRAPH_TRACER and its entry. Its first purpose is to trace the duration of functions and draw a call graph for each thread with some information like - the return value. This is done by setting the current return + the return value. This is done by setting the current return address on the current task structure into a stack of calls. @@ -143,10 +173,11 @@ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP help This option measures the time spent in irqs-off critical sections, with microsecond accuracy. @@ -157,19 +188,20 @@ config IRQSOFF_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the preempt-off timing option can be used together or separately.) config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET depends on PREEMPT select GENERIC_TRACER select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP help - This option measures the time spent in preemption off critical + This option measures the time spent in preemption-off critical sections, with microsecond accuracy. The default measurement method is a maximum search, which is @@ -178,19 +210,10 @@ config PREEMPT_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the irqs-off timing option can be used together or separately.) -config SYSPROF_TRACER - bool "Sysprof Tracer" - depends on X86 - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer provides the trace needed by the 'Sysprof' userspace - tool. - config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -205,35 +228,18 @@ config ENABLE_DEFAULT_TRACERS depends on !GENERIC_TRACER select TRACING help - This tracer hooks to various trace points in the kernel + This tracer hooks to various trace points in the kernel, allowing the user to pick and choose which trace point they want to trace. It also includes the sched_switch tracer plugin. config FTRACE_SYSCALLS bool "Trace syscalls" - depends on HAVE_FTRACE_SYSCALLS + depends on HAVE_SYSCALL_TRACEPOINTS select GENERIC_TRACER select KALLSYMS help Basic tracer to catch the syscall entry and exit events. -config BOOT_TRACER - bool "Trace boot initcalls" - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer helps developers to optimize boot times: it records - the timings of the initcalls and traces key events and the identity - of tasks that can cause boot delays, such as context-switches. - - Its aim is to be parsed by the /scripts/bootgraph.pl tool to - produce pretty graphics about boot inefficiencies, giving a visual - representation of the delays during initcalls - but the raw - /debug/tracing/trace text output is readable too. - - You must pass in ftrace=initcall to the kernel command line - to enable this on bootup. - config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -248,19 +254,19 @@ choice The likely/unlikely profiler only looks at the conditions that are annotated with a likely or unlikely macro. - The "all branch" profiler will profile every if statement in the + The "all branch" profiler will profile every if-statement in the kernel. This profiler will also enable the likely/unlikely - profiler as well. + profiler. - Either of the above profilers add a bit of overhead to the system. - If unsure choose "No branch profiling". + Either of the above profilers adds a bit of overhead to the system. + If unsure, choose "No branch profiling". config BRANCH_PROFILE_NONE bool "No branch profiling" help - No branch profiling. Branch profiling adds a bit of overhead. - Only enable it if you want to analyse the branching behavior. - Otherwise keep it disabled. + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" @@ -271,7 +277,7 @@ config PROFILE_ANNOTATED_BRANCHES /sys/kernel/debug/tracing/profile_annotated_branch - Note: this will add a significant overhead, only turn this + Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES @@ -288,7 +294,7 @@ config PROFILE_ALL_BRANCHES This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system - is to be analyzed + is to be analyzed in much detail. endchoice config TRACING_BRANCHES @@ -313,16 +319,6 @@ config BRANCH_TRACER Say N if unsure. -config POWER_TRACER - bool "Trace power consumption behavior" - depends on X86 - select GENERIC_TRACER - help - This tracer helps developers to analyze and optimize the kernels - power management decisions, specifically the C-state and P-state - behavior. - - config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER @@ -347,47 +343,8 @@ config STACK_TRACER Say N if unsure. -config HW_BRANCH_TRACER - depends on HAVE_HW_BRANCH_TRACER - bool "Trace hw branches" - select GENERIC_TRACER - help - This tracer records all branches on the system in a circular - buffer giving access to the last N branches for each cpu. - -config KMEMTRACE - bool "Trace SLAB allocations" - select GENERIC_TRACER - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/trace/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - -config WORKQUEUE_TRACER - bool "Trace workqueues" - select GENERIC_TRACER - help - The workqueue tracer provides some statistical informations - about each cpu workqueue thread such as the number of the - works inserted and executed since their creation. It can help - to evaluate the amount of work each of them have to perform. - For example it can help a developer to decide whether he should - choose a per cpu workqueue instead of a singlethreaded one. - config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" + bool "Support for tracing block IO actions" depends on SYSFS depends on BLOCK select RELAY @@ -411,38 +368,55 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config KPROBE_EVENT + depends on KPROBES + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable kprobes-based dynamic events" + select TRACING + default y + help + This allows the user to add tracing events (similar to tracepoints) + on the fly via the ftrace interface. See + Documentation/trace/kprobetrace.txt for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. + If you want to use perf tools, this option is strongly recommended. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replaces them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replace them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. - This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise - has native performance as long as no tracing is active. + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. A file is created - in debugfs called function_profile_enabled which defaults to zero. - When a 1 is echoed into this file profiling begins, and when a - zero is entered, profiling stops. A file in the trace_stats - directory called functions, that show the list of functions that - have been hit and their counters. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stats directory; this file shows the list of functions that + have been hit and their counters. - If in doubt, say N + If in doubt, say N. config FTRACE_MCOUNT_RECORD def_bool y @@ -462,6 +436,18 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI @@ -489,8 +475,8 @@ config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER help - This option creates a test to stress the ring buffer and bench mark it. - It creates its own ring buffer such that it will not interfer with + This option creates a test to stress the ring buffer and benchmark it. + It creates its own ring buffer such that it will not interfere with any other users of the ring buffer (such as ftrace). It then creates a producer and consumer that will run for 10 seconds and sleep for 10 seconds. Each interval it will print out the number of events @@ -499,7 +485,7 @@ config RING_BUFFER_BENCHMARK It does not disable interrupts or raise its priority, so it may be affected by processes that are running. - If unsure, say N + If unsure, say N. endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164dca90..761c510a06c 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o -obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o @@ -38,12 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o -obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o -obj-$(CONFIG_POWER_TRACER) += trace_power.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) @@ -52,7 +47,14 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o -obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 39af8af6fc3..7b8ec028154 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -21,6 +21,7 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/debugfs.h> #include <linux/time.h> #include <linux/uaccess.h> @@ -64,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; + struct ring_buffer *buffer = NULL; int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; if (blk_tracer) { + buffer = blk_tr->buffer; pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len, 0, pc); if (!event) @@ -95,7 +98,7 @@ record_it: memcpy((void *) t + sizeof(*t), data, len); if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); } } @@ -165,9 +168,11 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; +#define BLK_TC_RAHEAD BLK_TC_AHEAD + /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) +#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* * The worker for the various blk_add_trace*() types. Fills out a @@ -178,6 +183,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; + struct ring_buffer *buffer = NULL; struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; @@ -189,9 +195,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNCIO); - what |= MASK_TC_BIT(rw, AHEAD); + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); @@ -203,8 +208,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); + buffer = blk_tr->buffer; pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len, 0, pc); if (!event) @@ -251,7 +257,7 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, event, 0, pc); + trace_buffer_unlock_commit(buffer, event, 0, pc); return; } } @@ -266,8 +272,8 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); - debugfs_remove(bt->dir); relay_close(bt->rchan); + debugfs_remove(bt->dir); free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -317,6 +323,7 @@ static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, .open = blk_dropped_open, .read = blk_dropped_read, + .llseek = default_llseek, }; static int blk_msg_open(struct inode *inode, struct file *filp) @@ -356,6 +363,7 @@ static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, .open = blk_msg_open, .write = blk_msg_write, + .llseek = noop_llseek, }; /* @@ -377,18 +385,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, static int blk_remove_buf_file_callback(struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; debugfs_remove(dentry); - /* - * this will fail for all but the last file, but that is ok. what we - * care about is the top level buts->name directory going away, when - * the last trace file is gone. Then we don't have to rmdir() that - * manually on trace stop, so it nicely solves the issue with - * force killing of running traces. - */ - - debugfs_remove(parent); return 0; } @@ -545,13 +543,49 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return ret; - if (copy_to_user(arg, &buts, sizeof(buts))) + if (copy_to_user(arg, &buts, sizeof(buts))) { + blk_trace_remove(q); return -EFAULT; - + } return 0; } EXPORT_SYMBOL_GPL(blk_trace_setup); +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) +static int compat_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct compat_blk_user_trace_setup cbuts; + int ret; + + if (copy_from_user(&cbuts, arg, sizeof(cbuts))) + return -EFAULT; + + buts = (struct blk_user_trace_setup) { + .act_mask = cbuts.act_mask, + .buf_size = cbuts.buf_size, + .buf_nr = cbuts.buf_nr, + .start_lba = cbuts.start_lba, + .end_lba = cbuts.end_lba, + .pid = cbuts.pid, + }; + memcpy(&buts.name, &cbuts.name, 32); + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts.name, 32)) { + blk_trace_remove(q); + return -EFAULT; + } + + return 0; +} +#endif + int blk_trace_startstop(struct request_queue *q, int start) { int ret; @@ -611,6 +645,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) bdevname(bdev, b); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) + case BLKTRACESETUP32: + bdevname(bdev, b); + ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + break; +#endif case BLKTRACESTART: start = 1; case BLKTRACESTOP: @@ -664,10 +704,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); + if (rq->cmd_flags & REQ_DISCARD) + rw |= REQ_DISCARD; + + if (rq->cmd_flags & REQ_SECURE) + rw |= REQ_SECURE; - if (blk_pc_request(rq)) { + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, what, rq->errors, rq->cmd_len, rq->cmd); @@ -678,28 +721,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, } } -static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_abort(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_ABORT); } -static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_insert(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_INSERT); } -static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_issue(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_ISSUE); } -static void blk_add_trace_rq_requeue(struct request_queue *q, +static void blk_add_trace_rq_requeue(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(struct request_queue *q, +static void blk_add_trace_rq_complete(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); @@ -727,34 +775,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); } -static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_bounce(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); } -static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); } -static void blk_add_trace_bio_backmerge(struct request_queue *q, +static void blk_add_trace_bio_backmerge(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); } -static void blk_add_trace_bio_frontmerge(struct request_queue *q, +static void blk_add_trace_bio_frontmerge(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); } -static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_queue(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_QUEUE); } -static void blk_add_trace_getrq(struct request_queue *q, +static void blk_add_trace_getrq(void *ignore, + struct request_queue *q, struct bio *bio, int rw) { if (bio) @@ -768,7 +822,8 @@ static void blk_add_trace_getrq(struct request_queue *q, } -static void blk_add_trace_sleeprq(struct request_queue *q, +static void blk_add_trace_sleeprq(void *ignore, + struct request_queue *q, struct bio *bio, int rw) { if (bio) @@ -782,7 +837,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q, } } -static void blk_add_trace_plug(struct request_queue *q) +static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -790,7 +845,7 @@ static void blk_add_trace_plug(struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(struct request_queue *q) +static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -803,7 +858,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q) } } -static void blk_add_trace_unplug_timer(struct request_queue *q) +static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -816,7 +871,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q) } } -static void blk_add_trace_split(struct request_queue *q, struct bio *bio, +static void blk_add_trace_split(void *ignore, + struct request_queue *q, struct bio *bio, unsigned int pdu) { struct blk_trace *bt = q->blk_trace; @@ -832,6 +888,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, /** * blk_add_trace_remap - Add a trace for a remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio * @dev: target device @@ -842,8 +899,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * it spans a stripe (or similar). Add a trace for that action. * **/ -static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -861,6 +919,39 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, } /** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @ignore: trace callback data parameter (not used) + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(void *ignore, + struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + +/** * blk_add_driver_data - Add binary message with driver-specific data * @q: queue the io is for * @rq: io request @@ -880,7 +971,7 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (blk_pc_request(rq)) + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, rq->errors, len, data); else @@ -893,61 +984,64 @@ static void blk_register_tracepoints(void) { int ret; - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); WARN_ON(ret); - ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); + ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split); + ret = register_trace_block_remap(blk_add_trace_remap, NULL); WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); } static void blk_unregister_tracepoints(void) { - unregister_trace_block_remap(blk_add_trace_remap); - unregister_trace_block_split(blk_add_trace_split); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); - unregister_trace_block_plug(blk_add_trace_plug); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq); - unregister_trace_block_getrq(blk_add_trace_getrq); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete); - unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + unregister_trace_block_remap(blk_add_trace_remap, NULL); + unregister_trace_block_split(blk_add_trace_split, NULL); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); + unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); + unregister_trace_block_getrq(blk_add_trace_getrq, NULL); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); tracepoint_synchronize_unregister(); } @@ -1290,7 +1384,7 @@ out: } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { return print_one_line(iter, false); } @@ -1312,7 +1406,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) } static enum print_line_t -blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +blk_trace_event_print_binary(struct trace_iterator *iter, int flags, + struct trace_event *event) { return blk_trace_synthesize_old_trace(iter) ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; @@ -1350,12 +1445,16 @@ static struct tracer blk_tracer __read_mostly = { .set_flag = blk_tracer_set_flag, }; -static struct trace_event trace_blk_event = { - .type = TRACE_BLK, +static struct trace_event_functions trace_blk_event_funcs = { .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .funcs = &trace_blk_event_funcs, +}; + static int __init init_blk_tracer(void) { if (!register_ftrace_event(&trace_blk_event)) { @@ -1550,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct block_device *bdev; ssize_t ret = -ENXIO; - lock_kernel(); bdev = bdget(part_devt(p)); if (bdev == NULL) - goto out_unlock_kernel; + goto out; q = blk_trace_get_queue(bdev); if (q == NULL) @@ -1581,8 +1679,7 @@ out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); -out_unlock_kernel: - unlock_kernel(); +out: return ret; } @@ -1612,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = -ENXIO; - lock_kernel(); p = dev_to_part(dev); bdev = bdget(part_devt(p)); if (bdev == NULL) - goto out_unlock_kernel; + goto out; q = blk_trace_get_queue(bdev); if (q == NULL) @@ -1651,8 +1747,6 @@ out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); -out_unlock_kernel: - unlock_kernel(); out: return ret ? ret : count; } @@ -1662,6 +1756,11 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +void blk_trace_remove_sysfs(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING @@ -1672,7 +1771,7 @@ void blk_dump_cmd(char *buf, struct request *rq) int len = rq->cmd_len; unsigned char *cmd = rq->cmd; - if (!blk_pc_request(rq)) { + if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { buf[0] = '\0'; return; } @@ -1697,21 +1796,21 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & WRITE) rwbs[i++] = 'W'; - else if (rw & 1 << BIO_RW_DISCARD) + else if (rw & REQ_DISCARD) rwbs[i++] = 'D'; else if (bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (rw & 1 << BIO_RW_AHEAD) + if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & 1 << BIO_RW_BARRIER) - rwbs[i++] = 'B'; - if (rw & 1 << BIO_RW_SYNCIO) + if (rw & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & 1 << BIO_RW_META) + if (rw & REQ_META) rwbs[i++] = 'M'; + if (rw & REQ_SECURE) + rwbs[i++] = 'E'; rwbs[i] = '\0'; } @@ -1721,8 +1820,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq) int rw = rq->cmd_flags & 0x03; int bytes; - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); + if (rq->cmd_flags & REQ_DISCARD) + rw |= REQ_DISCARD; + + if (rq->cmd_flags & REQ_SECURE) + rw |= REQ_SECURE; bytes = blk_rq_bytes(rq); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3718d55fb4c..f3dadae8388 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,12 +22,13 @@ #include <linux/hardirq.h> #include <linux/kthread.h> #include <linux/uaccess.h> -#include <linux/kprobes.h> #include <linux/ftrace.h> #include <linux/sysctl.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/hash.h> +#include <linux/rcupdate.h> #include <trace/events/sched.h> @@ -60,6 +61,13 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -78,18 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +/* + * Traverse the ftrace_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { - struct ftrace_ops *op = ftrace_list; - - /* in case someone actually ports this to alpha! */ - read_barrier_depends(); + struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ while (op != &ftrace_list_end) { - /* silly alpha */ - read_barrier_depends(); op->func(ip, parent_ip); - op = op->next; + op = rcu_dereference_raw(op->next); /*see above*/ }; } @@ -144,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * the ops->next pointer is valid before another CPU sees * the ops pointer included into the ftrace_list. */ - smp_wmb(); - ftrace_list = ops; + rcu_assign_pointer(ftrace_list, ops); if (ftrace_enabled) { ftrace_func_t func; @@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void) if (ftrace_trace_function == ftrace_stub) return; +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST func = ftrace_trace_function; +#else + func = __ftrace_trace_function; +#endif - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -249,6 +264,7 @@ struct ftrace_profile { unsigned long counter; #ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned long long time; + unsigned long long time_squared; #endif }; @@ -291,7 +307,9 @@ function_stat_next(void *v, int idx) pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); again: - rec++; + if (idx != 0) + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { pg = pg->next; if (!pg) @@ -349,9 +367,9 @@ static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " Function " - "Hit Time Avg\n" + "Hit Time Avg s^2\n" " -------- " - "--- ---- ---\n"); + "--- ---- --- ---\n"); #else seq_printf(m, " Function Hit\n" " -------- ---\n"); @@ -363,11 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; + int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static DEFINE_MUTEX(mutex); static struct trace_seq s; unsigned long long avg; + unsigned long long stddev; #endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); @@ -377,17 +403,31 @@ static int function_stat_show(struct seq_file *m, void *v) avg = rec->time; do_div(avg, rec->counter); - mutex_lock(&mutex); + /* Sample standard deviation (s^2) */ + if (rec->counter <= 1) + stddev = 0; + else { + stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Divide only 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide 1000 again. + */ + do_div(stddev, (rec->counter - 1) * 1000); + } + trace_seq_init(&s); trace_print_graph_duration(rec->time, &s); trace_seq_puts(&s, " "); trace_print_graph_duration(avg, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(stddev, &s); trace_print_seq(m, &s); - mutex_unlock(&mutex); #endif seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); - return 0; + return ret; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -633,6 +673,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash || !ftrace_profile_enabled) goto out; + /* If the calltime was zero'd ignore it */ + if (!trace->calltime) + goto out; + calltime = trace->rettime - trace->calltime; if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { @@ -651,8 +695,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) } rec = ftrace_find_profiled_func(stat, trace->func); - if (rec) + if (rec) { rec->time += calltime; + rec->time_squared += calltime * calltime; + } out: local_irq_restore(flags); @@ -734,7 +780,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&ftrace_profile_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -754,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = { .open = tracing_open_generic, .read = ftrace_profile_read, .write = ftrace_profile_write, + .llseek = default_llseek, }; /* used to initialize the real stat files */ @@ -766,7 +813,7 @@ static struct tracer_stat function_stats __initdata = { .stat_show = function_stat_show }; -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; struct dentry *entry; @@ -784,7 +831,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) * The files created are permanent, if something happens * we still do not free memory. */ - kfree(stat); WARN(1, "Could not allocate stat file for cpu %d\n", cpu); @@ -811,13 +857,11 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) } #else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) { } #endif /* CONFIG_FUNCTION_PROFILER */ -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE @@ -841,10 +885,8 @@ enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), }; static int ftrace_filtered; @@ -884,36 +926,6 @@ static struct dyn_ftrace *ftrace_free_records; } \ } -#ifdef CONFIG_KPROBES - -static int frozen_record_count; - -static inline void freeze_record(struct dyn_ftrace *rec) -{ - if (!(rec->flags & FTRACE_FL_FROZEN)) { - rec->flags |= FTRACE_FL_FROZEN; - frozen_record_count++; - } -} - -static inline void unfreeze_record(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_FROZEN) { - rec->flags &= ~FTRACE_FL_FROZEN; - frozen_record_count--; - } -} - -static inline int record_frozen(struct dyn_ftrace *rec) -{ - return rec->flags & FTRACE_FL_FROZEN; -} -#else -# define freeze_record(rec) ({ 0; }) -# define unfreeze_record(rec) ({ 0; }) -# define record_frozen(rec) ({ 0; }) -#endif /* CONFIG_KPROBES */ - static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->freelist = ftrace_free_records; @@ -1011,75 +1023,54 @@ static void ftrace_bug(int failed, unsigned long ip) } +/* Return 1 if the address range is reserved for ftrace */ +int ftrace_text_reserved(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip <= (unsigned long)end && + rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; - unsigned long ip, fl; + unsigned long flag = 0UL; ftrace_addr = (unsigned long)FTRACE_ADDR; - ip = rec->ip; - /* - * If this record is not to be traced and - * it is not enabled then do nothing. + * If this record is not to be traced or we want to disable it, + * then disable it. * - * If this record is not to be traced and - * it is enabled then disable it. + * If we want to enable it and filtering is off, then enable it. * + * If we want to enable it and filtering is on, enable it only if + * it's filtered */ - if (rec->flags & FTRACE_FL_NOTRACE) { - if (rec->flags & FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - return 0; - - } else if (ftrace_filtered && enable) { - /* - * Filtering is on: - */ - - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); - - /* Record is filtered and enabled, do nothing */ - if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) - return 0; - - /* Record is not filtered or enabled, do nothing */ - if (!fl) - return 0; - - /* Record is not filtered but enabled, disable it */ - if (fl == FTRACE_FL_ENABLED) - rec->flags &= ~FTRACE_FL_ENABLED; - else - /* Otherwise record is filtered but not enabled, enable it */ - rec->flags |= FTRACE_FL_ENABLED; - } else { - /* Disable or not filtered */ - - if (enable) { - /* if record is enabled, do nothing */ - if (rec->flags & FTRACE_FL_ENABLED) - return 0; - - rec->flags |= FTRACE_FL_ENABLED; - - } else { + if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { + if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) + flag = FTRACE_FL_ENABLED; + } - /* if record is not enabled, do nothing */ - if (!(rec->flags & FTRACE_FL_ENABLED)) - return 0; + /* If the state of this record hasn't changed, then do nothing */ + if ((rec->flags & FTRACE_FL_ENABLED) == flag) + return 0; - rec->flags &= ~FTRACE_FL_ENABLED; - } + if (flag) { + rec->flags |= FTRACE_FL_ENABLED; + return ftrace_make_call(rec, ftrace_addr); } - if (rec->flags & FTRACE_FL_ENABLED) - return ftrace_make_call(rec, ftrace_addr); - else - return ftrace_make_nop(NULL, rec, ftrace_addr); + rec->flags &= ~FTRACE_FL_ENABLED; + return ftrace_make_nop(NULL, rec, ftrace_addr); } static void ftrace_replace_code(int enable) @@ -1098,25 +1089,12 @@ static void ftrace_replace_code(int enable) !(rec->flags & FTRACE_FL_CONVERTED)) continue; - /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) { - freeze_record(rec); - continue; - } else { - unfreeze_record(rec); - } - failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_free_rec(rec); - } else { - ftrace_bug(failed, rec->ip); - /* Stop processing */ - return; - } + ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; } } while_for_each_ftrace_rec(); } @@ -1247,8 +1225,6 @@ static void ftrace_shutdown(int command) static void ftrace_startup_sysctl(void) { - int command = FTRACE_ENABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; @@ -1256,23 +1232,17 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - command |= FTRACE_ENABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_ENABLE_CALLS); } static void ftrace_shutdown_sysctl(void) { - int command = FTRACE_DISABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ if (ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_DISABLE_CALLS); } static cycle_t ftrace_update_time; @@ -1297,12 +1267,34 @@ static int ftrace_update_code(struct module *mod) ftrace_new_addrs = p->newlist; p->flags = 0L; - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(mod, p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else + /* + * Do the initial record convertion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); + continue; + } + + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up) { + int failed = __ftrace_replace_code(p, 1); + if (failed) { + ftrace_bug(failed, p->ip); + ftrace_free_rec(p); + } + } } stop = ftrace_now(raw_smp_processor_id()); @@ -1358,36 +1350,38 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_CONT = (1 << 1), - FTRACE_ITER_NOTRACE = (1 << 2), - FTRACE_ITER_FAILURES = (1 << 3), - FTRACE_ITER_PRINTALL = (1 << 4), - FTRACE_ITER_HASH = (1 << 5), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_FAILURES = (1 << 2), + FTRACE_ITER_PRINTALL = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - struct ftrace_page *pg; - int hidx; - int idx; - unsigned flags; - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned buffer_idx; - unsigned filtered; + loff_t pos; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + int hidx; + int idx; + unsigned flags; }; static void * -t_hash_next(struct seq_file *m, void *v, loff_t *pos) +t_hash_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct hlist_node *hnd = v; + struct hlist_node *hnd = NULL; struct hlist_head *hhd; - WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); - (*pos)++; + iter->pos = *pos; + if (iter->probe) + hnd = &iter->probe->node; retry: if (iter->hidx >= FTRACE_FUNC_HASHSIZE) return NULL; @@ -1410,35 +1404,51 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) } } - return hnd; + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; } static void *t_hash_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; + if (iter->func_pos > *pos) + return NULL; + + iter->hidx = 0; + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_hash_next(m, &l); + if (!p) + break; + } + if (!p) + return NULL; + + /* Only set this if we have an item */ iter->flags |= FTRACE_ITER_HASH; - return t_hash_next(m, p, pos); + return iter; } -static int t_hash_show(struct seq_file *m, void *v) +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) { struct ftrace_func_probe *rec; - struct hlist_node *hnd = v; - char str[KSYM_SYMBOL_LEN]; - rec = hlist_entry(hnd, struct ftrace_func_probe, node); + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); - - kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); - seq_printf(m, "%s", str); + seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1454,12 +1464,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct dyn_ftrace *rec = NULL; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, v, pos); + return t_hash_next(m, pos); (*pos)++; + iter->pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) - return NULL; + return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -1467,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; - } else { - iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -1490,16 +1499,36 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } - return rec; + if (!rec) + return t_hash_start(m, pos); + + iter->func_pos = *pos; + iter->func = rec; + + return iter; +} + +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; mutex_lock(&ftrace_lock); /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + + /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all * functions are enabled. @@ -1508,26 +1537,35 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; - (*pos)++; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_HASH; return iter; } if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); - if (*pos > 0) { - if (iter->idx < 0) - return p; - (*pos)--; - iter->idx--; + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ + iter->pg = ftrace_pages_start; + iter->idx = 0; + for (l = 0; l <= *pos; ) { + p = t_next(m, p, &l); + if (!p) + break; } - p = t_next(m, p, pos); + if (!p) { + if (iter->flags & FTRACE_ITER_FILTER) + return t_hash_start(m, pos); - if (!p) - return t_hash_start(m, pos); + return NULL; + } - return p; + return iter; } static void t_stop(struct seq_file *m, void *p) @@ -1538,28 +1576,27 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec = v; - char str[KSYM_SYMBOL_LEN]; + struct dyn_ftrace *rec; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, v); + return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; } + rec = iter->func; + if (!rec) return 0; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%ps\n", (void *)rec->ip); return 0; } -static struct seq_operations show_ftrace_seq_ops = { +static const struct seq_operations show_ftrace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -1593,17 +1630,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } -int ftrace_avail_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_iterator *iter = m->private; - - seq_release(inode, file); - kfree(iter); - - return 0; -} - static int ftrace_failures_open(struct inode *inode, struct file *file) { @@ -1613,8 +1639,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) ret = ftrace_avail_open(inode, file); if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; + m = file->private_data; + iter = m->private; iter->flags = FTRACE_ITER_FAILURES; } @@ -1652,9 +1678,14 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!iter) return -ENOMEM; + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { @@ -1666,8 +1697,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!ret) { struct seq_file *m = file->private_data; m->private = iter; - } else + } else { + trace_parser_put(&iter->parser); kfree(iter); + } } else file->private_data = iter; mutex_unlock(&ftrace_regex_lock); @@ -1700,64 +1733,10 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) return ret; } -enum { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -/* - * (static function - no need for kernel doc) - * - * Pass in a buffer containing a glob and this function will - * set search to point to the search part of the buffer and - * return the type of search it is (see enum above). - * This does modify buff. - * - * Returns enum type. - * search returns the pointer to use for comparison. - * not returns 1 if buff started with a '!' - * 0 otherwise. - */ -static int -ftrace_setup_glob(char *buff, int len, char **search, int *not) -{ - int type = MATCH_FULL; - int i; - - if (buff[0] == '!') { - *not = 1; - buff++; - len--; - } else - *not = 0; - - *search = buff; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - *search = buff + 1; - type = MATCH_END_ONLY; - } else { - if (type == MATCH_END_ONLY) - type = MATCH_MIDDLE_ONLY; - else - type = MATCH_FRONT_ONLY; - buff[i] = 0; - break; - } - } - } - - return type; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; - char *ptr; + int slen; switch (type) { case MATCH_FULL: @@ -1773,8 +1752,8 @@ static int ftrace_match(char *str, char *regex, int len, int type) matched = 1; break; case MATCH_END_ONLY: - ptr = strstr(str, regex); - if (ptr && (ptr[len] == 0)) + slen = strlen(str); + if (slen >= len && memcmp(str + slen - len, regex, len) == 0) matched = 1; break; } @@ -1791,7 +1770,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) return ftrace_match(str, regex, len, type); } -static void ftrace_match_records(char *buff, int len, int enable) +static int ftrace_match_records(char *buff, int len, int enable) { unsigned int search_len; struct ftrace_page *pg; @@ -1800,9 +1779,10 @@ static void ftrace_match_records(char *buff, int len, int enable) char *search; int type; int not; + int found = 0; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = ftrace_setup_glob(buff, len, &search, ¬); + type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); @@ -1817,6 +1797,7 @@ static void ftrace_match_records(char *buff, int len, int enable) rec->flags &= ~flag; else rec->flags |= flag; + found = 1; } /* * Only enable filtering if we have a function that @@ -1826,6 +1807,8 @@ static void ftrace_match_records(char *buff, int len, int enable) ftrace_filtered = 1; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); + + return found; } static int @@ -1847,7 +1830,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, return 1; } -static void ftrace_match_module_records(char *buff, char *mod, int enable) +static int ftrace_match_module_records(char *buff, char *mod, int enable) { unsigned search_len = 0; struct ftrace_page *pg; @@ -1856,6 +1839,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) char *search = buff; unsigned long flag; int not = 0; + int found = 0; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; @@ -1870,7 +1854,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) } if (strlen(buff)) { - type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + type = filter_parse_regex(buff, strlen(buff), &search, ¬); search_len = strlen(search); } @@ -1886,12 +1870,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) rec->flags &= ~flag; else rec->flags |= flag; + found = 1; } if (enable && (rec->flags & FTRACE_FL_FILTER)) ftrace_filtered = 1; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); + + return found; } /* @@ -1920,8 +1907,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return -EINVAL; - ftrace_match_module_records(func, mod, enable); - return 0; + if (ftrace_match_module_records(func, mod, enable)) + return 0; + return -EINVAL; } static struct ftrace_func_command ftrace_mod_cmd = { @@ -1942,7 +1930,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) struct hlist_head *hhd; struct hlist_node *n; unsigned long key; - int resched; key = hash_long(ip, FTRACE_HASH_BITS); @@ -1956,12 +1943,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) * period. This syncs the hash iteration and freeing of items * on the hash. rcu_read_lock is too dangerous here. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); hlist_for_each_entry_rcu(entry, n, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_probe_ops __read_mostly = @@ -2035,7 +2022,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int count = 0; char *search; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2107,12 +2094,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int i, len = 0; char *search; - if (glob && (strcmp(glob, "*") || !strlen(glob))) + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; - else { + else if (glob) { int not; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2218,8 +2205,9 @@ static int ftrace_process_regex(char *buff, int len, int enable) func = strsep(&next, ":"); if (!next) { - ftrace_match_records(func, len, enable); - return 0; + if (ftrace_match_records(func, len, enable)) + return 0; + return ret; } /* command found */ @@ -2244,11 +2232,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; - char ch; - size_t read = 0; - ssize_t ret; + struct trace_parser *parser; + ssize_t ret, read; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&ftrace_regex_lock); @@ -2259,66 +2246,20 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } else iter = file->private_data; - if (!*ppos) { - iter->flags &= ~FTRACE_ITER_CONT; - iter->buffer_idx = 0; - } - - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - - if (!(iter->flags & ~FTRACE_ITER_CONT)) { - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); - if (isspace(ch)) { - file->f_pos += read; - ret = read; - goto out; - } - - iter->buffer_idx = 0; - } - - while (cnt && !isspace(ch)) { - if (iter->buffer_idx < FTRACE_BUFF_MAX) - iter->buffer[iter->buffer_idx++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(parser->buffer, + parser->idx, enable); + trace_parser_clear(parser); if (ret) - goto out; - read++; - cnt--; + goto out_unlock; } - if (isspace(ch)) { - iter->filtered++; - iter->buffer[iter->buffer_idx] = 0; - ret = ftrace_process_regex(iter->buffer, - iter->buffer_idx, enable); - if (ret) - goto out; - iter->buffer_idx = 0; - } else - iter->flags |= FTRACE_ITER_CONT; - - - file->f_pos += read; - ret = read; - out: +out_unlock: mutex_unlock(&ftrace_regex_lock); return ret; @@ -2402,6 +2343,34 @@ static int __init set_ftrace_filter(char *str) } __setup("ftrace_filter=", set_ftrace_filter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); + +static int __init set_graph_function(char *str) +{ + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static void __init set_ftrace_early_filter(char *buf, int enable) { char *func; @@ -2418,6 +2387,10 @@ static void __init set_ftrace_early_filters(void) set_ftrace_early_filter(ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) set_ftrace_early_filter(ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } static int @@ -2425,6 +2398,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct trace_parser *parser; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2434,10 +2408,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) } else iter = file->private_data; - if (iter->buffer_idx) { - iter->filtered++; - iter->buffer[iter->buffer_idx] = 0; - ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + parser->buffer[parser->idx] = 0; + ftrace_match_records(parser->buffer, parser->idx, enable); } mutex_lock(&ftrace_lock); @@ -2445,7 +2419,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_lock); + trace_parser_put(parser); kfree(iter); + mutex_unlock(&ftrace_regex_lock); return 0; } @@ -2466,14 +2442,14 @@ static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, .llseek = seq_lseek, - .release = ftrace_avail_release, + .release = seq_release_private, }; static const struct file_operations ftrace_failures_fops = { .open = ftrace_failures_open, .read = seq_read, .llseek = seq_lseek, - .release = ftrace_avail_release, + .release = seq_release_private, }; static const struct file_operations ftrace_filter_fops = { @@ -2497,35 +2473,33 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; +int ftrace_graph_filter_enabled; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * -g_next(struct seq_file *m, void *v, loff_t *pos) +__g_next(struct seq_file *m, loff_t *pos) { - unsigned long *array = m->private; - int index = *pos; - - (*pos)++; - - if (index >= ftrace_graph_count) + if (*pos >= ftrace_graph_count) return NULL; + return &ftrace_graph_funcs[*pos]; +} - return &array[index]; +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __g_next(m, pos); } static void *g_start(struct seq_file *m, loff_t *pos) { - void *p = NULL; - mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_count && !*pos) + if (!ftrace_graph_filter_enabled && !*pos) return (void *)1; - p = g_next(m, p, pos); - - return p; + return __g_next(m, pos); } static void g_stop(struct seq_file *m, void *p) @@ -2536,7 +2510,6 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, void *v) { unsigned long *ptr = v; - char str[KSYM_SYMBOL_LEN]; if (!ptr) return 0; @@ -2546,14 +2519,12 @@ static int g_show(struct seq_file *m, void *v) return 0; } - kallsyms_lookup(*ptr, NULL, NULL, NULL, str); - - seq_printf(m, "%s\n", str); + seq_printf(m, "%ps\n", (void *)*ptr); return 0; } -static struct seq_operations ftrace_graph_seq_ops = { +static const struct seq_operations ftrace_graph_seq_ops = { .start = g_start, .next = g_next, .stop = g_stop, @@ -2570,31 +2541,34 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { + ftrace_graph_filter_enabled = 0; ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } + mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) { + if (file->f_mode & FMODE_READ) ret = seq_open(file, &ftrace_graph_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = ftrace_graph_funcs; - } - } else - file->private_data = ftrace_graph_funcs; - mutex_unlock(&graph_lock); return ret; } static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer) { struct dyn_ftrace *rec; struct ftrace_page *pg; int search_len; - int found = 0; + int fail = 1; int type, not; char *search; bool exists; @@ -2604,122 +2578,99 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) return -ENODEV; /* decode regex */ - type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); - if (not) - return -EINVAL; + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); + if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + return -EBUSY; search_len = strlen(search); mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) - break; - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) continue; if (ftrace_match_record(rec, search, search_len, type)) { - /* ensure it is not already in the array */ + /* if it is in the array */ exists = false; - for (i = 0; i < *idx; i++) + for (i = 0; i < *idx; i++) { if (array[i] == rec->ip) { exists = true; break; } - if (!exists) { - array[(*idx)++] = rec->ip; - found = 1; + } + + if (!not) { + fail = 0; + if (!exists) { + array[(*idx)++] = rec->ip; + if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + goto out; + } + } else { + if (exists) { + array[i] = array[--(*idx)]; + array[*idx] = 0; + fail = 0; + } } } } while_for_each_ftrace_rec(); - +out: mutex_unlock(&ftrace_lock); - return found ? 0 : -EINVAL; + if (fail) + return -EINVAL; + + ftrace_graph_filter_enabled = 1; + return 0; } static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned long *array; - size_t read = 0; - ssize_t ret; - int index = 0; - char ch; + struct trace_parser parser; + ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&graph_lock); - if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { - ret = -EBUSY; - goto out; + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { + ret = -ENOMEM; + goto out_unlock; } - if (file->f_mode & FMODE_READ) { - struct seq_file *m = file->private_data; - array = m->private; - } else - array = file->private_data; + read = trace_get_user(&parser, ubuf, cnt, ppos); - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; + if (read >= 0 && trace_parser_loaded((&parser))) { + parser.buffer[parser.idx] = 0; - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + parser.buffer); if (ret) - goto out; - read++; - cnt--; - } - - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } - - while (cnt && !isspace(ch)) { - if (index < FTRACE_BUFF_MAX) - buffer[index++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; + goto out_free; } - buffer[index] = 0; - - /* we allow only one expression at a time */ - ret = ftrace_set_func(array, &ftrace_graph_count, buffer); - if (ret) - goto out; - - file->f_pos += read; ret = read; - out: + +out_free: + trace_parser_put(&parser); +out_unlock: mutex_unlock(&graph_lock); return ret; } static const struct file_operations ftrace_graph_fops = { - .open = ftrace_graph_open, - .read = seq_read, - .write = ftrace_graph_write, + .open = ftrace_graph_open, + .read = seq_read, + .write = ftrace_graph_write, + .release = ftrace_graph_release, + .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -2747,7 +2698,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static int ftrace_convert_nops(struct module *mod, +static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { @@ -2780,19 +2731,17 @@ static int ftrace_convert_nops(struct module *mod, } #ifdef CONFIG_MODULES -void ftrace_release(void *start, void *end) +void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = (unsigned long)end; - if (ftrace_disabled || !start || start == end) + if (ftrace_disabled) return; mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { + if (within_module_core(rec->ip, mod)) { /* * rec->ip is changed in ftrace_free_rec() * It should not between s and e if record was freed. @@ -2809,7 +2758,7 @@ static void ftrace_init_module(struct module *mod, { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(mod, start, end); + ftrace_process_locs(mod, start, end); } static int ftrace_module_notify(struct notifier_block *self, @@ -2824,9 +2773,7 @@ static int ftrace_module_notify(struct notifier_block *self, mod->num_ftrace_callsites); break; case MODULE_STATE_GOING: - ftrace_release(mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); + ftrace_release_mod(mod); break; } @@ -2872,7 +2819,7 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(NULL, + ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); @@ -2905,23 +2852,6 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ -static ssize_t -ftrace_pid_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - if (ftrace_pid_trace == ftrace_swapper_pid) - r = sprintf(buf, "swapper tasks\n"); - else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); - else - r = sprintf(buf, "no pid\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -2972,14 +2902,12 @@ static void set_ftrace_pid(struct pid *pid) rcu_read_unlock(); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_pid_task(struct pid *pid) { - if (*pid == ftrace_swapper_pid) + if (pid == ftrace_swapper_pid) clear_ftrace_swapper(); else - clear_ftrace_pid(*pid); - - *pid = NULL; + clear_ftrace_pid(pid); } static void set_ftrace_pid_task(struct pid *pid) @@ -2990,74 +2918,184 @@ static void set_ftrace_pid_task(struct pid *pid) set_ftrace_pid(pid); } -static ssize_t -ftrace_pid_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int ftrace_pid_add(int p) { struct pid *pid; - char buf[64]; - long val; - int ret; + struct ftrace_pid *fpid; + int ret = -EINVAL; - if (cnt >= sizeof(buf)) - return -EINVAL; + mutex_lock(&ftrace_lock); - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); - buf[cnt] = 0; + if (!pid) + goto out; - ret = strict_strtol(buf, 10, &val); - if (ret < 0) - return ret; + ret = 0; - mutex_lock(&ftrace_lock); - if (val < 0) { - /* disable pid tracing */ - if (!ftrace_pid_trace) - goto out; + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; - clear_ftrace_pid_task(&ftrace_pid_trace); + ret = -ENOMEM; - } else { - /* swapper task is special */ - if (!val) { - pid = ftrace_swapper_pid; - if (pid == ftrace_pid_trace) - goto out; - } else { - pid = find_get_pid(val); + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; - } - } + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; - if (ftrace_pid_trace) - clear_ftrace_pid_task(&ftrace_pid_trace); + set_ftrace_pid_task(pid); - if (!pid) - goto out; + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); + return 0; + +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; - ftrace_pid_trace = pid; + clear_ftrace_pid_task(pid); - set_ftrace_pid_task(ftrace_pid_trace); + list_del(&fpid->list); + kfree(fpid); } - /* update the function call */ ftrace_update_pid_func(); ftrace_startup_enable(0); - out: mutex_unlock(&ftrace_lock); +} - return cnt; +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_printf(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_printf(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64], *tmp; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + tmp = strstrip(buf); + if (strlen(tmp) == 0) + return 1; + + ret = strict_strtol(tmp, 10, &val); + if (ret < 0) + return ret; + + ret = ftrace_pid_add(val); + + return ret ? ret : cnt; +} + +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; } static const struct file_operations ftrace_pid_fops = { - .read = ftrace_pid_read, - .write = ftrace_pid_write, + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_pid_release, }; static __init int ftrace_init_debugfs(void) @@ -3140,7 +3178,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3150,12 +3188,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = ftrace_enabled; + last_ftrace_enabled = !!ftrace_enabled; if (ftrace_enabled) { @@ -3243,8 +3281,8 @@ free: } static void -ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, - struct task_struct *next) +ftrace_graph_probe_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; int index; @@ -3298,7 +3336,7 @@ static int start_graph_tracing(void) } while (ret == -EAGAIN); if (!ret) { - ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); if (ret) pr_info("ftrace_graph: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); @@ -3370,11 +3408,11 @@ void unregister_ftrace_graph(void) goto out; ftrace_graph_active--; - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); out: mutex_unlock(&ftrace_lock); @@ -3385,6 +3423,7 @@ void ftrace_graph_init_task(struct task_struct *t) { /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; + t->curr_ret_stack = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; @@ -3394,7 +3433,6 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; @@ -3420,4 +3458,3 @@ void ftrace_graph_stop(void) ftrace_stop(); } #endif - diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index 1edaa9516e8..00000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null @@ -1,468 +0,0 @@ -/* - * Memory allocator tracing - * - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - */ - -#include <linux/tracepoint.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/dcache.h> -#include <linux/fs.h> - -#include <linux/kmemtrace.h> - -#include "trace_output.h" -#include "trace.h" - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_KMEM_OPT_MINIMAL 0x1 - -static struct tracer_opt kmem_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, - { } -}; - -static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts -}; - -static struct trace_array *kmemtrace_array; - -/* Trace allocations */ -static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ftrace_event_call *call = &event_kmem_alloc; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_alloc_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static inline void kmemtrace_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ftrace_event_call *call = &event_kmem_free; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_free_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static void kmemtrace_kmalloc(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmem_cache_alloc(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmalloc_node(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kfree(unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); -} - -static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); -} - -static int kmemtrace_start_probes(void) -{ - int err; - - err = register_trace_kmalloc(kmemtrace_kmalloc); - if (err) - return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); - if (err) - return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); - if (err) - return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); - if (err) - return err; - err = register_trace_kfree(kmemtrace_kfree); - if (err) - return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - unregister_trace_kmalloc(kmemtrace_kmalloc); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); - unregister_trace_kfree(kmemtrace_kfree); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); -} - -static int kmem_trace_init(struct trace_array *tr) -{ - int cpu; - kmemtrace_array = tr; - - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); - - kmemtrace_start_probes(); - - return 0; -} - -static void kmem_trace_reset(struct trace_array *tr) -{ - kmemtrace_stop_probes(); -} - -static void kmemtrace_headers(struct seq_file *s) -{ - /* Don't need headers for the original kmemtrace output */ - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return; - - seq_printf(s, "#\n"); - seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " - " POINTER NODE CALLER\n"); - seq_printf(s, "# FREE | | | | " - " | | | |\n"); - seq_printf(s, "# |\n\n"); -} - -/* - * The following functions give the original output from kmemtrace, - * plus the origin CPU, since reordering occurs in-kernel now. - */ - -#define KMEMTRACE_USER_ALLOC 0 -#define KMEMTRACE_USER_FREE 1 - -struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; -}; - -struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; -}; - -static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) -{ - struct kmemtrace_user_event_alloc *ev_alloc; - struct trace_seq *s = &iter->seq; - struct kmemtrace_user_event *ev; - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); - if (!ev_alloc) - return TRACE_TYPE_PARTIAL_LINE; - - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_user_event *ev; - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - return TRACE_TYPE_HANDLED; -} - -/* The two other following provide a more minimalistic output */ -static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) -{ - struct trace_seq *s = &iter->seq; - int ret; - - /* Alloc entry */ - ret = trace_seq_printf(s, " + "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Requested */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Allocated */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Flags - * TODO: would be better to see the name of the GFP flag names - */ - ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Node */ - ret = trace_seq_printf(s, "%4d ", entry->node); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) -{ - struct trace_seq *s = &iter->seq; - int ret; - - /* Free entry */ - ret = trace_seq_printf(s, " - "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip requested/allocated/flags */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip node */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Call site */ - ret = seq_print_ip_sym(s, entry->call_site, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - if (!trace_seq_printf(s, "\n")) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - switch (entry->type) { - case TRACE_KMEM_ALLOC: { - struct kmemtrace_alloc_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_alloc_compress(iter, field); - else - return kmemtrace_print_alloc_user(iter, field); - } - - case TRACE_KMEM_FREE: { - struct kmemtrace_free_entry *field; - - trace_assign_type(field, entry); - if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) - return kmemtrace_print_free_compress(iter, field); - else - return kmemtrace_print_free_user(iter, field); - } - - default: - return TRACE_TYPE_UNHANDLED; - } -} - -static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags -}; - -void kmemtrace_init(void) -{ - /* earliest opportunity to start kmem tracing */ -} - -static int __init init_kmem_tracer(void) -{ - return register_tracer(&kmem_tracer); -} -device_initcall(init_kmem_tracer); diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 00000000000..f55fcf61b22 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/power.h> + +#ifdef EVENT_POWER_TRACING_DEPRECATED +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +#endif +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); + diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04dac263825..bd1c35a4fbc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -14,12 +14,14 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/list.h> #include <linux/cpu.h> #include <linux/fs.h> +#include <asm/local.h> #include "trace.h" /* @@ -201,13 +203,19 @@ int tracing_is_on(void) } EXPORT_SYMBOL_GPL(tracing_is_on); -#include "trace.h" - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -216,19 +224,17 @@ enum { RB_LEN_TIME_STAMP = 16, }; -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type_len == RINGBUF_TYPE_PADDING - && event->time_delta == 0; -} +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) -static inline int rb_discarded_event(struct ring_buffer_event *event) +static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { + /* padding has a NULL time_delta */ event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -245,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event) return length + RB_EVNT_HDR_SIZE; } -/* inline for ring buffer fast paths */ -static unsigned +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { @@ -271,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event) return 0; } +/* + * Return total length of time extend and data, + * or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - unsigned length = rb_event_length(event); + unsigned length; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; @@ -291,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) @@ -316,20 +356,49 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/* Flag when events were overwritten */ +#define RB_MISSED_EVENTS (1 << 31) +/* Missed count stored at end */ +#define RB_MISSED_STORED (1 << 30) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ unsigned char data[]; /* data of buffer page */ }; +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ struct buffer_page { struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ + unsigned long real_end; /* real end of data */ struct buffer_data_page *page; /* Actual data page */ }; +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. One is + * the actual counter for the write position or count on the page. + * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is 20 bits, and the state data is 12. + */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + static void rb_init_page(struct buffer_data_page *bpage) { local_set(&bpage->commit, 0); @@ -372,27 +441,33 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -/* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) - int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; int ret; ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\n", - (unsigned int)sizeof(field.time_stamp)); + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + ret = trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit)); + 1, + (unsigned int)is_signed_type(long)); ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE); + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); return ret; } @@ -402,25 +477,26 @@ int ring_buffer_print_page_header(struct trace_seq *s) */ struct ring_buffer_per_cpu { int cpu; + atomic_t record_disabled; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ - raw_spinlock_t lock; + spinlock_t reader_lock; /* serialize readers */ + arch_spinlock_t lock; struct lock_class_key lock_key; - struct list_head pages; + struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; - unsigned long nmi_dropped; - unsigned long commit_overrun; - unsigned long overrun; - unsigned long read; + unsigned long lost_events; + unsigned long last_overrun; + local_t commit_overrun; + local_t overrun; local_t entries; local_t committing; local_t commits; + unsigned long read; u64 write_stamp; u64 read_stamp; - atomic_t record_disabled; }; struct ring_buffer { @@ -446,24 +522,31 @@ struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; u64 read_stamp; }; /* buffer may be either ring_buffer or ring_buffer_per_cpu */ -#define RB_WARN_ON(buffer, cond) \ - ({ \ - int _____ret = unlikely(cond); \ - if (_____ret) { \ - atomic_inc(&buffer->record_disabled); \ - WARN_ON(1); \ - } \ - _____ret; \ +#define RB_WARN_ON(b, cond) \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ + if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ + struct ring_buffer_per_cpu *__b = \ + (void *)b; \ + atomic_inc(&__b->buffer->record_disabled); \ + } else \ + atomic_inc(&b->record_disabled); \ + WARN_ON(1); \ + } \ + _____ret; \ }) /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +static inline u64 rb_time_stamp(struct ring_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; @@ -474,7 +557,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) u64 time; preempt_disable_notrace(); - time = rb_time_stamp(buffer, cpu); + time = rb_time_stamp(buffer); preempt_enable_no_resched_notrace(); return time; @@ -489,6 +572,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. + * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. But only + * temporarially. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - remove any bit + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the given page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero. + */ +static int inline +rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static int rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - set a list_head to be pointing to head. + */ +static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. + */ + rb_set_list_to_head(cpu_buffer, head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; +} + +/* + * rb_head_page_dactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found. */ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + /* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(cpu_buffer, &page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static int rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + unsigned long ret; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + ret = cmpxchg(ptr, val, (unsigned long)&new->list); + + return ret == val; +} + +/* + * rb_tail_page_update - move the tail page forward + * + * Returns 1 if moved tail page, 0 if someone else did. + */ +static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *old_tail; + unsigned long old_entries; + unsigned long old_write; + int ret = 0; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. + */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. + */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + /* Zero the write counter */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. + */ + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + old_tail = cmpxchg(&cpu_buffer->tail_page, + tail_page, next_page); + + if (old_tail == tail_page) + ret = 1; + } + + return ret; +} + +static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; + + if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) + return 1; + + return 0; +} + +/** + * rb_check_list - make sure a pointer to a list has the last bits zero + */ +static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) + return 1; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) + return 1; + return 0; +} + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -498,14 +965,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + rb_head_page_deactivate(cpu_buffer); + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) return -1; if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) return -1; + if (rb_check_list(cpu_buffer, head)) + return -1; + list_for_each_entry_safe(bpage, tmp, head, list) { if (RB_WARN_ON(cpu_buffer, bpage->list.next->prev != &bpage->list)) @@ -513,25 +985,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, bpage->list.prev->next != &bpage->list)) return -1; + if (rb_check_list(cpu_buffer, &bpage->list)) + return -1; } + rb_head_page_activate(cpu_buffer); + return 0; } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) { - struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; unsigned long addr; LIST_HEAD(pages); unsigned i; + WARN_ON(!nr_pages); + for (i = 0; i < nr_pages; i++) { bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, &pages); addr = __get_free_page(GFP_KERNEL); @@ -541,7 +1021,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } - list_splice(&pages, head); + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. + */ + cpu_buffer->pages = pages.next; + list_del(&pages); rb_check_pages(cpu_buffer); @@ -572,14 +1058,15 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); - cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - INIT_LIST_HEAD(&cpu_buffer->pages); + cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) goto fail_free_buffer; + rb_check_bpage(cpu_buffer, bpage); + cpu_buffer->reader_page = bpage; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -594,9 +1081,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) goto fail_free_reader; cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + rb_head_page_activate(cpu_buffer); + return cpu_buffer; fail_free_reader: @@ -609,15 +1098,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = &cpu_buffer->pages; + struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; free_buffer_page(cpu_buffer->reader_page); - list_for_each_entry_safe(bpage, tmp, head, list) { - list_del_init(&bpage->list); + rb_head_page_deactivate(cpu_buffer); + + if (head) { + list_for_each_entry_safe(bpage, tmp, head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); free_buffer_page(bpage); } + kfree(cpu_buffer); } @@ -735,6 +1231,7 @@ ring_buffer_free(struct ring_buffer *buffer) put_online_cpus(); + kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); kfree(buffer); @@ -756,26 +1253,25 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) struct list_head *p; unsigned i; - atomic_inc(&cpu_buffer->record_disabled); - synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) - return; - p = cpu_buffer->pages.next; + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) + goto out; + p = cpu_buffer->pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); free_buffer_page(bpage); } - if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) - return; + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) + goto out; rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); - atomic_dec(&cpu_buffer->record_disabled); - +out: + spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -786,22 +1282,22 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - atomic_inc(&cpu_buffer->record_disabled); - synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - return; + goto out; p = pages->next; bpage = list_entry(p, struct buffer_page, list); list_del_init(&bpage->list); - list_add_tail(&bpage->list, &cpu_buffer->pages); + list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); - atomic_dec(&cpu_buffer->record_disabled); +out: + spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -809,11 +1305,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, * @buffer: the buffer to resize. * @size: the new size. * - * The tracer is responsible for making sure that the buffer is - * not being used while changing the size. - * Note: We may be able to change the above requirement by using - * RCU synchronizations. - * * Minimum size is 2 * BUF_PAGE_SIZE. * * Returns -1 on failure. @@ -845,6 +1336,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size == buffer_size) return size; + atomic_inc(&buffer->record_disabled); + + /* Make sure all writers are done with this buffer. */ + synchronize_sched(); + mutex_lock(&buffer->mutex); get_online_cpus(); @@ -907,6 +1403,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); + return size; free_pages: @@ -916,6 +1414,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); return -ENOMEM; /* @@ -925,6 +1424,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) out_fail: put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); return -1; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -948,21 +1448,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) } static inline struct ring_buffer_event * -rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) -{ - return __rb_page_index(cpu_buffer->head_page, - cpu_buffer->head_page->read); -} - -static inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned rb_page_write(struct buffer_page *bpage) +static inline unsigned long rb_page_write(struct buffer_page *bpage) { - return local_read(&bpage->write); + return local_read(&bpage->write) & RB_WRITE_MASK; } static inline unsigned rb_page_commit(struct buffer_page *bpage) @@ -970,6 +1463,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage) return local_read(&bpage->page->commit); } +static inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + /* Size is determined by what has been commited */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -982,22 +1480,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->commit_page); } -static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) -{ - return rb_page_commit(cpu_buffer->head_page); -} - -static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page **bpage) -{ - struct list_head *p = (*bpage)->list.next; - - if (p == &cpu_buffer->pages) - p = p->next; - - *bpage = list_entry(p, struct buffer_page, list); -} - static inline unsigned rb_event_index(struct ring_buffer_event *event) { @@ -1023,6 +1505,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { + unsigned long max_count; + /* * We only race with interrupts and NMIs on this CPU. * If we own the commit event, then we can commit @@ -1032,9 +1516,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ again: + max_count = cpu_buffer->buffer->pages * 100; + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); cpu_buffer->write_stamp = cpu_buffer->commit_page->page->time_stamp; @@ -1043,8 +1534,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) } while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { - cpu_buffer->commit_page->page->commit = - cpu_buffer->commit_page->write; + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); barrier(); } @@ -1077,7 +1572,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * to the head page instead of next. */ if (iter->head_page == cpu_buffer->reader_page) - iter->head_page = cpu_buffer->head_page; + iter->head_page = rb_set_head_page(cpu_buffer); else rb_inc_page(cpu_buffer, &iter->head_page); @@ -1085,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + /** * ring_buffer_update_event - update event type and data * @event: the even to update @@ -1097,28 +1611,188 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * data field. */ static void -rb_update_event(struct ring_buffer_event *event, - unsigned type, unsigned length) +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, unsigned length, + int add_timestamp, u64 delta) { - event->type_len = type; + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; + + /* + * If we need to add a timestamp, then we + * add it to the start of the resevered space. + */ + if (unlikely(add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; + } + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} + +/* + * rb_handle_head_page - writer hit the head page + * + * Returns: +1 to retry page + * 0 to continue + * -1 on error + */ +static int +rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *new_head; + int entries; + int type; + int ret; + + entries = rb_page_entries(next_page); + + /* + * The hard part is here. We need to move the head + * forward, and protect against both readers on + * other CPUs and writers coming in via interrupts. + */ + type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, + RB_PAGE_HEAD); + + /* + * type can be one of four: + * NORMAL - an interrupt already moved it for us + * HEAD - we are the first to get here. + * UPDATE - we are the interrupt interrupting + * a current move. + * MOVED - a reader on another CPU moved the next + * pointer to its reader page. Give up + * and try again. + */ switch (type) { + case RB_PAGE_HEAD: + /* + * We changed the head to UPDATE, thus + * it is our responsibility to update + * the counters. + */ + local_add(entries, &cpu_buffer->overrun); - case RINGBUF_TYPE_PADDING: - case RINGBUF_TYPE_TIME_EXTEND: - case RINGBUF_TYPE_TIME_STAMP: + /* + * The entries will be zeroed out when we move the + * tail page. + */ + + /* still more to do */ break; - case 0: - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) - event->array[0] = length; - else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); + case RB_PAGE_UPDATE: + /* + * This is an interrupt that interrupt the + * previous update. Still more to do. + */ break; + case RB_PAGE_NORMAL: + /* + * An interrupt came in before the update + * and processed this for us. + * Nothing left to do. + */ + return 1; + case RB_PAGE_MOVED: + /* + * The reader is on another CPU and just did + * a swap with our next_page. + * Try again. + */ + return 1; default: - BUG(); + RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ + return -1; + } + + /* + * Now that we are here, the old head pointer is + * set to UPDATE. This will keep the reader from + * swapping the head page with the reader page. + * The reader (on another CPU) will spin till + * we are finished. + * + * We just need to protect against interrupts + * doing the job. We will set the next pointer + * to HEAD. After that, we set the old pointer + * to NORMAL, but only if it was HEAD before. + * otherwise we are an interrupt, and only + * want the outer most commit to reset it. + */ + new_head = next_page; + rb_inc_page(cpu_buffer, &new_head); + + ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, + RB_PAGE_NORMAL); + + /* + * Valid returns are: + * HEAD - an interrupt came in and already set it. + * NORMAL - One of two things: + * 1) We really set it. + * 2) A bunch of interrupts came in and moved + * the page forward again. + */ + switch (ret) { + case RB_PAGE_HEAD: + case RB_PAGE_NORMAL: + /* OK */ + break; + default: + RB_WARN_ON(cpu_buffer, 1); + return -1; } + + /* + * It is possible that an interrupt came in, + * set the head up, then more interrupts came in + * and moved it again. When we get back here, + * the page would have been set to NORMAL but we + * just set it back to HEAD. + * + * How do you detect this? Well, if that happened + * the tail page would have moved. + */ + if (ret == RB_PAGE_NORMAL) { + /* + * If the tail had moved passed next, then we need + * to reset the pointer. + */ + if (cpu_buffer->tail_page != tail_page && + cpu_buffer->tail_page != next_page) + rb_head_page_set_normal(cpu_buffer, new_head, + next_page, + RB_PAGE_HEAD); + } + + /* + * If this was the outer most commit (the one that + * changed the original pointer from HEAD to UPDATE), + * then it is up to us to reset it to NORMAL. + */ + if (type == RB_PAGE_HEAD) { + ret = rb_head_page_set_normal(cpu_buffer, next_page, + tail_page, + RB_PAGE_UPDATE); + if (RB_WARN_ON(cpu_buffer, + ret != RB_PAGE_UPDATE)) + return -1; + } + + return 0; } static unsigned rb_calculate_event_length(unsigned length) @@ -1129,11 +1803,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length = 1; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); return length; } @@ -1150,6 +1824,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * must fill the old tail_page with padding. */ if (tail >= BUF_PAGE_SIZE) { + /* + * If the page was filled, then we still need + * to update the real_end. Reset it to zero + * and the reader will ignore it. + */ + if (tail == BUF_PAGE_SIZE) + tail_page->real_end = 0; + local_sub(length, &tail_page->write); return; } @@ -1158,6 +1840,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, kmemcheck_annotate_bitfield(event, bitfield); /* + * Save the original length to the meta data. + * This will be used by the reader to add lost event + * counter. + */ + tail_page->real_end = tail; + + /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the * write counter enough to allow another writer to slip @@ -1184,111 +1873,108 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; - /* Account for this as an entry */ - local_inc(&tail_page->entries); - local_inc(&cpu_buffer->entries); /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; local_sub(length, &tail_page->write); } -static struct ring_buffer_event * +/* + * This is the slow path, force gcc not to inline it. + */ +static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *commit_page, - struct buffer_page *tail_page, u64 *ts) + struct buffer_page *tail_page, u64 ts) { - struct buffer_page *next_page, *head_page, *reader_page; + struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; - bool lock_taken = false; - unsigned long flags; + struct buffer_page *next_page; + int ret; next_page = tail_page; - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; - rb_inc_page(cpu_buffer, &next_page); - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; - - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; - /* * If for some reason, we had an interrupt storm that made * it all the way around the buffer, bail, and warn * about it. */ if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; + local_inc(&cpu_buffer->commit_overrun); goto out_reset; } - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); + /* + * This is where the fun begins! + * + * We are fighting against races between a reader that + * could be on another CPU trying to swap its reader + * page with the buffer head. + * + * We are also fighting against interrupts coming in and + * moving the head or tail on us as well. + * + * If the next page is the head page then we have filled + * the buffer, unless the commit page is still on the + * reader page. + */ + if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; + /* + * If the commit is not on the reader page, then + * move the header page. + */ + if (!rb_is_reader_page(cpu_buffer->commit_page)) { + /* + * If we are not in overwrite mode, + * this is easy, just stop here. + */ + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + ret = rb_handle_head_page(cpu_buffer, + tail_page, + next_page); + if (ret < 0) + goto out_reset; + if (ret) + goto out_again; + } else { + /* + * We need to be careful here too. The + * commit page could still be on the reader + * page. We could have a small buffer, and + * have filled up the buffer with events + * from interrupts and such, and wrapped. + * + * Note, if the tail page is also the on the + * reader_page, we let it move out. + */ + if (unlikely((cpu_buffer->commit_page != + cpu_buffer->tail_page) && + (cpu_buffer->commit_page == + cpu_buffer->reader_page))) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } } } - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ - *ts = rb_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); + if (ret) { + /* + * Nested commits always have zero deltas, so + * just reread the time stamp + */ + ts = rb_time_stamp(buffer); + next_page->page->time_stamp = ts; } - rb_reset_tail(cpu_buffer, tail_page, tail, length); + out_again: - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + rb_reset_tail(cpu_buffer, tail_page, tail, length); /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -1297,48 +1983,52 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* reset write */ rb_reset_tail(cpu_buffer, tail_page, tail, length); - if (likely(lock_taken)) - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); return NULL; } static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) + unsigned long length, u64 ts, + u64 delta, int add_timestamp) { - struct buffer_page *tail_page, *commit_page; + struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + if (unlikely(add_timestamp)) + length += RB_LEN_TIME_EXTEND; + tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) + if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, - commit_page, tail_page, ts); + tail_page, ts); /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(event, type, length); + rb_update_event(cpu_buffer, event, length, add_timestamp, delta); - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); + local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update * its timestamp. */ if (!tail) - tail_page->page->time_stamp = *ts; + tail_page->page->time_stamp = ts; return event; } @@ -1353,19 +2043,23 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long addr; new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); + old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= PAGE_MASK; bpage = cpu_buffer->tail_page; if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; /* * This is on the tail page. It is possible that * a write could come in and move the tail page * and write to the next page. That is fine * because we just shorten what is on this page. */ + old_index += write_mask; + new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); if (index == old_index) return 1; @@ -1375,80 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static int -rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 *ts, u64 *delta) -{ - struct ring_buffer_event *event; - static int once; - int ret; - - if (unlikely(*delta > (1ULL << 59) && !once++)) { - printk(KERN_WARNING "Delta way too big! %llu" - " ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, - (unsigned long long)*ts, - (unsigned long long)cpu_buffer->write_stamp); - WARN_ON(1); - } - - /* - * The delta is too big, we to add a - * new timestamp. - */ - event = __rb_reserve_next(cpu_buffer, - RINGBUF_TYPE_TIME_EXTEND, - RB_LEN_TIME_EXTEND, - ts); - if (!event) - return -EBUSY; - - if (PTR_ERR(event) == -EAGAIN) - return -EAGAIN; - - /* Only a commited time event can update the write stamp */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * If this is the first on the page, then it was - * updated with the page itself. Try to discard it - * and if we can't just make it zero. - */ - if (rb_event_index(event)) { - event->time_delta = *delta & TS_MASK; - event->array[0] = *delta >> TS_SHIFT; - } else { - /* try to discard, since we do not need this */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - } - cpu_buffer->write_stamp = *ts; - /* let the caller know this was the commit */ - ret = 1; - } else { - /* Try to discard the event */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - } - ret = 0; - } - - *delta = 0; - - return ret; -} - static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); local_inc(&cpu_buffer->commits); } -static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -1481,18 +2108,38 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) } static struct ring_buffer_event * -rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, +rb_reserve_next_event(struct ring_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta = 0; - int commit = 0; + u64 ts, delta; int nr_loops = 0; + int add_timestamp; + u64 diff; rb_start_commit(cpu_buffer); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } +#endif + length = rb_calculate_event_length(length); again: + add_timestamp = 0; + delta = 0; + /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -1505,57 +2152,33 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer); + diff = ts - cpu_buffer->write_stamp; - /* - * Only the first commit can update the timestamp. - * Yes there is a race here. If an interrupt comes in - * just after the conditional and it traces too, then it - * will also check the deltas. More than one timestamp may - * also be made. But only the entry that did the actual - * commit will be something other than zero. - */ - if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer))) { - u64 diff; - - diff = ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? */ - if (unlikely(ts < cpu_buffer->write_stamp)) - goto get_event; + /* make sure this diff is calculated here */ + barrier(); + /* Did the write stamp get updated already? */ + if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { - - commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) - goto out_fail; - - if (commit == -EAGAIN) - goto again; - - RB_WARN_ON(cpu_buffer, commit < 0); + WARN_ONCE(delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + (unsigned long long)delta, + (unsigned long long)ts, + (unsigned long long)cpu_buffer->write_stamp); + add_timestamp = 1; } } - get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, &ts); + event = __rb_reserve_next(cpu_buffer, length, ts, + delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) goto out_fail; - if (!rb_event_is_commit(cpu_buffer, event)) - delta = 0; - - event->time_delta = delta; - return event; out_fail: @@ -1563,15 +2186,13 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifdef CONFIG_TRACING + #define TRACE_RECURSIVE_DEPTH 16 -static int trace_recursive_lock(void) +/* Keep this code out of the fast path cache */ +static noinline void trace_recursive_fail(void) { - current->trace_recursion++; - - if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) - return 0; - /* Disable all tracing before we do anything else */ tracing_off_permanent(); @@ -1583,17 +2204,33 @@ static int trace_recursive_lock(void) in_nmi()); WARN_ON_ONCE(1); +} + +static inline int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + trace_recursive_fail(); + return -1; } -static void trace_recursive_unlock(void) +static inline void trace_recursive_unlock(void) { WARN_ON_ONCE(!current->trace_recursion); current->trace_recursion--; } -static DEFINE_PER_CPU(int, rb_need_resched); +#else + +#define trace_recursive_lock() (0) +#define trace_recursive_unlock() do { } while (0) + +#endif /** * ring_buffer_lock_reserve - reserve a part of the buffer @@ -1615,16 +2252,16 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; - if (atomic_read(&buffer->record_disabled)) - return NULL; - /* If we are tracing schedule, we don't want to recurse */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); + + if (atomic_read(&buffer->record_disabled)) + goto out_nocheck; if (trace_recursive_lock()) goto out_nocheck; @@ -1642,41 +2279,54 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out; - /* - * Need to store resched state on this cpu. - * Only the first needs to. - */ - - if (preempt_count() == 1) - per_cpu(rb_need_resched, cpu) = resched; - return event; out: trace_recursive_unlock(); out_nocheck: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - local_inc(&cpu_buffer->entries); + u64 delta; /* * The event first in the commit queue updates the * time stamp. */ - if (rb_event_is_commit(cpu_buffer, event)) - cpu_buffer->write_stamp += event->time_delta; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); rb_end_commit(cpu_buffer); } @@ -1701,13 +2351,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return 0; } @@ -1715,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; @@ -1723,32 +2370,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -/** - * ring_buffer_event_discard - discard any event in the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. +/* + * Decrement the entries to the page that an event is on. + * The event does not even need to exist, only the pointer + * to the page it is on. This may only be called before the commit + * takes place. */ -void ring_buffer_event_discard(struct ring_buffer_event *event) +static inline void +rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) { - rb_event_discard(event); + unsigned long addr = (unsigned long)event; + struct buffer_page *bpage = cpu_buffer->commit_page; + struct buffer_page *start; + + addr &= PAGE_MASK; + + /* Do the likely case first */ + if (likely(bpage->page == (void *)addr)) { + local_dec(&bpage->entries); + return; + } + + /* + * Because the commit page may be on the reader page we + * start with the next page and check the end loop there. + */ + rb_inc_page(cpu_buffer, &bpage); + start = bpage; + do { + if (bpage->page == (void *)addr) { + local_dec(&bpage->entries); + return; + } + rb_inc_page(cpu_buffer, &bpage); + } while (bpage != start); + + /* commit not part of this buffer?? */ + RB_WARN_ON(cpu_buffer, 1); } -EXPORT_SYMBOL_GPL(ring_buffer_event_discard); /** * ring_buffer_commit_discard - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * - * This is similar to ring_buffer_event_discard but must only be - * performed on an event that has not been committed yet. The difference - * is that this will also try to free the event from the ring buffer + * Sometimes an event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * This function only works if it is called before the the item has been + * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * * If another event has been added behind it, it will set the event @@ -1776,26 +2448,21 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, */ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); - if (!rb_try_to_discard(cpu_buffer, event)) + rb_decrement_entry(cpu_buffer, event); + if (rb_try_to_discard(cpu_buffer, event)) goto out; /* * The commit is still visible by the reader, so we - * must increment entries. + * must still update the timestamp. */ - local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); out: rb_end_commit(cpu_buffer); trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); @@ -1821,15 +2488,15 @@ int ring_buffer_write(struct ring_buffer *buffer, struct ring_buffer_event *event; void *body; int ret = -EBUSY; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - if (atomic_read(&buffer->record_disabled)) - return -EBUSY; + preempt_disable_notrace(); - resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out; cpu = raw_smp_processor_id(); @@ -1844,7 +2511,7 @@ int ring_buffer_write(struct ring_buffer *buffer, if (length > BUF_MAX_DATA_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, length); + event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) goto out; @@ -1856,7 +2523,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return ret; } @@ -1865,9 +2532,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write); static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); struct buffer_page *commit = cpu_buffer->commit_page; + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return 1; + return reader->read == rb_page_commit(reader) && (commit == reader || (commit == head && @@ -1894,7 +2565,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct ring_buffer *buffer) { @@ -1930,7 +2601,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { @@ -1944,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer @@ -1952,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) - - cpu_buffer->read; - return ret; + return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -1979,33 +2660,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->overrun; + ret = local_read(&cpu_buffer->overrun); return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); /** - * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped - * @buffer: The ring buffer - * @cpu: The per CPU buffer to get the number of overruns from - */ -unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; - - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; - - cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->nmi_dropped; - - return ret; -} -EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); - -/** * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits * @buffer: The ring buffer * @cpu: The per CPU buffer to get the number of overruns from @@ -2020,7 +2681,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->commit_overrun; + ret = local_read(&cpu_buffer->commit_overrun); return ret; } @@ -2042,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += (local_read(&cpu_buffer->entries) - - cpu_buffer->overrun) - cpu_buffer->read; + entries += rb_num_of_entries(cpu_buffer); } return entries; @@ -2051,7 +2711,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_entries); /** - * ring_buffer_overrun_cpu - get the number of overruns in buffer + * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer @@ -2066,7 +2726,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - overruns += cpu_buffer->overrun; + overruns += local_read(&cpu_buffer->overrun); } return overruns; @@ -2079,8 +2739,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) /* Iterator usage is expected to have record disabled */ if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = cpu_buffer->head_page; - iter->head = cpu_buffer->head_page->read; + iter->head_page = rb_set_head_page(cpu_buffer); + if (unlikely(!iter->head_page)) + return; + iter->head = iter->head_page->read; } else { iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; @@ -2089,6 +2751,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; } /** @@ -2195,11 +2859,13 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long overwrite; unsigned long flags; int nr_loops = 0; + int ret; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); again: /* @@ -2230,39 +2896,83 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto out; /* - * Splice the empty reader page into the list around the head. * Reset the reader page to size zero. */ - - reader = cpu_buffer->head_page; - cpu_buffer->reader_page->list.next = reader->list.next; - cpu_buffer->reader_page->list.prev = reader->list.prev; - local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->real_end = 0; + + spin: + /* + * Splice the empty reader page into the list around the head. + */ + reader = rb_set_head_page(cpu_buffer); + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); + cpu_buffer->reader_page->list.prev = reader->list.prev; + + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidently swap it. + */ + cpu_buffer->pages = reader->list.prev; - /* Make the reader page now replace the head */ - reader->list.prev->next = &cpu_buffer->reader_page->list; - reader->list.next->prev = &cpu_buffer->reader_page->list; + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); /* - * If the tail is on the reader, then we must set the head - * to the inserted page, otherwise we set it one before. + * We want to make sure we read the overruns after we set up our + * pointers to the next object. The writer side does a + * cmpxchg to cross pages which acts as the mb on the writer + * side. Note, the reader will constantly fail the swap + * while the writer is updating the pointers, so this + * guarantees that the overwrite recorded here is the one we + * want to compare with the last_overrun. */ - cpu_buffer->head_page = cpu_buffer->reader_page; + smp_mb(); + overwrite = local_read(&(cpu_buffer->overrun)); - if (cpu_buffer->commit_page != reader) - rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + /* + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want. + * but if the writer is in the process of moving it + * than it will be '2' or already moved '0'. + */ + + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); + + /* + * If we did not convert it, then we must try again. + */ + if (!ret) + goto spin; + + /* + * Yeah! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page. + */ + rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; rb_reset_reader_page(cpu_buffer); + if (overwrite != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overwrite; + } + goto again; out: - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return reader; @@ -2282,8 +2992,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX - || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -2294,13 +3003,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { - struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; unsigned length; cpu_buffer = iter->cpu_buffer; - buffer = cpu_buffer->buffer; /* * Check if we are at the end of the buffer. @@ -2336,24 +3043,27 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->lost_events; +} + static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, + unsigned long *lost_events) { - struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; - cpu_buffer = buffer->buffers[cpu]; - again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written, or from discarded - * commits. The most that we can have is the number on a single page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -2374,7 +3084,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) * the box. Return the padding, and we will release * the current locks, and try again. */ - rb_advance_reader(cpu_buffer); return event; case RINGBUF_TYPE_TIME_EXTEND: @@ -2390,9 +3099,11 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(buffer, + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } + if (lost_events) + *lost_events = rb_lost_events(cpu_buffer); return event; default: @@ -2411,27 +3122,39 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; - if (ring_buffer_iter_empty(iter)) - return NULL; - cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + /* - * We repeat when a timestamp is encountered. - * We can get multiple timestamps by nested interrupts or also - * if filtering is on (discarding commits). Since discarding - * commits can be frequent we can get a lot of timestamps. - * But we limit them by not adding timestamps if they begin - * at the start of a page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) return NULL; + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + event = rb_iter_head_event(iter); switch (event->type_len) { @@ -2477,7 +3200,7 @@ static inline int rb_ok_to_lock(void) * buffer too. A one time deal is all you get from reading * the ring buffer from an NMI. */ - if (likely(!in_nmi() && !oops_in_progress)) + if (likely(!in_nmi())) return 1; tracing_off_permanent(); @@ -2489,12 +3212,14 @@ static inline int rb_ok_to_lock(void) * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. + * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. */ struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -2509,15 +3234,15 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts, lost_events); + if (event && event->type_len == RINGBUF_TYPE_PADDING) + rb_advance_reader(cpu_buffer); if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -2542,10 +3267,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } @@ -2553,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from + * @cpu: the cpu to read the buffer from + * @ts: a variable to store the timestamp (may be NULL) + * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; @@ -2580,13 +3307,12 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); - if (!event) - goto out_unlock; - - rb_advance_reader(cpu_buffer); + event = rb_buffer_peek(cpu_buffer, ts, lost_events); + if (event) { + cpu_buffer->lost_events = 0; + rb_advance_reader(cpu_buffer); + } - out_unlock: if (dolock) spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); @@ -2594,33 +3320,38 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); + if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; - } return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_start - start a non consuming read of the buffer + * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * - * This starts up an iteration through the buffer. It also disables - * the recording to the buffer until the reading is finished. - * This prevents the reading from being corrupted. This is not - * a consuming read, so a producer is not expected. + * This performs the initial preparations necessary to iterate + * through the buffer. Memory is allocated, buffer recording + * is disabled, and the iterator pointer is returned to the caller. * - * Must be paired with ring_buffer_finish. + * Disabling buffer recordng prevents the reading from being + * corrupted. This is not a consuming read, so a producer is not + * expected. + * + * After a sequence of ring_buffer_read_prepare calls, the user is + * expected to make at least one call to ring_buffer_prepare_sync. + * Afterwards, ring_buffer_read_start is invoked to get things going + * for real. + * + * This overall must be paired with ring_buffer_finish. */ struct ring_buffer_iter * -ring_buffer_read_start(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; - unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; @@ -2634,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) iter->cpu_buffer = cpu_buffer; atomic_inc(&cpu_buffer->record_disabled); + + return iter; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); + +/** + * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls + * + * All previously invoked ring_buffer_read_prepare calls to prepare + * iterators will be synchronized. Afterwards, read_buffer_read_start + * calls on those iterators are allowed. + */ +void +ring_buffer_read_prepare_sync(void) +{ synchronize_sched(); +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @iter: The iterator returned by ring_buffer_read_prepare + * + * This finalizes the startup of an iteration through the buffer. + * The iterator comes from a call to ring_buffer_read_prepare and + * an intervening ring_buffer_read_prepare_sync must have been + * performed. + * + * Must be paired with ring_buffer_finish. + */ +void +ring_buffer_read_start(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -2677,21 +3445,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + again: event = rb_iter_peek(iter, ts); if (!event) goto out; + if (event->type_len == RINGBUF_TYPE_PADDING) + goto again; + rb_advance_iter(iter); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type_len == RINGBUF_TYPE_PADDING) { - cpu_relax(); - goto again; - } - return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); @@ -2709,8 +3475,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size); static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { + rb_head_page_deactivate(cpu_buffer); + cpu_buffer->head_page - = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + = list_entry(cpu_buffer->pages, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); @@ -2726,16 +3494,20 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; - cpu_buffer->nmi_dropped = 0; - cpu_buffer->commit_overrun = 0; - cpu_buffer->overrun = 0; - cpu_buffer->read = 0; + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); + cpu_buffer->read = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + + cpu_buffer->lost_events = 0; + cpu_buffer->last_overrun = 0; + + rb_head_page_activate(cpu_buffer); } /** @@ -2755,12 +3527,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - __raw_spin_lock(&cpu_buffer->lock); + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; + + arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); + out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); @@ -2843,6 +3619,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers * @buffer_a: One buffer to swap with @@ -2897,20 +3674,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, atomic_inc(&cpu_buffer_a->record_disabled); atomic_inc(&cpu_buffer_b->record_disabled); + ret = -EBUSY; + if (local_read(&cpu_buffer_a->committing)) + goto out_dec; + if (local_read(&cpu_buffer_b->committing)) + goto out_dec; + buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; cpu_buffer_b->buffer = buffer_a; cpu_buffer_a->buffer = buffer_b; + ret = 0; + +out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); - - ret = 0; out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); +#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ /** * ring_buffer_alloc_read_page - allocate a page to read from buffer @@ -2997,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; + unsigned long missed_events; unsigned long flags; unsigned int commit; unsigned int read; @@ -3033,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = reader->read; commit = rb_page_commit(reader); + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -3053,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (len > (commit - read)) len = (commit - read); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); if (len < size) goto out_unlock; @@ -3063,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* Need to copy one event at a time */ do { + /* We need the size of one event, because + * rb_advance_reader only advances by one event, + * whereas rb_event_ts_length may include the size of + * one or two events. + * We have already ensured there's enough space if this + * is a time extend. */ + size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; @@ -3071,9 +3868,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, rpos = reader->read; pos += size; + if (rpos >= commit) + break; + event = rb_reader_event(cpu_buffer); - size = rb_event_length(event); - } while (len > size); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); @@ -3083,7 +3884,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = 0; } else { /* update the entry counter */ - cpu_buffer->read += local_read(&reader->entries); + cpu_buffer->read += rb_page_entries(reader); /* swap the pages */ rb_init_page(bpage); @@ -3093,9 +3894,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; + + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); } ret = read; + cpu_buffer->lost_events = 0; + + commit = local_read(&bpage->commit); + /* + * Set a flag in the commit field if we lost events + */ + if (missed_events) { + /* If there is room at the end of the page to save the + * missed events, then record it there. + */ + if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + commit += sizeof(missed_events); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + + /* + * This page may be off to user land. Zero it out here. + */ + if (commit < BUF_PAGE_SIZE) + memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -3104,6 +3938,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +#ifdef CONFIG_TRACING static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3155,6 +3990,7 @@ static const struct file_operations rb_simple_fops = { .open = tracing_open_generic, .read = rb_simple_read, .write = rb_simple_write, + .llseek = default_llseek, }; @@ -3171,6 +4007,7 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); +#endif #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 573d3cc762c..302f8a61463 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -8,6 +8,7 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/time.h> +#include <asm/local.h> struct rb_page { u64 ts; @@ -35,6 +36,28 @@ static int disable_reader; module_param(disable_reader, uint, 0644); MODULE_PARM_DESC(disable_reader, "only run producer"); +static int write_iteration = 50; +module_param(write_iteration, uint, 0644); +MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); + +static int producer_nice = 19; +static int consumer_nice = 19; + +static int producer_fifo = -1; +static int consumer_fifo = -1; + +module_param(producer_nice, uint, 0644); +MODULE_PARM_DESC(producer_nice, "nice prio for producer"); + +module_param(consumer_nice, uint, 0644); +MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); + +module_param(producer_fifo, uint, 0644); +MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); + +module_param(consumer_fifo, uint, 0644); +MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); + static int read_events; static int kill_test; @@ -58,7 +81,7 @@ static enum event_status read_event(int cpu) int *entry; u64 ts; - event = ring_buffer_consume(buffer, cpu, &ts); + event = ring_buffer_consume(buffer, cpu, &ts, NULL); if (!event) return EVENT_DROPPED; @@ -90,7 +113,8 @@ static enum event_status read_page(int cpu) ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); if (ret >= 0) { rpage = bpage; - commit = local_read(&rpage->commit); + /* The commit may have missed event flags set, clear them */ + commit = local_read(&rpage->commit) & 0xfffff; for (i = 0; i < commit && !kill_test; i += inc) { if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { @@ -208,15 +232,18 @@ static void ring_buffer_producer(void) do { struct ring_buffer_event *event; int *entry; - - event = ring_buffer_lock_reserve(buffer, 10); - if (!event) { - missed++; - } else { - hit++; - entry = ring_buffer_event_data(event); - *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + int i; + + for (i = 0; i < write_iteration; i++) { + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } } do_gettimeofday(&end_tv); @@ -263,6 +290,27 @@ static void ring_buffer_producer(void) if (kill_test) trace_printk("ERROR!\n"); + + if (!disable_reader) { + if (consumer_fifo < 0) + trace_printk("Running Consumer at nice: %d\n", + consumer_nice); + else + trace_printk("Running Consumer at SCHED_FIFO %d\n", + consumer_fifo); + } + if (producer_fifo < 0) + trace_printk("Running Producer at nice: %d\n", + producer_nice); + else + trace_printk("Running Producer at SCHED_FIFO %d\n", + producer_fifo); + + /* Let the user know that the test is running at low priority */ + if (producer_fifo < 0 && consumer_fifo < 0 && + producer_nice == 19 && consumer_nice == 19) + trace_printk("WARNING!!! This test is running at lowest priority.\n"); + trace_printk("Time: %lld (usecs)\n", time); trace_printk("Overruns: %lld\n", overruns); if (disable_reader) @@ -392,6 +440,27 @@ static int __init ring_buffer_benchmark_init(void) if (IS_ERR(producer)) goto out_kill; + /* + * Run them as low-prio background tasks by default: + */ + if (!disable_reader) { + if (consumer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(consumer, SCHED_FIFO, ¶m); + } else + set_user_nice(consumer, consumer_nice); + } + + if (producer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(producer, SCHED_FIFO, ¶m); + } else + set_user_nice(producer, producer_nice); + return 0; out_kill: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 076fa6f0ee4..dc53ecb8058 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -12,7 +12,7 @@ * Copyright (C) 2004 William Lee Irwin III */ #include <linux/ring_buffer.h> -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> @@ -31,10 +31,11 @@ #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> +#include <linux/rwsem.h> +#include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> -#include <linux/gfp.h> #include <linux/fs.h> #include "trace.h" @@ -42,14 +43,11 @@ #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) -unsigned long __read_mostly tracing_max_latency; -unsigned long __read_mostly tracing_thresh; - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -static int ring_buffer_expanded; +int ring_buffer_expanded; /* * We need to change this state when a selftest is running. @@ -63,7 +61,7 @@ static bool __read_mostly tracing_selftest_running; /* * If a tracer is running, we do not want to run SELFTEST. */ -static bool __read_mostly tracing_selftest_disabled; +bool __read_mostly tracing_selftest_disabled; /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { @@ -88,27 +86,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static int tracing_disabled = 1; -static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); +DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - local_inc(&__get_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(ftrace_cpu_disabled); } static inline void ftrace_enable_cpu(void) { - local_dec(&__get_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(ftrace_cpu_disabled); preempt_enable(); } -static cpumask_var_t __read_mostly tracing_buffer_mask; - -/* Define which cpu buffers are currently read in trace_pipe */ -static cpumask_var_t tracing_reader_cpumask; - -#define for_each_tracing_cpu(cpu) \ - for_each_cpu(cpu, tracing_buffer_mask) +cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -121,30 +113,42 @@ static cpumask_var_t tracing_reader_cpumask; * * It is default off, but you can enable it with either specifying * "ftrace_dump_on_oops" in the kernel command line, or setting - * /proc/sys/kernel/ftrace_dump_on_oops to true. + * /proc/sys/kernel/ftrace_dump_on_oops + * Set 1 if you want to dump buffers of all CPUs + * Set 2 if you want to dump the buffer of the CPU that triggered oops */ -int ftrace_dump_on_oops; + +enum ftrace_dump_mode ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); -#define BOOTUP_TRACER_SIZE 100 -static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; +#define MAX_TRACER_SIZE 100 +static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static int __init set_ftrace(char *str) +static int __init set_cmdline_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); + strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = 1; return 1; } -__setup("ftrace=", set_ftrace); +__setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { - ftrace_dump_on_oops = 1; - return 1; + if (*str++ != '=' || !*str) { + ftrace_dump_on_oops = DUMP_ALL; + return 1; + } + + if (!strcmp("orig_cpu", str)) { + ftrace_dump_on_oops = DUMP_ORIG; + return 1; + } + + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -171,10 +175,11 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); -int filter_current_check_discard(struct ftrace_event_call *call, void *rec, +int filter_current_check_discard(struct ring_buffer *buffer, + struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event) { - return filter_check_discard(call, rec, global_trace.buffer, event); + return filter_check_discard(call, rec, buffer, event); } EXPORT_SYMBOL_GPL(filter_current_check_discard); @@ -204,7 +209,7 @@ cycle_t ftrace_now(int cpu) */ static struct trace_array max_tr; -static DEFINE_PER_CPU(struct trace_array_cpu, max_data); +static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; @@ -243,19 +248,91 @@ static struct tracer *trace_types __read_mostly; static struct tracer *current_trace __read_mostly; /* - * max_tracer_type_len is used to simplify the allocating of - * buffers to read userspace tracer names. We keep track of - * the longest tracer name registered. + * trace_types_lock is used to protect the trace_types list. */ -static int max_tracer_type_len; +static DEFINE_MUTEX(trace_types_lock); /* - * trace_types_lock is used to protect the trace_types list. - * This lock is also used to keep user access serialized. - * Accesses from userspace will grab this lock while userspace - * activities happen inside the kernel. + * serialize the access of the ring buffer + * + * ring buffer serializes readers, but it is low level protection. + * The validity of the events (which returns by ring_buffer_peek() ..etc) + * are not protected by ring buffer. + * + * The content of events may become garbage if we allow other process consumes + * these events concurrently: + * A) the page of the consumed events may become a normal page + * (not reader page) in ring buffer, and this page will be rewrited + * by events producer. + * B) The page of the consumed events may become a page for splice_read, + * and this page will be returned to system. + * + * These primitives allow multi process access to different cpu ring buffer + * concurrently. + * + * These primitives don't distinguish read-only and read-consume access. + * Multi read-only access are also serialized. */ -static DEFINE_MUTEX(trace_types_lock); + +#ifdef CONFIG_SMP +static DECLARE_RWSEM(all_cpu_access_lock); +static DEFINE_PER_CPU(struct mutex, cpu_access_lock); + +static inline void trace_access_lock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + /* gain it for accessing the whole ring buffer. */ + down_write(&all_cpu_access_lock); + } else { + /* gain it for accessing a cpu ring buffer. */ + + /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + down_read(&all_cpu_access_lock); + + /* Secondly block other access to this @cpu ring buffer. */ + mutex_lock(&per_cpu(cpu_access_lock, cpu)); + } +} + +static inline void trace_access_unlock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + up_write(&all_cpu_access_lock); + } else { + mutex_unlock(&per_cpu(cpu_access_lock, cpu)); + up_read(&all_cpu_access_lock); + } +} + +static inline void trace_access_lock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + mutex_init(&per_cpu(cpu_access_lock, cpu)); +} + +#else + +static DEFINE_MUTEX(access_lock); + +static inline void trace_access_lock(int cpu) +{ + (void)cpu; + mutex_lock(&access_lock); +} + +static inline void trace_access_unlock(int cpu) +{ + (void)cpu; + mutex_unlock(&access_lock); +} + +static inline void trace_access_lock_init(void) +{ +} + +#endif /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); @@ -263,7 +340,10 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; + +static int trace_stop_count; +static DEFINE_SPINLOCK(tracing_start_lock); /** * trace_wake_up - wake up tasks waiting for trace input @@ -273,30 +353,50 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | */ void trace_wake_up(void) { + int cpu; + + if (trace_flags & TRACE_ITER_BLOCK) + return; /* * The runqueue_is_locked() can fail, but this is the best we * have for now: */ - if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) + cpu = get_cpu(); + if (!runqueue_is_locked(cpu)) wake_up(&trace_wait); + put_cpu(); } static int __init set_buf_size(char *str) { unsigned long buf_size; - int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &buf_size); + buf_size = memparse(str, &str); /* nr_entries can not be zero */ - if (ret < 0 || buf_size == 0) + if (buf_size == 0) return 0; trace_buf_size = buf_size; return 1; } __setup("trace_buf_size=", set_buf_size); +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshhold; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &threshhold); + if (ret < 0) + return 0; + tracing_thresh = threshhold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; @@ -313,7 +413,6 @@ static const char *trace_options[] = { "bin", "block", "stacktrace", - "sched-tree", "trace_printk", "ftrace_preempt", "branch", @@ -323,49 +422,126 @@ static const char *trace_options[] = { "printk-msg-only", "context-info", "latency-format", - "global-clock", "sleep-time", "graph-time", + "record-cmd", NULL }; +static struct { + u64 (*func)(void); + const char *name; +} trace_clocks[] = { + { trace_clock_local, "local" }, + { trace_clock_global, "global" }, +}; + +int trace_clock_id; + /* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a raw_spinlock_t in order to help - * with performance when lockdep debugging is enabled. + * trace_parser_get_init - gets the buffer for trace parser */ -static raw_spinlock_t ftrace_max_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} /* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * trace_parser_put - frees the buffer for trace parser */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +void trace_parser_put(struct trace_parser *parser) { - struct trace_array_cpu *data = tr->data[cpu]; + kfree(parser->buffer); +} - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; +/* + * trace_get_user - reads the user input string separated by space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read. + * + * See kernel/trace/trace.h for 'struct trace_parser' details. + */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char ch; + size_t read = 0; + ssize_t ret; + + if (!*ppos) + trace_parser_clear(parser); - data = max_tr.data[cpu]; - data->saved_latency = tracing_max_latency; + ret = get_user(ch, ubuf++); + if (ret) + goto out; - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); - data->pid = tsk->pid; - data->uid = task_uid(tsk); - data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - data->policy = tsk->policy; - data->rt_priority = tsk->rt_priority; + read++; + cnt--; - /* record this tasks comm */ - tracing_record_cmdline(tsk); + /* + * The parser is not finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!parser->cont) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* only spaces were written */ + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + parser->idx = 0; + } + + /* read the non-space input */ + while (cnt && !isspace(ch)) { + if (parser->idx < parser->size - 1) + parser->buffer[parser->idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* We either got finished input or we have to wait for another call. */ + if (isspace(ch)) { + parser->buffer[parser->idx] = 0; + parser->cont = false; + } else { + parser->cont = true; + parser->buffer[parser->idx++] = ch; + } + + *ppos += read; + ret = read; + +out: + return ret; } ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) @@ -411,6 +587,57 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } +/* + * ftrace_max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a arch_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ +static arch_spinlock_t ftrace_max_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +unsigned long __read_mostly tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE +unsigned long __read_mostly tracing_max_latency; + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + struct trace_array_cpu *max_data; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + max_data = max_tr.data[cpu]; + max_data->saved_latency = tracing_max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; + + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + max_data->pid = tsk->pid; + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); +} + /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -425,18 +652,21 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct ring_buffer *buf = tr->buffer; + if (trace_stop_count) + return; + WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; max_tr.buffer = buf; - ftrace_disable_cpu(); - ring_buffer_reset(tr->buffer); - ftrace_enable_cpu(); - __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); } /** @@ -452,21 +682,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; + if (trace_stop_count) + return; + WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + + arch_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); - ring_buffer_reset(max_tr.buffer); ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + */ + trace_array_printk(&max_tr, _THIS_IP_, + "Failed to swap buffers due to commit in progress\n"); + } + ftrace_enable_cpu(); - WARN_ON_ONCE(ret && ret != -EAGAIN); + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); } +#endif /* CONFIG_TRACER_MAX_TRACE */ /** * register_tracer - register a tracer with the ftrace system. @@ -479,7 +728,6 @@ __releases(kernel_lock) __acquires(kernel_lock) { struct tracer *t; - int len; int ret = 0; if (!type->name) { @@ -487,13 +735,11 @@ __acquires(kernel_lock) return -1; } - /* - * When this gets called we hold the BKL which means that - * preemption is disabled. Various trace selftests however - * need to disable and enable preemption for successful tests. - * So we drop the BKL here and grab it after the tests again. - */ - unlock_kernel(); + if (strlen(type->name) >= MAX_TRACER_SIZE) { + pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); + return -1; + } + mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -501,7 +747,7 @@ __acquires(kernel_lock) for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ - pr_info("Trace %s already registered\n", + pr_info("Tracer %s already registered\n", type->name); ret = -1; goto out; @@ -523,7 +769,6 @@ __acquires(kernel_lock) if (type->selftest && !tracing_selftest_disabled) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; - int i; /* * Run a selftest on this tracer. @@ -532,8 +777,7 @@ __acquires(kernel_lock) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - for_each_tracing_cpu(i) - tracing_reset(tr, i); + tracing_reset_online_cpus(tr); current_trace = type; /* the test is responsible for initializing and enabling */ @@ -546,8 +790,7 @@ __acquires(kernel_lock) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - for_each_tracing_cpu(i) - tracing_reset(tr, i); + tracing_reset_online_cpus(tr); printk(KERN_CONT "PASSED\n"); } @@ -555,9 +798,6 @@ __acquires(kernel_lock) type->next = trace_types; trace_types = type; - len = strlen(type->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; out: tracing_selftest_running = false; @@ -566,7 +806,7 @@ __acquires(kernel_lock) if (ret || !default_bootup_tracer) goto out_unlock; - if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) + if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) goto out_unlock; printk(KERN_INFO "Starting tracer '%s'\n", type->name); @@ -581,21 +821,19 @@ __acquires(kernel_lock) #endif out_unlock: - lock_kernel(); return ret; } void unregister_tracer(struct tracer *type) { struct tracer **t; - int len; mutex_lock(&trace_types_lock); for (t = &trace_types; *t; t = &(*t)->next) { if (*t == type) goto found; } - pr_info("Trace %s not registered\n", type->name); + pr_info("Tracer %s not registered\n", type->name); goto out; found: @@ -608,35 +846,46 @@ void unregister_tracer(struct tracer *type) current_trace->stop(&global_trace); current_trace = &nop_trace; } - - if (strlen(type->name) != max_tracer_type_len) - goto out; - - max_tracer_type_len = 0; - for (t = &trace_types; *t; t = &(*t)->next) { - len = strlen((*t)->name); - if (len > max_tracer_type_len) - max_tracer_type_len = len; - } - out: +out: mutex_unlock(&trace_types_lock); } -void tracing_reset(struct trace_array *tr, int cpu) +static void __tracing_reset(struct ring_buffer *buffer, int cpu) { ftrace_disable_cpu(); - ring_buffer_reset_cpu(tr->buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ftrace_enable_cpu(); } +void tracing_reset(struct trace_array *tr, int cpu) +{ + struct ring_buffer *buffer = tr->buffer; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + __tracing_reset(buffer, cpu); + + ring_buffer_record_enable(buffer); +} + void tracing_reset_online_cpus(struct trace_array *tr) { + struct ring_buffer *buffer = tr->buffer; int cpu; + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); + + ring_buffer_record_enable(buffer); } void tracing_reset_current(int cpu) @@ -655,7 +904,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; -static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; @@ -667,8 +916,10 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } -static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +int is_tracing_stopped(void) +{ + return trace_stop_count; +} /** * ftrace_off_permanent - disable all ftrace code permanently @@ -709,6 +960,8 @@ void tracing_start(void) goto out; } + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); buffer = global_trace.buffer; if (buffer) @@ -718,6 +971,8 @@ void tracing_start(void) if (buffer) ring_buffer_record_enable(buffer); + arch_spin_unlock(&ftrace_max_lock); + ftrace_start(); out: spin_unlock_irqrestore(&tracing_start_lock, flags); @@ -739,6 +994,9 @@ void tracing_stop(void) if (trace_stop_count++) goto out; + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); + buffer = global_trace.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -747,6 +1005,8 @@ void tracing_stop(void) if (buffer) ring_buffer_record_disable(buffer); + arch_spin_unlock(&ftrace_max_lock); + out: spin_unlock_irqrestore(&tracing_start_lock, flags); } @@ -766,7 +1026,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * nor do we want to disable interrupts, * so if we miss here, then better luck next time. */ - if (!__raw_spin_trylock(&trace_cmdline_lock)) + if (!arch_spin_trylock(&trace_cmdline_lock)) return; idx = map_pid_to_cmdline[tsk->pid]; @@ -791,7 +1051,7 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - __raw_spin_unlock(&trace_cmdline_lock); + arch_spin_unlock(&trace_cmdline_lock); } void trace_find_cmdline(int pid, char comm[]) @@ -803,20 +1063,25 @@ void trace_find_cmdline(int pid, char comm[]) return; } + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + if (pid > PID_MAX_DEFAULT) { strcpy(comm, "<...>"); return; } preempt_disable(); - __raw_spin_lock(&trace_cmdline_lock); + arch_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) strcpy(comm, saved_cmdlines[map]); else strcpy(comm, "<...>"); - __raw_spin_unlock(&trace_cmdline_lock); + arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } @@ -837,7 +1102,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -848,15 +1113,17 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - int type, - unsigned long len, - unsigned long flags, int pc) +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) { struct ring_buffer_event *event; - event = ring_buffer_lock_reserve(tr->buffer, len); + event = ring_buffer_lock_reserve(buffer, len); if (event != NULL) { struct trace_entry *ent = ring_buffer_event_data(event); @@ -866,58 +1133,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, return event; } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, int skip, int pc); -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc); -static inline void __trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc, - int wake) +static inline void +__trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + int wake) { - ring_buffer_unlock_commit(tr->buffer, event); + ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); + ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_userstack(buffer, flags, pc); if (wake) trace_wake_up(); } -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { - __trace_buffer_unlock_commit(tr, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); } struct ring_buffer_event * -trace_current_buffer_lock_reserve(int type, unsigned long len, +trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, + int type, unsigned long len, unsigned long flags, int pc) { - return trace_buffer_lock_reserve(&global_trace, + *current_rb = global_trace.buffer; + return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, +void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, unsigned long flags, int pc) { - __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); } EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) { - __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); } EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +void trace_current_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) { - ring_buffer_discard_commit(global_trace.buffer, event); + ring_buffer_discard_commit(buffer, event); } EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); @@ -927,14 +1196,15 @@ trace_function(struct trace_array *tr, int pc) { struct ftrace_event_call *call = &event_function; + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; - event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), + event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), flags, pc); if (!event) return; @@ -942,57 +1212,9 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_entry; - struct ring_buffer_event *event; - struct ftrace_graph_ent_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return 0; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, - sizeof(*entry), flags, pc); - if (!event) - return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); - - return 1; -} - -static void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned long flags, - int pc) -{ - struct ftrace_event_call *call = &event_funcgraph_exit; - struct ring_buffer_event *event; - struct ftrace_graph_ret_entry *entry; - - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; - - event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->ret = *trace; - if (!filter_current_check_discard(call, entry, event)) - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); } -#endif void ftrace(struct trace_array *tr, struct trace_array_cpu *data, @@ -1003,17 +1225,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, ip, parent_ip, flags, pc); } -static void __ftrace_trace_stack(struct trace_array *tr, +#ifdef CONFIG_STACKTRACE +static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; - event = trace_buffer_lock_reserve(tr, TRACE_STACK, + event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry), flags, pc); if (!event) return; @@ -1026,32 +1248,46 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); -#endif + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); } -static void ftrace_trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(tr, flags, skip, pc); + __ftrace_trace_stack(buffer, flags, skip, pc); } -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc) +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) { - __ftrace_trace_stack(tr, flags, skip, pc); + __ftrace_trace_stack(tr->buffer, flags, skip, pc); } -static void ftrace_trace_userstack(struct trace_array *tr, - unsigned long flags, int pc) +/** + * trace_dump_stack - record a stack back trace in the trace buffer + */ +void trace_dump_stack(void) +{ + unsigned long flags; + + if (tracing_disabled || tracing_selftest_running) + return; + + local_save_flags(flags); + + /* skipping 3 traces, seems to get us at the caller of this function */ + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); +} + +static DEFINE_PER_CPU(int, user_stack_count); + +void +ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { -#ifdef CONFIG_STACKTRACE struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -1060,12 +1296,30 @@ static void ftrace_trace_userstack(struct trace_array *tr, if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. + */ + if (unlikely(in_nmi())) + return; + + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. + */ + preempt_disable(); + if (__this_cpu_read(user_stack_count)) + goto out; + + __this_cpu_inc(user_stack_count); + + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) - return; + goto out_drop_count; entry = ring_buffer_event_data(event); + entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; @@ -1074,9 +1328,13 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); -#endif + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); + + out_drop_count: + __this_cpu_dec(user_stack_count); + out: + preempt_enable(); } #ifdef UNUSED @@ -1086,174 +1344,7 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) } #endif /* UNUSED */ -static void -ftrace_trace_special(void *__tr, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - int pc) -{ - struct ring_buffer_event *event; - struct trace_array *tr = __tr; - struct special_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, - sizeof(*entry), 0, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->arg1 = arg1; - entry->arg2 = arg2; - entry->arg3 = arg3; - trace_buffer_unlock_commit(tr, event, 0, pc); -} - -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); -} - -void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, flags, pc); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(tr, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); -} - -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - - if (tracing_disabled) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, arg1, arg2, arg3, pc); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -int trace_graph_entry(struct ftrace_graph_ent *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int ret; - int cpu; - int pc; - - if (!ftrace_trace_task(current)) - return 0; - - if (!ftrace_graph_addr(trace->func)) - return 0; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else { - ret = 0; - } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); - - atomic_dec(&data->disabled); - local_irq_restore(flags); - - return ret; -} - -void trace_graph_return(struct ftrace_graph_ret *trace) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - if (!trace->depth) - clear_tsk_trace_graph(current); - atomic_dec(&data->disabled); - local_irq_restore(flags); -} -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - +#endif /* CONFIG_STACKTRACE */ /** * trace_vbprintk - write binary msg to tracing buffer @@ -1261,18 +1352,18 @@ void trace_graph_return(struct ftrace_graph_ret *trace) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - static raw_spinlock_t trace_buf_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t trace_buf_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; + struct ring_buffer *buffer; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; int disable; - int resched; int cpu, len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1282,7 +1373,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -1292,14 +1383,16 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) /* Lockdep uses trace_printk for lock tracing */ local_irq_save(flags); - __raw_spin_lock(&trace_buf_lock); + arch_spin_lock(&trace_buf_lock); len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); if (len > TRACE_BUF_SIZE || len < 0) goto out_unlock; size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1307,30 +1400,48 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) { + ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); + } out_unlock: - __raw_spin_unlock(&trace_buf_lock); + arch_spin_unlock(&trace_buf_lock); local_irq_restore(flags); out: atomic_dec_return(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); unpause_graph_tracing(); return len; } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) { - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; + struct ring_buffer *buffer; struct trace_array_cpu *data; int cpu, len = 0, size, pc; struct print_entry *entry; @@ -1351,26 +1462,27 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); raw_local_irq_save(irq_flags); - __raw_spin_lock(&trace_buf_lock); + arch_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; - size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - entry->ip = ip; + entry->ip = ip; memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + entry->buf[len] = '\0'; + if (!filter_check_discard(call, entry, buffer, event)) { + ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 6, pc); + } out_unlock: - __raw_spin_unlock(&trace_buf_lock); + arch_spin_unlock(&trace_buf_lock); raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: @@ -1379,12 +1491,12 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) return len; } -EXPORT_SYMBOL_GPL(trace_vprintk); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +{ + return trace_array_vprintk(&global_trace, ip, fmt, args); +} +EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { @@ -1399,7 +1511,8 @@ static void trace_iterator_increment(struct trace_iterator *iter) } static struct trace_entry * -peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; @@ -1410,7 +1523,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts); + event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + lost_events); ftrace_enable_cpu(); @@ -1418,10 +1532,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) } static struct trace_entry * -__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, + unsigned long *missing_events, u64 *ent_ts) { struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; @@ -1434,7 +1550,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (cpu_file > TRACE_PIPE_ALL_CPU) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; - ent = peek_next_entry(iter, cpu_file, ent_ts); + ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; @@ -1446,7 +1562,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ring_buffer_empty_cpu(buffer, cpu)) continue; - ent = peek_next_entry(iter, cpu, &ts); + ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: @@ -1455,6 +1571,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) next = ent; next_cpu = cpu; next_ts = ts; + next_lost = lost_events; } } @@ -1464,6 +1581,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ent_ts) *ent_ts = next_ts; + if (missing_events) + *missing_events = next_lost; + return next; } @@ -1471,13 +1591,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, ent_ts); + return __find_next_entry(iter, ent_cpu, NULL, ent_ts); } /* Find the next real entry, and increment the iterator to the next entry */ -static void *find_next_entry_inc(struct trace_iterator *iter) +void *trace_find_next_entry_inc(struct trace_iterator *iter) { - iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); + iter->ent = __find_next_entry(iter, &iter->cpu, + &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); @@ -1489,7 +1610,8 @@ static void trace_consume(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + &iter->lost_events); ftrace_enable_cpu(); } @@ -1499,6 +1621,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) int i = (int)*pos; void *ent; + WARN_ON_ONCE(iter->leftover); + (*pos)++; /* can't go backwards */ @@ -1506,25 +1630,50 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return NULL; if (iter->idx < 0) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } +void tracing_iter_reset(struct trace_iterator *iter, int cpu) +{ + struct trace_array *tr = iter->tr; + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter; + unsigned long entries = 0; + u64 ts; + + tr->data[cpu]->skipped_entries = 0; + + if (!iter->buffer_iter[cpu]) + return; + + buf_iter = iter->buffer_iter[cpu]; + ring_buffer_iter_reset(buf_iter); + + /* + * We could have the case with the max latency tracers + * that a reset never took place on a cpu. This is evident + * by the timestamp being before the start of the buffer. + */ + while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { + if (ts >= iter->tr->time_start) + break; + entries++; + ring_buffer_read(buf_iter, NULL); + } + + tr->data[cpu]->skipped_entries = entries; +} + /* - * No necessary locking here. The worst thing which can - * happen is loosing events consumed at the same time - * by a trace_pipe reader. - * Other than that, we don't risk to crash the ring buffer - * because it serializes the readers. - * * The current tracer is copied to avoid a global locking * all around. */ @@ -1556,28 +1705,40 @@ static void *s_start(struct seq_file *m, loff_t *pos) if (cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) - ring_buffer_iter_reset(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); } else - ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); - + tracing_iter_reset(iter, cpu_file); ftrace_enable_cpu(); + iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; } else { - l = *pos - 1; - p = s_next(m, p, &l); + /* + * If we overflowed the seq_file before, then we want + * to just reuse the trace_seq buffer again. + */ + if (iter->leftover) + p = iter; + else { + l = *pos - 1; + p = s_next(m, p, &l); + } } trace_event_read_lock(); + trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { + struct trace_iterator *iter = m->private; + atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -1588,10 +1749,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| /_--=> lock-depth \n"); + seq_puts(m, "# |||||/ delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / |||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -1601,23 +1762,39 @@ static void print_func_help_header(struct seq_file *m) } -static void +void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long total; - unsigned long entries; + unsigned long entries = 0; + unsigned long total = 0; + unsigned long count; const char *name = "preemption"; + int cpu; if (type) name = type->name; - entries = ring_buffer_entries(iter->tr->buffer); - total = entries + - ring_buffer_overruns(iter->tr->buffer); + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + total += count; + } else + total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + entries += count; + } seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -1659,7 +1836,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n# => ended at: "); seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); trace_print_seq(m, &iter->seq); - seq_puts(m, "#\n"); + seq_puts(m, "\n#\n"); } seq_puts(m, "#\n"); @@ -1678,6 +1855,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; + if (iter->tr->data[iter->cpu]->skipped_entries) + return; + cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ @@ -1710,7 +1890,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) } if (event) - return event->trace(iter, sym_flags); + return event->funcs->trace(iter, sym_flags, event); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) goto partial; @@ -1736,7 +1916,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event) - return event->raw(iter, 0); + return event->funcs->raw(iter, 0, event); if (!trace_seq_printf(s, "%d ?\n", entry->type)) goto partial; @@ -1763,7 +1943,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event) { - enum print_line_t ret = event->hex(iter, 0); + enum print_line_t ret = event->funcs->hex(iter, 0, event); if (ret != TRACE_TYPE_HANDLED) return ret; } @@ -1788,10 +1968,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; + return event ? event->funcs->binary(iter, 0, event) : + TRACE_TYPE_HANDLED; } -static int trace_empty(struct trace_iterator *iter) +int trace_empty(struct trace_iterator *iter) { int cpu; @@ -1822,10 +2003,14 @@ static int trace_empty(struct trace_iterator *iter) } /* Called with trace_event_read_lock() held. */ -static enum print_line_t print_trace_line(struct trace_iterator *iter) +enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); if (ret != TRACE_TYPE_UNHANDLED) @@ -1854,9 +2039,27 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_default_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; + int ret; if (iter->ent == NULL) { if (iter->tr) { @@ -1865,26 +2068,36 @@ static int s_show(struct seq_file *m, void *v) } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); - else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return 0; - print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_lat_help_header(m); - } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); - } + else + trace_default_header(m); + + } else if (iter->leftover) { + /* + * If we filled the seq_file buffer earlier, we + * want to just show it now. + */ + ret = trace_print_seq(m, &iter->seq); + + /* ret should this time be zero, but you never know */ + iter->leftover = ret; + } else { print_trace_line(iter); - trace_print_seq(m, &iter->seq); + ret = trace_print_seq(m, &iter->seq); + /* + * If we overflow the seq_file buffer, then it will + * ask us for this data again at start up. + * Use that instead. + * ret is 0 if seq_file write succeeded. + * -1 otherwise. + */ + iter->leftover = ret; } return 0; } -static struct seq_operations tracer_seq_ops = { +static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, @@ -1919,11 +2132,9 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - cpumask_clear(iter->started); - if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -1940,19 +2151,28 @@ __tracing_open(struct inode *inode, struct file *file) if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; + /* stop the trace while dumping */ + tracing_stop(); + if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { - iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->tr->buffer, cpu); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare_sync(); + ring_buffer_read_start(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); } - /* TODO stop tracer */ ret = seq_open(file, &tracer_seq_ops); if (ret < 0) { fail_ret = ERR_PTR(ret); @@ -1962,9 +2182,6 @@ __tracing_open(struct inode *inode, struct file *file) m = file->private_data; m->private = iter; - /* stop the trace while dumping */ - tracing_stop(); - mutex_unlock(&trace_types_lock); return iter; @@ -1975,6 +2192,7 @@ __tracing_open(struct inode *inode, struct file *file) ring_buffer_read_finish(iter->buffer_iter[cpu]); } free_cpumask_var(iter->started); + tracing_start(); fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); @@ -1994,7 +2212,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; @@ -2031,7 +2249,7 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) { + (file->f_flags & O_TRUNC)) { long cpu = (long) inode->i_private; if (cpu == TRACE_PIPE_ALL_CPU) @@ -2053,25 +2271,23 @@ static int tracing_open(struct inode *inode, struct file *file) static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t = v; (*pos)++; if (t) t = t->next; - m->private = t; - return t; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (; t && l < *pos; t = t_next(m, t, &l)) + for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) ; return t; @@ -2098,7 +2314,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations show_traces_seq_ops = { +static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -2107,18 +2323,10 @@ static struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { - int ret; - if (tracing_disabled) return -ENODEV; - ret = seq_open(file, &show_traces_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = trace_types; - } - - return ret; + return seq_open(file, &show_traces_seq_ops); } static ssize_t @@ -2128,11 +2336,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } +static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +{ + if (file->f_mode & FMODE_READ) + return seq_lseek(file, offset, origin); + else + return 0; +} + static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .write = tracing_write_stub, - .llseek = seq_lseek, + .llseek = tracing_seek, .release = tracing_release, }; @@ -2140,6 +2356,7 @@ static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .release = seq_release, + .llseek = seq_lseek, }; /* @@ -2198,7 +2415,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -2213,7 +2430,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, atomic_dec(&global_trace.data[cpu]->disabled); } } - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); local_irq_enable(); cpumask_copy(tracing_cpumask, tracing_cpumask_new); @@ -2233,103 +2450,70 @@ static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .llseek = generic_file_llseek, }; -static ssize_t -tracing_trace_options_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; u32 tracer_flags; - int len = 0; - char *buf; - int r = 0; int i; - - /* calculate max size */ - for (i = 0; trace_options[i]; i++) { - len += strlen(trace_options[i]); - len += 3; /* "no" and newline */ - } - mutex_lock(&trace_types_lock); tracer_flags = current_trace->flags->val; trace_opts = current_trace->flags->opts; - /* - * Increase the size with names of options specific - * of the current tracer. - */ - for (i = 0; trace_opts[i].name; i++) { - len += strlen(trace_opts[i].name); - len += 3; /* "no" and newline */ - } - - /* +2 for \n and \0 */ - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) { - mutex_unlock(&trace_types_lock); - return -ENOMEM; - } - for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) - r += sprintf(buf + r, "%s\n", trace_options[i]); + seq_printf(m, "%s\n", trace_options[i]); else - r += sprintf(buf + r, "no%s\n", trace_options[i]); + seq_printf(m, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) - r += sprintf(buf + r, "%s\n", - trace_opts[i].name); + seq_printf(m, "%s\n", trace_opts[i].name); else - r += sprintf(buf + r, "no%s\n", - trace_opts[i].name); + seq_printf(m, "no%s\n", trace_opts[i].name); } mutex_unlock(&trace_types_lock); - WARN_ON(r >= len + 2); + return 0; +} - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +static int __set_tracer_option(struct tracer *trace, + struct tracer_flags *tracer_flags, + struct tracer_opt *opts, int neg) +{ + int ret; - kfree(buf); - return r; + ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + if (ret) + return ret; + + if (neg) + tracer_flags->val &= ~opts->bit; + else + tracer_flags->val |= opts->bit; + return 0; } /* Try to assign a tracer specific option */ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) { - struct tracer_flags *trace_flags = trace->flags; + struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; - int ret = 0, i = 0; - int len; + int i; - for (i = 0; trace_flags->opts[i].name; i++) { - opts = &trace_flags->opts[i]; - len = strlen(opts->name); + for (i = 0; tracer_flags->opts[i].name; i++) { + opts = &tracer_flags->opts[i]; - if (strncmp(cmp, opts->name, len) == 0) { - ret = trace->set_flag(trace_flags->val, - opts->bit, !neg); - break; - } + if (strcmp(cmp, opts->name) == 0) + return __set_tracer_option(trace, trace->flags, + opts, neg); } - /* Not found */ - if (!trace_flags->opts[i].name) - return -EINVAL; - - /* Refused to handle */ - if (ret) - return ret; - if (neg) - trace_flags->val &= ~opts->bit; - else - trace_flags->val |= opts->bit; - - return 0; + return -EINVAL; } static void set_tracer_flags(unsigned int mask, int enabled) @@ -2343,21 +2527,8 @@ static void set_tracer_flags(unsigned int mask, int enabled) else trace_flags &= ~mask; - if (mask == TRACE_ITER_GLOBAL_CLK) { - u64 (*func)(void); - - if (enabled) - func = trace_clock_global; - else - func = trace_clock_local; - - mutex_lock(&trace_types_lock); - ring_buffer_set_clock(global_trace.buffer, func); - - if (max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, func); - mutex_unlock(&trace_types_lock); - } + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); } static ssize_t @@ -2365,7 +2536,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int ret; int i; @@ -2377,16 +2548,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); - if (strncmp(buf, "no", 2) == 0) { + if (strncmp(cmp, "no", 2) == 0) { neg = 1; cmp += 2; } for (i = 0; trace_options[i]; i++) { - int len = strlen(trace_options[i]); - - if (strncmp(cmp, trace_options[i], len) == 0) { + if (strcmp(cmp, trace_options[i]) == 0) { set_tracer_flags(1 << i, !neg); break; } @@ -2401,14 +2571,23 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return ret; } - filp->f_pos += cnt; + *ppos += cnt; return cnt; } +static int tracing_trace_options_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_trace_options_show, NULL); +} + static const struct file_operations tracing_iter_fops = { - .open = tracing_open_generic, - .read = tracing_trace_options_read, + .open = tracing_trace_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, .write = tracing_trace_options_write, }; @@ -2441,6 +2620,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2491,6 +2671,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, static const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2543,7 +2724,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, } mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2552,7 +2733,7 @@ static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+2]; + char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); @@ -2586,6 +2767,9 @@ static int tracing_resize_ring_buffer(unsigned long size) if (ret < 0) return ret; + if (!current_trace->use_max_tr) + goto out; + ret = ring_buffer_resize(max_tr.buffer, size); if (ret < 0) { int r; @@ -2613,11 +2797,14 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } + max_tr.entries = size; + out: global_trace.entries = size; return ret; } + /** * tracing_update_buffers - used by tracing facility to expand ring buffers * @@ -2678,12 +2865,26 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - + if (current_trace && current_trace->use_max_tr) { + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(max_tr.buffer, 1); + max_tr.entries = 1; + } destroy_trace_option_files(topts); current_trace = t; topts = create_trace_option_files(current_trace); + if (current_trace->use_max_tr) { + ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); + if (ret < 0) + goto out; + max_tr.entries = global_trace.entries; + } if (t->init) { ret = tracer_init(t, tr); @@ -2702,15 +2903,15 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[max_tracer_type_len+1]; + char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; int err; ret = cnt; - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; + if (cnt > MAX_TRACER_SIZE) + cnt = MAX_TRACER_SIZE; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2725,7 +2926,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (err) return err; - filp->f_pos += ret; + *ppos += ret; return ret; } @@ -2782,22 +2983,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); - /* We only allow one reader per cpu */ - if (cpu_file == TRACE_PIPE_ALL_CPU) { - if (!cpumask_empty(tracing_reader_cpumask)) { - ret = -EBUSY; - goto out; - } - cpumask_setall(tracing_reader_cpumask); - } else { - if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) - cpumask_set_cpu(cpu_file, tracing_reader_cpumask); - else { - ret = -EBUSY; - goto out; - } - } - /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { @@ -2836,6 +3021,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (iter->trace->pipe_open) iter->trace->pipe_open(iter); + nonseekable_open(inode, filp); out: mutex_unlock(&trace_types_lock); return ret; @@ -2853,10 +3039,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) - cpumask_clear(tracing_reader_cpumask); - else - cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); mutex_unlock(&trace_types_lock); @@ -3016,7 +3200,8 @@ waitagain: iter->pos = -1; trace_event_read_lock(); - while (find_next_entry_inc(iter) != NULL) { + trace_access_lock(iter->cpu_file); + while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3032,6 +3217,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); /* Now copy what we have to the user */ @@ -3064,7 +3250,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, __free_page(spd->pages[idx]); } -static struct pipe_buf_operations tracing_pipe_buf_ops = { +static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -3095,9 +3281,10 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) break; } - trace_consume(iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); rem -= count; - if (!find_next_entry_inc(iter)) { + if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; @@ -3113,12 +3300,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t len, unsigned int flags) { - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .flags = flags, .ops = &tracing_pipe_buf_ops, @@ -3129,6 +3316,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t rem; unsigned int i; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); if (unlikely(old_tracer != current_trace && current_trace)) { @@ -3150,46 +3340,50 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (ret <= 0) goto out_err; - if (!iter->ent && !find_next_entry_inc(iter)) { + if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } trace_event_read_lock(); + trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { - pages[i] = alloc_page(GFP_KERNEL); - if (!pages[i]) + for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, - page_address(pages[i]), + page_address(spd.pages[i]), iter->seq.len); if (ret < 0) { - __free_page(pages[i]); + __free_page(spd.pages[i]); break; } - partial[i].offset = 0; - partial[i].len = iter->seq.len; + spd.partial[i].offset = 0; + spd.partial[i].len = iter->seq.len; trace_seq_init(&iter->seq); } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; - return splice_to_pipe(pipe, &spd); + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(pipe, &spd); + return ret; out_err: mutex_unlock(&iter->mutex); - - return ret; + goto out; } static ssize_t @@ -3259,7 +3453,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } } - filp->f_pos += cnt; + *ppos += cnt; /* If check pages failed, return ENOMEM */ if (tracing_disabled) @@ -3273,7 +3467,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } tracing_start(); - max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); return cnt; @@ -3294,7 +3487,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { char *buf; - char *end; + size_t written; if (tracing_disabled) return -EINVAL; @@ -3302,7 +3495,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 1, GFP_KERNEL); + buf = kmalloc(cnt + 2, GFP_KERNEL); if (buf == NULL) return -ENOMEM; @@ -3310,36 +3503,102 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, kfree(buf); return -EFAULT; } + if (buf[cnt-1] != '\n') { + buf[cnt] = '\n'; + buf[cnt+1] = '\0'; + } else + buf[cnt] = '\0'; - /* Cut from the first nil or newline. */ - buf[cnt] = '\0'; - end = strchr(buf, '\n'); - if (end) - *end = '\0'; - - cnt = mark_printk("%s\n", buf); + written = mark_printk("%s", buf); kfree(buf); + *fpos += written; + + /* don't tell userspace we wrote more - it might confuse them */ + if (written > cnt) + written = cnt; + + return written; +} + +static int tracing_clock_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) + seq_printf(m, + "%s%s%s%s", i ? " " : "", + i == trace_clock_id ? "[" : "", trace_clocks[i].name, + i == trace_clock_id ? "]" : ""); + seq_putc(m, '\n'); + + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char buf[64]; + const char *clockstr; + int i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { + if (strcmp(trace_clocks[i].name, clockstr) == 0) + break; + } + if (i == ARRAY_SIZE(trace_clocks)) + return -EINVAL; + + trace_clock_id = i; + + mutex_lock(&trace_types_lock); + + ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); + if (max_tr.buffer) + ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + + mutex_unlock(&trace_types_lock); + *fpos += cnt; return cnt; } +static int tracing_clock_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_clock_show, NULL); +} + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_ctrl_fops = { .open = tracing_open_generic, .read = tracing_ctrl_read, .write = tracing_ctrl_write, + .llseek = generic_file_llseek, }; static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, .write = tracing_set_trace_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_pipe_fops = { @@ -3348,17 +3607,28 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, + .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations trace_clock_fops = { + .open = tracing_clock_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = tracing_clock_write, }; struct ftrace_buffer_info { @@ -3396,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; - unsigned int pos; ssize_t ret; size_t size; @@ -3414,18 +3683,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; + trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, count, info->cpu, 0); + trace_access_unlock(info->cpu); if (ret < 0) return 0; - pos = ring_buffer_page_len(info->spare); - - if (pos < PAGE_SIZE) - memset(info->spare + pos, 0, PAGE_SIZE - pos); - read: size = PAGE_SIZE - info->read; if (size > count) @@ -3487,7 +3753,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, } /* Pipe buffer operations for a buffer. */ -static struct pipe_buf_operations buffer_pipe_buf_ops = { +static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -3520,11 +3786,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -3533,21 +3799,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, size, i; size_t ret; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + if (*ppos & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: previous read must page-align\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (len & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); - if (len < PAGE_SIZE) - return -EINVAL; + if (len < PAGE_SIZE) { + ret = -EINVAL; + goto out; + } len &= PAGE_MASK; } + trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); - for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3592,6 +3865,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } + trace_access_unlock(info->cpu); spd.nr_pages = i; /* did we read anything? */ @@ -3601,11 +3875,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, else ret = 0; /* TODO: block */ - return ret; + goto out; } ret = splice_to_pipe(pipe, &spd); - + splice_shrink_spd(pipe, &spd); +out: return ret; } @@ -3628,7 +3903,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) - return ENOMEM; + return -ENOMEM; trace_seq_init(s); @@ -3641,9 +3916,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); - trace_seq_printf(s, "nmi dropped: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); @@ -3654,6 +3926,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic, .read = tracing_stats_read, + .llseek = generic_file_llseek, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -3690,6 +3963,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, + .llseek = generic_file_llseek, }; #endif @@ -3746,13 +4020,9 @@ static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); struct dentry *d_cpu; - /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ - char cpu_dir[7]; - - if (cpu > 999 || cpu < 0) - return; + char cpu_dir[30]; /* 30 characters should be more than enough */ - sprintf(cpu_dir, "cpu%ld", cpu); + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); @@ -3821,39 +4091,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - ret = 0; - switch (val) { - case 0: - /* do nothing if already cleared */ - if (!(topt->flags->val & topt->opt->bit)) - break; - - mutex_lock(&trace_types_lock); - if (current_trace->set_flag) - ret = current_trace->set_flag(topt->flags->val, - topt->opt->bit, 0); - mutex_unlock(&trace_types_lock); - if (ret) - return ret; - topt->flags->val &= ~topt->opt->bit; - break; - case 1: - /* do nothing if already set */ - if (topt->flags->val & topt->opt->bit) - break; + if (val != 0 && val != 1) + return -EINVAL; + if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - if (current_trace->set_flag) - ret = current_trace->set_flag(topt->flags->val, - topt->opt->bit, 1); + ret = __set_tracer_option(current_trace, topt->flags, + topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) return ret; - topt->flags->val |= topt->opt->bit; - break; - - default: - return -EINVAL; } *ppos += cnt; @@ -3866,6 +4113,7 @@ static const struct file_operations trace_options_fops = { .open = tracing_open_generic, .read = trace_options_read, .write = trace_options_write, + .llseek = generic_file_llseek, }; static ssize_t @@ -3904,17 +4152,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - switch (val) { - case 0: - trace_flags &= ~(1 << index); - break; - case 1: - trace_flags |= 1 << index; - break; - - default: + if (val != 0 && val != 1) return -EINVAL; - } + set_tracer_flags(1 << index, val); *ppos += cnt; @@ -3925,6 +4165,7 @@ static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, + .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, @@ -4062,6 +4303,8 @@ static __init int tracer_init_debugfs(void) struct dentry *d_tracer; int cpu; + trace_access_lock_init(); + d_tracer = tracing_init_dentry(); trace_create_file("tracing_enabled", 0644, d_tracer, @@ -4082,8 +4325,10 @@ static __init int tracer_init_debugfs(void) trace_create_file("current_tracer", 0644, d_tracer, &global_trace, &set_tracer_fops); +#ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); +#endif trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); @@ -4103,13 +4348,13 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); + trace_create_file("trace_clock", 0644, d_tracer, NULL, + &trace_clock_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif -#ifdef CONFIG_SYSPROF_TRACER - init_tracer_sysprof_debugfs(d_tracer); -#endif create_trace_options_dir(); @@ -4123,7 +4368,7 @@ static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); return NOTIFY_OK; } @@ -4140,7 +4385,7 @@ static int trace_die_handler(struct notifier_block *self, switch (val) { case DIE_OOPS: if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); break; default: break; @@ -4166,7 +4411,7 @@ static struct notifier_block trace_die_notifier = { */ #define KERN_TRACE KERN_EMERG -static void +void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ @@ -4181,10 +4426,18 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -static void __ftrace_dump(bool disable_tracing) +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = current_trace; + iter->cpu_file = TRACE_PIPE_ALL_CPU; +} + +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { - static raw_spinlock_t ftrace_dump_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t ftrace_dump_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4194,7 +4447,7 @@ static void __ftrace_dump(bool disable_tracing) /* only one dump */ local_irq_save(flags); - __raw_spin_lock(&ftrace_dump_lock); + arch_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4205,8 +4458,10 @@ static void __ftrace_dump(bool disable_tracing) if (disable_tracing) ftrace_kill(); + trace_init_global_iter(&iter); + for_each_tracing_cpu(cpu) { - atomic_inc(&global_trace.data[cpu]->disabled); + atomic_inc(&iter.tr->data[cpu]->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -4214,12 +4469,25 @@ static void __ftrace_dump(bool disable_tracing) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - printk(KERN_TRACE "Dumping ftrace buffer:\n"); - /* Simulate the iterator */ iter.tr = &global_trace; iter.trace = current_trace; - iter.cpu_file = TRACE_PIPE_ALL_CPU; + + switch (oops_dump_mode) { + case DUMP_ALL: + iter.cpu_file = TRACE_PIPE_ALL_CPU; + break; + case DUMP_ORIG: + iter.cpu_file = raw_smp_processor_id(); + break; + case DUMP_NONE: + goto out_enable; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + iter.cpu_file = TRACE_PIPE_ALL_CPU; + } + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); /* * We need to stop all tracing on all CPUS to read the @@ -4242,9 +4510,12 @@ static void __ftrace_dump(bool disable_tracing) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (find_next_entry_inc(&iter) != NULL) { - print_trace_line(&iter); - trace_consume(&iter); + if (trace_find_next_entry_inc(&iter) != NULL) { + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(&iter); } trace_printk_seq(&iter.seq); @@ -4255,30 +4526,30 @@ static void __ftrace_dump(bool disable_tracing) else printk(KERN_TRACE "---------------------------------\n"); + out_enable: /* Re-enable tracing if requested */ if (!disable_tracing) { trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&global_trace.data[cpu]->disabled); + atomic_dec(&iter.tr->data[cpu]->disabled); } tracing_on(); } out: - __raw_spin_unlock(&ftrace_dump_lock); + arch_spin_unlock(&ftrace_dump_lock); local_irq_restore(flags); } /* By default: disable tracing after the dump */ -void ftrace_dump(void) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - __ftrace_dump(true); + __ftrace_dump(true, oops_dump_mode); } __init static int tracer_alloc_buffers(void) { - struct trace_array_cpu *data; int ring_buf_size; int i; int ret = -ENOMEM; @@ -4289,9 +4560,6 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) - goto out_free_tracing_cpumask; - /* To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; @@ -4300,7 +4568,6 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); - cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, @@ -4314,31 +4581,26 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } - max_tr.entries = ring_buffer_size(max_tr.buffer); - WARN_ON(max_tr.entries != global_trace.entries); + max_tr.entries = 1; #endif /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { - data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_data, i); + global_trace.data[i] = &per_cpu(global_trace_cpu, i); + max_tr.data[i] = &per_cpu(max_tr_data, i); } trace_init_cmdlines(); register_tracer(&nop_trace); current_trace = &nop_trace; -#ifdef CONFIG_BOOT_TRACER - register_tracer(&boot_tracer); -#endif /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -4350,8 +4612,6 @@ __init static int tracer_alloc_buffers(void) return 0; out_free_cpumask: - free_cpumask_var(tracing_reader_cpumask); -out_free_tracing_cpumask: free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6e735d4771f..9021f8c0c0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -7,11 +7,9 @@ #include <linux/clocksource.h> #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> +#include <linux/tracepoint.h> #include <linux/ftrace.h> -#include <trace/boot.h> -#include <linux/kmemtrace.h> -#include <trace/power.h> - +#include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -24,177 +22,58 @@ enum trace_type { TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, - TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, - TRACE_BOOT_CALL, - TRACE_BOOT_RET, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_HW_BRANCHES, - TRACE_SYSCALL_ENTER, - TRACE_SYSCALL_EXIT, - TRACE_KMEM_ALLOC, - TRACE_KMEM_FREE, - TRACE_POWER, TRACE_BLK, __TRACE_LAST_TYPE, }; -/* - * Function trace entry - function address and parent function addres: - */ -struct ftrace_entry { - struct trace_entry ent; - unsigned long ip; - unsigned long parent_ip; -}; - -/* Function call entry */ -struct ftrace_graph_ent_entry { - struct trace_entry ent; - struct ftrace_graph_ent graph_ent; -}; - -/* Function return entry */ -struct ftrace_graph_ret_entry { - struct trace_entry ent; - struct ftrace_graph_ret ret; -}; -extern struct tracer boot_tracer; - -/* - * Context switch trace entry - which task (and prio) we switched from/to: - */ -struct ctx_switch_entry { - struct trace_entry ent; - unsigned int prev_pid; - unsigned char prev_prio; - unsigned char prev_state; - unsigned int next_pid; - unsigned char next_prio; - unsigned char next_state; - unsigned int next_cpu; -}; - -/* - * Special (free-form) trace entry: - */ -struct special_entry { - struct trace_entry ent; - unsigned long arg1; - unsigned long arg2; - unsigned long arg3; -}; - -/* - * Stack-trace entry: - */ - -#define FTRACE_STACK_ENTRIES 8 - -struct stack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; - -struct userstack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; - -/* - * trace_printk entry: - */ -struct bprint_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; -}; - -struct print_entry { - struct trace_entry ent; - unsigned long ip; - char buf[]; -}; - -#define TRACE_OLD_SIZE 88 -struct trace_field_cont { - unsigned char type; - /* Temporary till we get rid of this completely */ - char buf[TRACE_OLD_SIZE - 1]; -}; +#undef __field +#define __field(type, item) type item; -struct trace_mmiotrace_rw { - struct trace_entry ent; - struct mmiotrace_rw rw; -}; +#undef __field_struct +#define __field_struct(type, item) __field(type, item) -struct trace_mmiotrace_map { - struct trace_entry ent; - struct mmiotrace_map map; -}; +#undef __field_desc +#define __field_desc(type, container, item) -struct trace_boot_call { - struct trace_entry ent; - struct boot_trace_call boot_call; -}; +#undef __array +#define __array(type, item, size) type item[size]; -struct trace_boot_ret { - struct trace_entry ent; - struct boot_trace_ret boot_ret; -}; +#undef __array_desc +#define __array_desc(type, container, item, size) -#define TRACE_FUNC_SIZE 30 -#define TRACE_FILE_SIZE 20 -struct trace_branch { - struct trace_entry ent; - unsigned line; - char func[TRACE_FUNC_SIZE+1]; - char file[TRACE_FILE_SIZE+1]; - char correct; -}; +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; -struct hw_branch_entry { - struct trace_entry ent; - u64 from; - u64 to; -}; +#undef F_STRUCT +#define F_STRUCT(args...) args -struct trace_power { - struct trace_entry ent; - struct power_trace state_data; -}; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; +#undef TP_ARGS +#define TP_ARGS(args...) args -struct kmemtrace_alloc_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; - size_t bytes_req; - size_t bytes_alloc; - gfp_t gfp_flags; - int node; -}; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) -struct kmemtrace_free_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; -}; +#include "trace_entries.h" +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ struct syscall_trace_enter { struct trace_entry ent; int nr; @@ -204,16 +83,26 @@ struct syscall_trace_enter { struct syscall_trace_exit { struct trace_entry ent; int nr; - unsigned long ret; + long ret; }; +struct kprobe_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct kretprobe_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags - * NEED_RESCED - reschedule is requested + * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler */ @@ -236,9 +125,6 @@ struct trace_array_cpu { atomic_t disabled; void *buffer_page; /* ring buffer spare */ - /* these fields get copied into max-trace: */ - unsigned long trace_idx; - unsigned long overrun; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -246,6 +132,7 @@ struct trace_array_cpu { unsigned long nice; unsigned long policy; unsigned long rt_priority; + unsigned long skipped_entries; cycle_t preempt_timestamp; pid_t pid; uid_t uid; @@ -301,28 +188,15 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ - IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ - IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ - IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ - TRACE_KMEM_ALLOC); \ - IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ - TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct syscall_trace_enter, \ - TRACE_SYSCALL_ENTER); \ - IF_ASSIGN(var, ent, struct syscall_trace_exit, \ - TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) @@ -360,6 +234,7 @@ struct tracer_flags { * @pipe_open: called when the trace_pipe file is opened * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released + * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe * @splice_read: override the default splice_read callback on trace_pipe * @selftest: selftest to run on boot (see trace_selftest.c) @@ -378,6 +253,7 @@ struct tracer { void (*pipe_open)(struct trace_iterator *iter); void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); + void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); @@ -398,7 +274,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; - struct tracer_stat *stats; + int use_max_tr; }; @@ -419,16 +295,16 @@ struct dentry *trace_create_file(const char *name, const struct file_operations *fops); struct dentry *tracing_init_dentry(void); -void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; -struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - int type, - unsigned long len, - unsigned long flags, - int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, + int pc); +void trace_buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc); @@ -438,9 +314,13 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); +int trace_empty(struct trace_iterator *iter); + +void *trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); @@ -459,18 +339,21 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); -void trace_special(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); +void trace_default_header(struct seq_file *m); +void print_trace_header(struct seq_file *m, struct trace_iterator *iter); +int trace_empty(struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); +void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -479,35 +362,56 @@ void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); +int is_tracing_stopped(void); +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, +}; + +extern cpumask_var_t __read_mostly tracing_buffer_mask; + +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) extern unsigned long nsecs_to_usecs(unsigned long nsecs); -extern unsigned long tracing_max_latency; extern unsigned long tracing_thresh; +#ifdef CONFIG_TRACER_MAX_TRACE +extern unsigned long tracing_max_latency; + void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); +#endif /* CONFIG_TRACER_MAX_TRACE */ -void __trace_stack(struct trace_array *tr, - unsigned long flags, - int skip, int pc); +#ifdef CONFIG_STACKTRACE +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc); -extern cycle_t ftrace_now(int cpu); +void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, + int pc); -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -typedef void -(*tracer_switch_func_t)(void *private, - void *__rq, - struct task_struct *prev, - struct task_struct *next); - -struct tracer_switch_ops { - tracer_switch_func_t func; - void *private; - struct tracer_switch_ops *next; -}; -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc); +#else +static inline void ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, int skip, int pc) +{ +} + +static inline void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc) +{ +} + +static inline void __trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc) +{ +} +#endif /* CONFIG_STACKTRACE */ + +extern cycle_t ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); @@ -517,6 +421,10 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +extern int ring_buffer_expanded; +extern bool tracing_selftest_disabled; +DECLARE_PER_CPU(int, ftrace_cpu_disabled); + #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); @@ -534,12 +442,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_sysprof(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -548,18 +452,48 @@ extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; +extern int trace_clock_id; + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -extern enum print_line_t print_graph_function(struct trace_iterator *iter); + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 + +extern enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags); +extern void print_graph_headers_flags(struct seq_file *s, u32 flags); extern enum print_line_t trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); +extern void graph_trace_open(struct trace_iterator *iter); +extern void graph_trace_close(struct trace_iterator *iter); +extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, int pc); +extern void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, int pc); + #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; @@ -567,7 +501,7 @@ static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_count || test_tsk_trace_graph(current)) + if (!ftrace_graph_filter_enabled) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -578,10 +512,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } #else -static inline int ftrace_trace_addr(unsigned long addr) -{ - return 1; -} static inline int ftrace_graph_addr(unsigned long addr) { return 1; @@ -589,21 +519,63 @@ static inline int ftrace_graph_addr(unsigned long addr) #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t -print_graph_function(struct trace_iterator *iter) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { return TRACE_TYPE_UNHANDLED; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern struct pid *ftrace_pid_trace; +extern struct list_head ftrace_pids; +#ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { - if (!ftrace_pid_trace) + if (list_empty(&ftrace_pids)) return 1; return test_tsk_trace_trace(task); } +#else +static inline int ftrace_trace_task(struct task_struct *task) +{ + return 1; +} +#endif + +/* + * struct trace_parser - servers for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input length + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); /* * trace_iterator_flags is an enumeration that defines bit @@ -622,19 +594,18 @@ enum trace_iterator_flags { TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, - TRACE_ITER_SCHED_TREE = 0x200, - TRACE_ITER_PRINTK = 0x400, - TRACE_ITER_PREEMPTONLY = 0x800, - TRACE_ITER_BRANCH = 0x1000, - TRACE_ITER_ANNOTATE = 0x2000, - TRACE_ITER_USERSTACKTRACE = 0x4000, - TRACE_ITER_SYM_USEROBJ = 0x8000, - TRACE_ITER_PRINTK_MSGONLY = 0x10000, - TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT = 0x40000, - TRACE_ITER_GLOBAL_CLK = 0x80000, - TRACE_ITER_SLEEP_TIME = 0x100000, - TRACE_ITER_GRAPH_TIME = 0x200000, + TRACE_ITER_PRINTK = 0x200, + TRACE_ITER_PREEMPTONLY = 0x400, + TRACE_ITER_BRANCH = 0x800, + TRACE_ITER_ANNOTATE = 0x1000, + TRACE_ITER_USERSTACKTRACE = 0x2000, + TRACE_ITER_SYM_USEROBJ = 0x4000, + TRACE_ITER_PRINTK_MSGONLY = 0x8000, + TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x20000, + TRACE_ITER_SLEEP_TIME = 0x40000, + TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, }; /* @@ -646,54 +617,6 @@ enum trace_iterator_flags { extern struct tracer nop_trace; -/** - * ftrace_preempt_disable - disable preemption scheduler safe - * - * When tracing can happen inside the scheduler, there exists - * cases that the tracing might happen before the need_resched - * flag is checked. If this happens and the tracer calls - * preempt_enable (after a disable), a schedule might take place - * causing an infinite recursion. - * - * To prevent this, we read the need_resched flag before - * disabling preemption. When we want to enable preemption we - * check the flag, if it is set, then we call preempt_enable_no_resched. - * Otherwise, we call preempt_enable. - * - * The rational for doing the above is that if need_resched is set - * and we have yet to reschedule, we are either in an atomic location - * (where we do not need to check for scheduling) or we are inside - * the scheduler and do not want to resched. - */ -static inline int ftrace_preempt_disable(void) -{ - int resched; - - resched = need_resched(); - preempt_disable_notrace(); - - return resched; -} - -/** - * ftrace_preempt_enable - enable preemption scheduler safe - * @resched: the return value from ftrace_preempt_disable - * - * This is a scheduler safe way to enable preemption and not miss - * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we are either inside an atomic or - * are inside the scheduler (we would have already scheduled - * otherwise). In this case, we do not want to call normal - * preempt_enable, but preempt_enable_no_resched instead. - */ -static inline void ftrace_preempt_enable(int resched) -{ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); -} - #ifdef CONFIG_BRANCH_TRACER extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); @@ -731,6 +654,7 @@ struct ftrace_event_field { struct list_head link; char *name; char *type; + int filter_type; int offset; int size; int is_signed; @@ -746,26 +670,47 @@ struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - void *filter; + struct event_filter *filter; + int nr_events; }; struct filter_pred; +struct regex; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, int val1, int val2); +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + struct filter_pred { - filter_pred_fn_t fn; - u64 val; - char str_val[MAX_FILTER_STR_VAL]; - int str_len; - char *field_name; - int offset; - int not; - int op; - int pop_n; + filter_pred_fn_t fn; + u64 val; + struct regex regex; + char *field_name; + int offset; + int not; + int op; + int pop_n; }; +extern struct list_head ftrace_common_fields; + +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, @@ -774,13 +719,18 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); +extern int filter_assign_type(const char *type); + +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } @@ -788,46 +738,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - default: \ - break; \ - } \ - \ - return match; \ -} - -#define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - u##size *addr = (u##size *)(event + pred->offset); \ - u##size val = (u##size)pred->val; \ - int match; \ - \ - match = (val == *addr) ^ pred->not; \ - \ - return match; \ -} +extern void trace_event_enable_cmd_record(bool enable); extern struct mutex event_mutex; extern struct list_head ftrace_events; @@ -835,11 +746,13 @@ extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ - extern struct ftrace_event_call event_##call; -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) -#include "trace_event_types.h" +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ + extern struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_##call; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#include "trace_entries.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index a29ef23ffb4..00000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null @@ -1,179 +0,0 @@ -/* - * ring buffer based initcalls tracer - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/kallsyms.h> -#include <linux/time.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *boot_trace; -static bool pre_initcalls_finished; - -/* Tells the boot tracer that the pre_smp_initcalls are finished. - * So we are ready . - * It doesn't enable sched events tracing however. - * You have to call enable_boot_trace to do so. - */ -void start_boot_trace(void) -{ - pre_initcalls_finished = true; -} - -void enable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_start_sched_switch_record(); -} - -void disable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_stop_sched_switch_record(); -} - -static int boot_trace_init(struct trace_array *tr) -{ - int cpu; - boot_trace = tr; - - if (!tr) - return 0; - - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); - - tracing_sched_switch_assign_trace(tr); - return 0; -} - -static enum print_line_t -initcall_call_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_call *field; - struct boot_trace_call *call; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - call = &field->boot_call; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - (unsigned long)ts, nsec_rem, call->func, call->caller); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -initcall_ret_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_ret *field; - struct boot_trace_ret *init_ret; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - init_ret = &field->boot_ret; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %llu msecs\n", - (unsigned long) ts, - nsec_rem, - init_ret->func, init_ret->result, init_ret->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t initcall_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - switch (entry->type) { - case TRACE_BOOT_CALL: - return initcall_call_print_line(iter); - case TRACE_BOOT_RET: - return initcall_ret_print_line(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -struct tracer boot_tracer __read_mostly = -{ - .name = "initcall", - .init = boot_trace_init, - .reset = tracing_reset_online_cpus, - .print_line = initcall_print_line, -}; - -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) -{ - struct ring_buffer_event *event; - struct trace_boot_call *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - /* Get its name now since this function could - * disappear because it is in the .init section. - */ - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_call = *bt; - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} - -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) -{ - struct ring_buffer_event *event; - struct trace_boot_ret *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_ret = *bt; - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7a7a9fd249a..8d3538b4ea5 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; + struct ring_buffer *buffer; unsigned long flags; int cpu, pc; const char *p; @@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) goto out; pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) goto out; @@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); @@ -141,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr) } static enum print_line_t trace_branch_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct trace_branch *field; @@ -165,9 +167,13 @@ static void branch_print_header(struct seq_file *s) " |\n"); } +static struct trace_event_functions trace_branch_funcs = { + .trace = trace_branch_print, +}; + static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, - .trace = trace_branch_print, + .funcs = &trace_branch_funcs, }; static struct tracer branch_trace __read_mostly = @@ -305,8 +311,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) return -1; if (percent_a > percent_b) return 1; - else - return 0; + + if (a->incorrect < b->incorrect) + return -1; + if (a->incorrect > b->incorrect) + return 1; + + /* + * Since the above shows worse (incorrect) cases + * first, we continue that by showing best (correct) + * cases last. + */ + if (a->correct > b->correct) + return -1; + if (a->correct < b->correct) + return 1; + + return 0; } static struct tracer_stat annotated_branch_stats = { diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index b588fd81f7f..685a67d55db 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -13,6 +13,7 @@ * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> +#include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> @@ -20,6 +21,8 @@ #include <linux/ktime.h> #include <linux/trace_clock.h> +#include "trace.h" + /* * trace_clock_local(): the simplest and least coherent tracing clock. * @@ -28,7 +31,6 @@ */ u64 notrace trace_clock_local(void) { - unsigned long flags; u64 clock; /* @@ -36,9 +38,9 @@ u64 notrace trace_clock_local(void) * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - raw_local_irq_save(flags); + preempt_disable_notrace(); clock = sched_clock(); - raw_local_irq_restore(flags); + preempt_enable_notrace(); return clock; } @@ -53,7 +55,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } @@ -66,10 +68,14 @@ u64 notrace trace_clock(void) * Used by plugins that need globally coherent timestamps. */ -static u64 prev_trace_clock_time; - -static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +/* keep prev_time and lock in the same cacheline. */ +static struct { + u64 prev_time; + arch_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, + }; u64 notrace trace_clock_global(void) { @@ -77,7 +83,7 @@ u64 notrace trace_clock_global(void) int this_cpu; u64 now; - raw_local_irq_save(flags); + local_irq_save(flags); this_cpu = raw_smp_processor_id(); now = cpu_clock(this_cpu); @@ -88,22 +94,22 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_lock); + arch_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_trace_clock_time+1, to make sure + * my_scd->clock to prev_time+1, to make sure * we start ticking with the local clock from now on? */ - if ((s64)(now - prev_trace_clock_time) < 0) - now = prev_trace_clock_time + 1; + if ((s64)(now - trace_clock_struct.prev_time) < 0) + now = trace_clock_struct.prev_time + 1; - prev_trace_clock_time = now; + trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_lock); + arch_spin_unlock(&trace_clock_struct.lock); out: - raw_local_irq_restore(flags); + local_irq_restore(flags); return now; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 00000000000..e3dfecaf13e --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,276 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. + * - __array( type, item, size ) + * This is equivalent to declaring + * type item[size]; + * in the structure. + * + * * for structures within structures, the format of the internal + * structure is layed out. This allows the internal structure + * to be deciphered for the format file. Although these macros + * may become out of sync with the internal structure, they + * will create a compile error if it happens. Since the + * internel structures are just tracing helpers, this is not + * an issue. + * + * When an internal structure is used, it should use: + * + * __field_struct( type, item ) + * + * instead of __field. This will prevent it from being shown in + * the output file. The fields in the structure should use. + * + * __field_desc( type, container, item ) + * __array_desc( type, container, item, len ) + * + * type, item and len are the same as __field and __array, but + * container is added. This is the name of the item in + * __field_struct that this is describing. + * + * + * @print: the print format shown to users in the format file. + */ + +/* + * Function trace entry - function address and parent function addres: + */ +FTRACE_ENTRY(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + ), + + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_desc( unsigned long, graph_ent, func ) + __field_desc( int, graph_ent, depth ) + ), + + F_printk("--> %lx (%d)", __entry->func, __entry->depth) +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long long, ret, calltime) + __field_desc( unsigned long long, ret, rettime ) + __field_desc( unsigned long, ret, overrun ) + __field_desc( int, ret, depth ) + ), + + F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", + __entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth) +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it. + */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) \ + __field( unsigned int, next_cpu ) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. + */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%08lx fmt:%p", + __entry->ip, __entry->fmt) +); + +FTRACE_ENTRY(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%08lx %s", + __entry->ip, __entry->buf) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) + ), + + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) + ), + + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode) +); + + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, TRACE_FILE_SIZE+1 ) + __field( char, correct ) + ), + + F_printk("%u:%s:%s (%u)", + __entry->line, + __entry->func, __entry->file, __entry->correct) +); + diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c new file mode 100644 index 00000000000..19a359d5e6d --- /dev/null +++ b/kernel/trace/trace_event_perf.c @@ -0,0 +1,216 @@ +/* + * trace event based perf event profiling/tracing + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> + */ + +#include <linux/module.h> +#include <linux/kprobes.h> +#include "trace.h" + +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; + +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; + +/* Count the events in use (per event id, not per instance) */ +static int total_ref_count; + +static int perf_trace_event_perm(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + /* No tracing, just counting, so no obvious leak */ + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + return 0; + + /* Some events are ok to be traced by non-root users... */ + if (p_event->attach_state == PERF_ATTACH_TASK) { + if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) + return 0; + } + + /* + * ...otherwise raw tracepoint data can be a severe data leak, + * only allow root to have these. + */ + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + struct hlist_head __percpu *list; + int ret; + int cpu; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + p_event->tp_event = tp_event; + if (tp_event->perf_refcount++ > 0) + return 0; + + ret = -ENOMEM; + + list = alloc_percpu(struct hlist_head); + if (!list) + goto fail; + + for_each_possible_cpu(cpu) + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); + + tp_event->perf_events = list; + + if (!total_ref_count) { + char __percpu *buf; + int i; + + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + buf = (char __percpu *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail; + + perf_trace_buf[i] = buf; + } + } + + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + if (ret) + goto fail; + + total_ref_count++; + return 0; + +fail: + if (!total_ref_count) { + int i; + + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } + + if (!--tp_event->perf_refcount) { + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + } + + return ret; +} + +int perf_trace_init(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event; + int event_id = p_event->attr.config; + int ret = -EINVAL; + + mutex_lock(&event_mutex); + list_for_each_entry(tp_event, &ftrace_events, list) { + if (tp_event->event.type == event_id && + tp_event->class && tp_event->class->reg && + try_module_get(tp_event->mod)) { + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + module_put(tp_event->mod); + break; + } + } + mutex_unlock(&event_mutex); + + return ret; +} + +int perf_trace_add(struct perf_event *p_event, int flags) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; + struct hlist_head *list; + + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) + return -EINVAL; + + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + + list = this_cpu_ptr(pcpu_list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + + return 0; +} + +void perf_trace_del(struct perf_event *p_event, int flags) +{ + hlist_del_rcu(&p_event->hlist_entry); +} + +void perf_trace_destroy(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + mutex_lock(&event_mutex); + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); + mutex_unlock(&event_mutex); +} + +__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs *regs, int *rctxp) +{ + struct trace_entry *entry; + unsigned long flags; + char *raw_data; + int pc; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + + pc = preempt_count(); + + *rctxp = perf_swevent_get_recursion_context(); + if (*rctxp < 0) + return NULL; + + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + + /* zero the dead bytes from align to not leak stack to user */ + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + + entry = (struct trace_entry *)raw_data; + local_save_flags(flags); + tracing_generic_entry_update(entry, flags, pc); + entry->type = type; + + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c deleted file mode 100644 index 5b5895afecf..00000000000 --- a/kernel/trace/trace_event_profile.c +++ /dev/null @@ -1,39 +0,0 @@ -/* - * trace event based perf counter profiling - * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> - * - */ - -#include "trace.h" - -int ftrace_profile_enable(int event_id) -{ - struct ftrace_event_call *event; - int ret = -EINVAL; - - mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { - ret = event->profile_enable(event); - break; - } - } - mutex_unlock(&event_mutex); - - return ret; -} - -void ftrace_profile_disable(int event_id) -{ - struct ftrace_event_call *event; - - mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { - event->profile_disable(event); - break; - } - } - mutex_unlock(&event_mutex); -} diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h deleted file mode 100644 index 5e32e375134..00000000000 --- a/kernel/trace/trace_event_types.h +++ /dev/null @@ -1,175 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM ftrace - -/* - * We cheat and use the proto type field as the ID - * and args as the entry type (minus 'struct') - */ -TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned long, parent_ip, parent_ip) - ), - TP_RAW_FMT(" %lx <-- %lx") -); - -TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, - ftrace_graph_ent_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, graph_ent.func, func) - TRACE_FIELD(int, graph_ent.depth, depth) - ), - TP_RAW_FMT("--> %lx (%d)") -); - -TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, - ftrace_graph_ret_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ret.func, func) - TRACE_FIELD(int, ret.depth, depth) - ), - TP_RAW_FMT("<-- %lx (%d)") -); - -TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, arg1, arg1) - TRACE_FIELD(unsigned long, arg2, arg2) - TRACE_FIELD(unsigned long, arg3, arg3) - ), - TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") -); - -/* - * Stack-trace entry: - */ - -/* #define FTRACE_STACK_ENTRIES 8 */ - -TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(char *, fmt, fmt) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, - TRACE_FUNC_SIZE+1, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, - TRACE_FUNC_SIZE+1, file) - TRACE_FIELD(char, correct, correct) - ), - TP_RAW_FMT("%u:%s:%s (%u)") -); - -TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(u64, from, from) - TRACE_FIELD(u64, to, to) - ), - TP_RAW_FMT("from: %llx to: %llx") -); - -TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, - TRACE_STRUCT( - TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) - TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) - TRACE_FIELD(int, state_data.type, type) - TRACE_FIELD(int, state_data.state, state) - ), - TP_RAW_FMT("%llx->%llx type:%u state:%u") -); - -TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - TRACE_FIELD(size_t, bytes_req, bytes_req) - TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) - TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) - TRACE_FIELD(int, node, node) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" - " flags:%x node:%d") -); - -TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p") -); - -#undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..35fde09b81d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,18 +15,38 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> +#include <linux/slab.h> #include <linux/delay.h> +#include <asm/setup.h> + #include "trace_output.h" +#undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_storage_mutex); +EXPORT_SYMBOL_GPL(event_storage_mutex); + +char event_storage[EVENT_STORAGE_SIZE]; +EXPORT_SYMBOL_GPL(event_storage); + LIST_HEAD(ftrace_events); +LIST_HEAD(ftrace_common_fields); -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size, int is_signed) +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call) +{ + if (!event_call->class->get_fields) + return &event_call->class->fields; + return event_call->class->get_fields(event_call); +} + +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) { struct ftrace_event_field *field; @@ -42,31 +62,72 @@ int trace_define_field(struct ftrace_event_call *call, char *type, if (!field->type) goto err; + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + else + field->filter_type = filter_type; + field->offset = offset; field->size = size; field->is_signed = is_signed; - list_add(&field->link, &call->fields); + + list_add(&field->link, head); return 0; err: - if (field) { + if (field) kfree(field->name); - kfree(field->type); - } kfree(field); return -ENOMEM; } + +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type); +} EXPORT_SYMBOL_GPL(trace_define_field); -#ifdef CONFIG_MODULES +#define __common_field(type, item) \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +static int trace_define_common_fields(void) +{ + int ret; + struct trace_entry ent; + + __common_field(unsigned short, type); + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); + __common_field(int, lock_depth); + + return ret; +} -static void trace_destroy_fields(struct ftrace_event_call *call) +void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; + struct list_head *head; - list_for_each_entry_safe(field, next, &call->fields, link) { + head = trace_get_fields(call); + list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); kfree(field->type); kfree(field->name); @@ -74,27 +135,102 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -#endif /* CONFIG_MODULES */ +int trace_event_raw_init(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(&call->event); + if (!id) + return -ENODEV; + + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_raw_init); + +int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->name, + call->class->probe, + call); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->probe, + call); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->name, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->perf_probe, + call); + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } else { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + } + mutex_unlock(&event_mutex); +} -static void ftrace_event_enable_disable(struct ftrace_event_call *call, +static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { + int ret = 0; + switch (enable) { case 0: - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); + if (call->flags & TRACE_EVENT_FL_ENABLED) { + call->flags &= ~TRACE_EVENT_FL_ENABLED; + if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: - if (!call->enabled) { - call->enabled = 1; - tracing_start_cmdline_record(); - call->regfunc(); + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } + ret = call->class->reg(call, TRACE_REG_REGISTER); + if (ret) { + tracing_stop_cmdline_record(); + pr_info("event trace: Could not enable event " + "%s\n", call->name); + break; + } + call->flags |= TRACE_EVENT_FL_ENABLED; } break; } + + return ret; } static void ftrace_clear_events(void) @@ -120,15 +256,15 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->name || !call->class || !call->class->reg) continue; if (match && strcmp(match, call->name) != 0 && - strcmp(match, call->system) != 0) + strcmp(match, call->class->system) != 0) continue; - if (sub && strcmp(sub, call->system) != 0) + if (sub && strcmp(sub, call->class->system) != 0) continue; if (event && strcmp(event, call->name) != 0) @@ -198,73 +334,38 @@ static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - size_t read = 0; - int i, set = 1; - ssize_t ret; - char *buf; - char ch; + struct trace_parser parser; + ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; ret = tracing_update_buffers(); if (ret < 0) return ret; - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - } - - /* Only white space found? */ - if (isspace(ch)) { - file->f_pos += read; - ret = read; - return ret; - } - - buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); - if (!buf) + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; - if (cnt > EVENT_BUF_SIZE) - cnt = EVENT_BUF_SIZE; + read = trace_get_user(&parser, ubuf, cnt, ppos); - i = 0; - while (cnt && !isspace(ch)) { - if (!i && ch == '!') + if (read >= 0 && trace_parser_loaded((&parser))) { + int set = 1; + + if (*parser.buffer == '!') set = 0; - else - buf[i++] = ch; - ret = get_user(ch, ubuf++); + parser.buffer[parser.idx] = 0; + + ret = ftrace_set_clr_event(parser.buffer + !set, set); if (ret) - goto out_free; - read++; - cnt--; + goto out_put; } - buf[i] = 0; - - file->f_pos += read; - - ret = ftrace_set_clr_event(buf, set); - if (ret) - goto out_free; ret = read; - out_free: - kfree(buf); + out_put: + trace_parser_put(&parser); return ret; } @@ -272,78 +373,75 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - for (;;) { - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - + list_for_each_entry_continue(call, &ftrace_events, list) { /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->regfunc) - break; - - list = list->next; + if (call->class && call->class->reg) + return call; } - m->private = list->next; - - return call; + return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return t_next(m, NULL, pos); + + call = list_entry(&ftrace_events, struct ftrace_event_call, list); + for (l = 0; l <= *pos; ) { + call = t_next(m, call, &l); + if (!call) + break; + } + return call; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct list_head *list = m->private; - struct ftrace_event_call *call; + struct ftrace_event_call *call = v; (*pos)++; - retry: - if (list == &ftrace_events) - return NULL; - - call = list_entry(list, struct ftrace_event_call, list); - - if (!call->enabled) { - list = list->next; - goto retry; + list_for_each_entry_continue(call, &ftrace_events, list) { + if (call->flags & TRACE_EVENT_FL_ENABLED) + return call; } - m->private = list->next; - - return call; + return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return s_next(m, NULL, pos); + + call = list_entry(&ftrace_events, struct ftrace_event_call, list); + for (l = 0; l <= *pos; ) { + call = s_next(m, call, &l); + if (!call) + break; + } + return call; } static int t_show(struct seq_file *m, void *v) { struct ftrace_event_call *call = v; - if (strcmp(call->system, TRACE_SYSTEM) != 0) - seq_printf(m, "%s:", call->system); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->class->system); seq_printf(m, "%s\n", call->name); return 0; @@ -360,7 +458,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && - !(file->f_flags & O_APPEND)) + (file->f_flags & O_TRUNC)) ftrace_clear_events(); seq_ops = inode->i_private; @@ -374,7 +472,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled) + if (call->flags & TRACE_EVENT_FL_ENABLED) buf = "1\n"; else buf = "0\n"; @@ -411,7 +509,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: case 1: mutex_lock(&event_mutex); - ftrace_event_enable_disable(call, val); + ret = ftrace_event_enable_disable(call, val); mutex_unlock(&event_mutex); break; @@ -421,7 +519,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; - return cnt; + return ret ? ret : cnt; } static ssize_t @@ -437,10 +535,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->name || !call->class || !call->class->reg) continue; - if (system && strcmp(call->system, system) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -448,7 +546,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!call->enabled); + set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -506,74 +604,146 @@ out: return ret; } -extern char *__bad_type_size(void); +enum { + FORMAT_HEADER = 1, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, +}; + +static void *f_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); + + (*pos)++; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + if (unlikely(list_empty(common_head))) + return NULL; + + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; + + case FORMAT_FIELD_SEPERATOR: + if (unlikely(list_empty(head))) + return NULL; + + field = list_entry(head->prev, struct ftrace_event_field, link); + return field; -#undef FIELD -#define FIELD(type, name) \ - sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ - #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name) + case FORMAT_PRINTFMT: + /* all done */ + return NULL; + } + + field = v; + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) + return (void *)FORMAT_PRINTFMT; -static int trace_write_header(struct trace_seq *s) + field = list_entry(field->link.prev, struct ftrace_event_field, link); + + return field; +} + +static void *f_start(struct seq_file *m, loff_t *pos) { - struct trace_entry field; - - /* struct trace_entry */ - return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, tgid)); + loff_t l = 0; + void *p; + + /* Start by showing the header */ + if (!*pos) + return (void *)FORMAT_HEADER; + + p = (void *)FORMAT_HEADER; + do { + p = f_next(m, p, &l); + } while (p && l < *pos); + + return p; } -static ssize_t -event_format_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +static int f_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = filp->private_data; - struct trace_seq *s; - char *buf; - int r; + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + const char *array_descriptor; - if (*ppos) + switch ((unsigned long)v) { + case FORMAT_HEADER: + seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "ID: %d\n", call->event.type); + seq_printf(m, "format:\n"); return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; - trace_seq_init(s); + case FORMAT_PRINTFMT: + seq_printf(m, "\nprint fmt: %s\n", + call->print_fmt); + return 0; + } - /* If any of the first writes fail, so will the show_format. */ + field = v; - trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->id); - trace_seq_printf(s, "format:\n"); - trace_write_header(s); + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + array_descriptor = strchr(field->type, '['); - r = call->show_format(s); - if (!r) { - /* - * ug! The format output is bigger than a PAGE!! - */ - buf = "FORMAT TOO BIG\n"; - r = simple_read_from_buffer(ubuf, cnt, ppos, - buf, strlen(buf)); - goto out; - } + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - out: - kfree(s); - return r; + if (!array_descriptor) + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + + return 0; +} + +static void f_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations trace_format_seq_ops = { + .start = f_start, + .next = f_next, + .stop = f_stop, + .show = f_show, +}; + +static int trace_format_open(struct inode *inode, struct file *file) +{ + struct ftrace_event_call *call = inode->i_private; + struct seq_file *m; + int ret; + + ret = seq_open(file, &trace_format_seq_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = call; + + return 0; } static ssize_t @@ -591,7 +761,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return -ENOMEM; trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->id); + trace_seq_printf(s, "%d\n", call->event.type); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); @@ -768,39 +938,47 @@ static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, .write = event_enable_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_event_format_fops = { - .open = tracing_open_generic, - .read = event_format_read, + .open = trace_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static const struct file_operations ftrace_event_id_fops = { .open = tracing_open_generic, .read = event_id_read, + .llseek = default_llseek, }; static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_generic, .read = event_filter_read, .write = event_filter_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_subsystem_filter_fops = { .open = tracing_open_generic, .read = subsystem_filter_read, .write = subsystem_filter_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_system_enable_fops = { .open = tracing_open_generic, .read = system_enable_read, .write = system_enable_write, + .llseek = default_llseek, }; static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, + .llseek = default_llseek, }; static struct dentry *event_trace_events_dir(void) @@ -833,8 +1011,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + if (strcmp(system->name, name) == 0) { + system->nr_events++; return system->entry; + } } /* need to create new entry */ @@ -853,6 +1033,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } + system->nr_events = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -880,9 +1061,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - entry = trace_create_file("enable", 0644, system->entry, - (void *)system->name, - &ftrace_system_enable_fops); + trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } @@ -894,24 +1075,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { - struct dentry *entry; + struct list_head *head; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->system, d_events); - - if (call->raw_init) { - ret = call->raw_init(); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } - } + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + d_events = event_subsystem_dir(call->class->system, d_events); call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { @@ -920,35 +1092,138 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->regfunc) - entry = trace_create_file("enable", 0644, call->dir, call, - enable); + if (call->class->reg) + trace_create_file("enable", 0644, call->dir, call, + enable); - if (call->id) - entry = trace_create_file("id", 0444, call->dir, call, - id); +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && call->class->reg) + trace_create_file("id", 0444, call->dir, call, + id); +#endif - if (call->define_fields) { - ret = call->define_fields(); + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = call->class->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); return ret; } - entry = trace_create_file("filter", 0644, call->dir, call, - filter); } + trace_create_file("filter", 0644, call->dir, call, + filter); - /* A trace may not want to export its format */ - if (!call->show_format) - return 0; - - entry = trace_create_file("format", 0444, call->dir, call, - format); + trace_create_file("format", 0444, call->dir, call, + format); return 0; } +static int +__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) +{ + struct dentry *d_events; + int ret; + + /* The linker may leave blanks */ + if (!call->name) + return -EINVAL; + + if (call->class->raw_init) { + ret = call->class->raw_init(call); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace events/%s\n", + call->name); + return ret; + } + } + + d_events = event_trace_events_dir(); + if (!d_events) + return -ENOENT; + + ret = event_create_dir(call, d_events, id, enable, filter, format); + if (!ret) + list_add(&call->list, &ftrace_events); + call->mod = mod; + + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + int ret; + mutex_lock(&event_mutex); + ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + mutex_unlock(&event_mutex); + return ret; +} + +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + struct event_filter *filter = system->filter; + + debugfs_remove_recursive(system->entry); + list_del(&system->list); + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); + } + break; + } + } +} + +/* + * Must be called under locking both of event_mutex and trace_event_mutex. + */ +static void __trace_remove_event_call(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + remove_subsystem_dir(call->class->system); +} + +/* Remove an event_call */ +void trace_remove_event_call(struct ftrace_event_call *call) +{ + mutex_lock(&event_mutex); + down_write(&trace_event_mutex); + __trace_remove_event_call(call); + up_write(&trace_event_mutex); + mutex_unlock(&event_mutex); +} + #define for_each_event(event, start, end) \ for (event = start; \ (unsigned long)event < (unsigned long)end; \ @@ -1010,7 +1285,6 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; - struct dentry *d_events; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1018,29 +1292,14 @@ static void trace_module_add_events(struct module *mod) if (start == end) return; - d_events = event_trace_events_dir(); - if (!d_events) + file_ops = trace_create_file_ops(mod); + if (!file_ops) return; for_each_event(call, start, end) { - /* The linker may leave blanks */ - if (!call->name) - continue; - - /* - * This module has events, create file ops for this module - * if not already done. - */ - if (!file_ops) { - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - } - call->mod = mod; - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + __trace_add_event_call(call, mod, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); } } @@ -1054,13 +1313,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); + __trace_remove_event_call(call); } } @@ -1109,7 +1362,7 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block trace_module_nb = { +static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; @@ -1117,6 +1370,18 @@ struct notifier_block trace_module_nb = { extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; + +static __init int setup_trace_event(char *str) +{ + strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = 1; + tracing_selftest_disabled = 1; + + return 1; +} +__setup("trace_event=", setup_trace_event); + static __init int event_trace_init(void) { struct ftrace_event_call *call; @@ -1124,6 +1389,8 @@ static __init int event_trace_init(void) struct dentry *entry; struct dentry *d_events; int ret; + char *buf = bootup_event_buf; + char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -1159,14 +1426,27 @@ static __init int event_trace_init(void) trace_create_file("enable", 0644, d_events, NULL, &ftrace_system_enable_fops); + if (trace_define_common_fields()) + pr_warning("tracing: Failed to allocate common fields"); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - /* The linker may leave blanks */ - if (!call->name) + __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + } + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) continue; - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warning("Failed to enable trace event: %s\n", token); } ret = register_module_notifier(&trace_module_nb); @@ -1241,17 +1521,29 @@ static __init void event_trace_self_tests(void) list_for_each_entry(call, &ftrace_events, list) { - /* Only test those that have a regfunc */ - if (!call->regfunc) + /* Only test those that have a probe */ + if (!call->class || !call->class->probe) continue; +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->class->system && + strcmp(call->class->system, "syscalls") == 0) + continue; +#endif + pr_info("Testing event %s: ", call->name); /* * If an event is already enabled, someone is using * it and the self test should not be on. */ - if (call->enabled) { + if (call->flags & TRACE_EVENT_FL_ENABLED) { pr_warning("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; @@ -1318,30 +1610,31 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) { struct ring_buffer_event *event; + struct ring_buffer *buffer; struct ftrace_entry *entry; unsigned long flags; long disabled; - int resched; int cpu; int pc; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; local_save_flags(flags); - event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + event = trace_current_buffer_lock_reserve(&buffer, + TRACE_FN, sizeof(*entry), flags, pc); if (!event) goto out; @@ -1349,11 +1642,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_nowake_buffer_unlock_commit(event, flags, pc); + trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); - ftrace_preempt_enable(resched); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = @@ -1376,10 +1669,10 @@ static __init void event_trace_self_test_with_function(void) static __init int event_trace_self_tests_init(void) { - - event_trace_self_tests(); - - event_trace_self_test_with_function(); + if (!tracing_selftest_disabled) { + event_trace_self_tests(); + event_trace_self_test_with_function(); + } return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 936c621bbf4..36d40104b17 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -18,11 +18,11 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ -#include <linux/debugfs.h> -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> +#include <linux/perf_event.h> +#include <linux/slab.h> #include "trace.h" #include "trace_output.h" @@ -31,6 +31,7 @@ enum filter_op_ids { OP_OR, OP_AND, + OP_GLOB, OP_NE, OP_EQ, OP_LT, @@ -48,16 +49,17 @@ struct filter_op { }; static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, }; enum { @@ -121,6 +123,47 @@ struct filter_parse_state { } operand; }; +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + DEFINE_COMPARISON_PRED(s64); DEFINE_COMPARISON_PRED(u64); DEFINE_COMPARISON_PRED(s32); @@ -156,9 +199,24 @@ static int filter_pred_string(struct filter_pred *pred, void *event, char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; + + return match; +} + +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event, + int val1, int val2) +{ + char **addr = (char **)(event + pred->offset); + int cmp, match; + int len = strlen(*addr) + 1; /* including tailing '\0' */ + + cmp = pred->regex.match(*addr, &pred->regex, len); + + match = cmp ^ pred->not; return match; } @@ -176,13 +234,15 @@ static int filter_pred_string(struct filter_pred *pred, void *event, static int filter_pred_strloc(struct filter_pred *pred, void *event, int val1, int val2) { - unsigned short str_loc = *(unsigned short *)(event + pred->offset); + u32 str_item = *(u32 *)(event + pred->offset); + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(addr, &pred->regex, str_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -193,10 +253,133 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } +/* + * regex_match_foo - Basic regex callbacks + * + * @str: the string to be searched + * @r: the regex structure containing the pattern string + * @len: the length of the string to be searched (including '\0') + * + * Note: + * - @str might not be NULL-terminated if it's of type DYN_STRING + * or STATIC_STRING + */ + +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, r->len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strnstr(str, r->pattern, len)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + int strlen = len - 1; + + if (strlen >= r->len && + memcmp(str + strlen - r->len, r->pattern, r->len) == 0) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; +} + /* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +int filter_match_preds(struct event_filter *filter, void *rec) { - struct event_filter *filter = call->filter; int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; @@ -293,7 +476,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) struct event_filter *filter = call->filter; mutex_lock(&event_mutex); - if (filter->filter_string) + if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); @@ -306,7 +489,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, struct event_filter *filter = system->filter; mutex_lock(&event_mutex); - if (filter->filter_string) + if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else trace_seq_printf(s, "none\n"); @@ -314,11 +497,11 @@ void print_subsystem_event_filter(struct event_subsystem *system, } static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) +__find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; - list_for_each_entry(field, &call->fields, link) { + list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; } @@ -326,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static void filter_free_pred(struct filter_pred *pred) { if (!pred) @@ -339,7 +536,7 @@ static void filter_clear_pred(struct filter_pred *pred) { kfree(pred->field_name); pred->field_name = NULL; - pred->str_len = 0; + pred->regex.len = 0; } static int filter_set_pred(struct filter_pred *dest, @@ -362,18 +559,20 @@ static void filter_disable_preds(struct ftrace_event_call *call) struct event_filter *filter = call->filter; int i; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) filter->preds[i]->fn = filter_pred_none; } -void destroy_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; + if (!filter) + return; + for (i = 0; i < MAX_FILTER_PRED; i++) { if (filter->preds[i]) filter_free_pred(filter->preds[i]); @@ -381,20 +580,25 @@ void destroy_preds(struct ftrace_event_call *call) kfree(filter->preds); kfree(filter->filter_string); kfree(filter); +} + +void destroy_preds(struct ftrace_event_call *call) +{ + __free_preds(call->filter); call->filter = NULL; + call->flags &= ~TRACE_EVENT_FL_FILTERED; } -int init_preds(struct ftrace_event_call *call) +static struct event_filter *__alloc_preds(void) { struct event_filter *filter; struct filter_pred *pred; int i; - filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!call->filter) - return -ENOMEM; + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); - call->filter_active = 0; filter->n_preds = 0; filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); @@ -409,46 +613,62 @@ int init_preds(struct ftrace_event_call *call) filter->preds[i] = pred; } - return 0; + return filter; oom: - destroy_preds(call); + __free_preds(filter); + return ERR_PTR(-ENOMEM); +} + +static int init_preds(struct ftrace_event_call *call) +{ + if (call->filter) + return 0; + + call->flags &= ~TRACE_EVENT_FL_FILTERED; + call->filter = __alloc_preds(); + if (IS_ERR(call->filter)) + return PTR_ERR(call->filter); - return -ENOMEM; + return 0; } -EXPORT_SYMBOL_GPL(init_preds); -static void filter_free_subsystem_preds(struct event_subsystem *system) +static int init_subsystem_preds(struct event_subsystem *system) { - struct event_filter *filter = system->filter; struct ftrace_event_call *call; - int i; + int err; - if (filter->n_preds) { - for (i = 0; i < filter->n_preds; i++) - filter_free_pred(filter->preds[i]); - kfree(filter->preds); - filter->preds = NULL; - filter->n_preds = 0; + list_for_each_entry(call, &ftrace_events, list) { + if (strcmp(call->class->system, system->name) != 0) + continue; + + err = init_preds(call); + if (err) + return err; } + return 0; +} + +static void filter_free_subsystem_preds(struct event_subsystem *system) +{ + struct ftrace_event_call *call; + list_for_each_entry(call, &ftrace_events, list) { - if (!call->define_fields) + if (strcmp(call->class->system, system->name) != 0) continue; - if (!strcmp(call->system, system->name)) { - filter_disable_preds(call); - remove_filter_string(call->filter); - } + filter_disable_preds(call); + remove_filter_string(call->filter); } } static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, filter_pred_fn_t fn) { - struct event_filter *filter = call->filter; int idx, err; if (filter->n_preds == MAX_FILTER_PRED) { @@ -463,17 +683,11 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return err; filter->n_preds++; - call->filter_active = 1; return 0; } -enum { - FILTER_STATIC_STRING = 1, - FILTER_DYN_STRING -}; - -static int is_string_field(const char *type) +int filter_assign_type(const char *type) { if (strstr(type, "__data_loc") && strstr(type, "char")) return FILTER_DYN_STRING; @@ -481,12 +695,22 @@ static int is_string_field(const char *type) if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; - return 0; + return FILTER_OTHER; +} + +static bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; } static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) return 0; return 1; @@ -537,22 +761,25 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, - struct filter_pred *pred) + struct event_filter *filter, + struct filter_pred *pred, + bool dry_run) { struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; - int string_type; int ret; pred->fn = filter_pred_none; if (pred->op == OP_AND) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_and); + fn = filter_pred_and; + goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - return filter_add_pred_fn(ps, call, pred, filter_pred_or); + fn = filter_pred_or; + goto add_pred_fn; } field = find_event_field(call, pred->field_name); @@ -568,83 +795,42 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } - string_type = is_string_field(field->type); - if (string_type) { - if (string_type == FILTER_STATIC_STRING) + if (is_string_field(field)) { + filter_build_regex(pred); + + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; - else + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; - pred->str_len = field->size; - if (pred->op == OP_NE) - pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); + else + fn = filter_pred_pchar; } else { if (field->is_signed) - ret = strict_strtoll(pred->str_val, 0, &val); + ret = strict_strtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->str_val, 0, &val); + ret = strict_strtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; } pred->val = val; - } - fn = select_comparison_fn(pred->op, field->size, field->is_signed); - if (!fn) { - parse_error(ps, FILT_ERR_INVALID_OP, 0); - return -EINVAL; + fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } } if (pred->op == OP_NE) pred->not = 1; - return filter_add_pred_fn(ps, call, pred, fn); -} - -static int filter_add_subsystem_pred(struct filter_parse_state *ps, - struct event_subsystem *system, - struct filter_pred *pred, - char *filter_string) -{ - struct event_filter *filter = system->filter; - struct ftrace_event_call *call; - int err = 0; - - if (!filter->preds) { - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - - if (!filter->preds) - return -ENOMEM; - } - - if (filter->n_preds == MAX_FILTER_PRED) { - parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; - } - - filter->preds[filter->n_preds] = pred; - filter->n_preds++; - - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name)) - continue; - - err = filter_add_pred(ps, call, pred); - if (err) { - filter_free_subsystem_preds(system); - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - goto out; - } - replace_filter_string(call->filter, filter_string); - } -out: - return err; +add_pred_fn: + if (!dry_run) + return filter_add_pred_fn(ps, call, filter, pred, fn); + return 0; } static void parse_init(struct filter_parse_state *ps, @@ -844,8 +1030,9 @@ static void postfix_clear(struct filter_parse_state *ps) while (!list_empty(&ps->postfix)) { elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - kfree(elt->operand); list_del(&elt->list); + kfree(elt->operand); + kfree(elt); } } @@ -955,8 +1142,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2) return NULL; } - strcpy(pred->str_val, operand2); - pred->str_len = strlen(operand2); + strcpy(pred->regex.pattern, operand2); + pred->regex.len = strlen(pred->regex.pattern); pred->op = op; @@ -1000,15 +1187,17 @@ static int check_preds(struct filter_parse_state *ps) return 0; } -static int replace_preds(struct event_subsystem *system, - struct ftrace_event_call *call, +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, struct filter_parse_state *ps, - char *filter_string) + char *filter_string, + bool dry_run) { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; int err; + int n_preds = 0; err = check_preds(ps); if (err) @@ -1027,19 +1216,14 @@ static int replace_preds(struct event_subsystem *system, continue; } + if (n_preds++ == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + if (elt->op == OP_AND || elt->op == OP_OR) { pred = create_logical_pred(elt->op); - if (call) { - err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else - err = filter_add_subsystem_pred(ps, system, - pred, filter_string); - if (err) - return err; - - operand1 = operand2 = NULL; - continue; + goto add_pred; } if (!operand1 || !operand2) { @@ -1048,12 +1232,11 @@ static int replace_preds(struct event_subsystem *system, } pred = create_pred(elt->op, operand1, operand2); - if (call) { - err = filter_add_pred(ps, call, pred); - filter_free_pred(pred); - } else - err = filter_add_subsystem_pred(ps, system, pred, - filter_string); +add_pred: + if (!pred) + return -ENOMEM; + err = filter_add_pred(ps, call, filter, pred, dry_run); + filter_free_pred(pred); if (err) return err; @@ -1063,19 +1246,59 @@ static int replace_preds(struct event_subsystem *system, return 0; } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +static int replace_system_preds(struct event_subsystem *system, + struct filter_parse_state *ps, + char *filter_string) { + struct ftrace_event_call *call; + bool fail = true; int err; + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) + continue; + + /* try to see if the filter can be applied */ + err = replace_preds(call, filter, ps, filter_string, true); + if (err) + continue; + + /* really apply the filter */ + filter_disable_preds(call); + err = replace_preds(call, filter, ps, filter_string, false); + if (err) + filter_disable_preds(call); + else { + call->flags |= TRACE_EVENT_FL_FILTERED; + replace_filter_string(filter, filter_string); + } + fail = false; + } + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + } + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; struct filter_parse_state *ps; mutex_lock(&event_mutex); + err = init_preds(call); + if (err) + goto out_unlock; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1093,10 +1316,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); - + else + call->flags |= TRACE_EVENT_FL_FILTERED; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1111,16 +1335,18 @@ int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); + err = init_subsystem_preds(system); + if (err) + goto out_unlock; + if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1128,7 +1354,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); parse_init(ps, filter_ops, filter_string); @@ -1138,7 +1363,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - err = replace_preds(system, NULL, ps, filter_string); + err = replace_system_preds(system, ps, filter_string); if (err) append_filter_err(ps, system->filter); @@ -1152,3 +1377,73 @@ out_unlock: return err; } +#ifdef CONFIG_PERF_EVENTS + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_preds(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + + mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->event.type == event_id) + break; + } + + err = -EINVAL; + if (&call->list == &ftrace_events) + goto out_unlock; + + err = -EEXIST; + if (event->filter) + goto out_unlock; + + filter = __alloc_preds(); + if (IS_ERR(filter)) { + err = PTR_ERR(filter); + goto out_unlock; + } + + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_preds; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + +free_preds: + if (err) + __free_preds(filter); + +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#endif /* CONFIG_PERF_EVENTS */ + diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d06cf898dc8..4b74d71705c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,192 +15,159 @@ #include "trace_output.h" +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args - -extern void __bad_type_size(void); - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - if (sizeof(type) != sizeof(field.item)) \ - __bad_type_size(); \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ - "offset:%u;\tsize:0;\n", \ - (unsigned int)offsetof(typeof(field), item)); \ - if (!ret) \ - return 0; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) - -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct args field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct args field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} +#undef __field +#define __field(type, item) type item; -#include "trace_event_types.h" +#undef __field_desc +#define __field_desc(type, container, item) type item; -#undef TRACE_ZERO_CHAR -#define TRACE_ZERO_CHAR(arg) +#undef __array +#define __array(type, item, size) type item[size]; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) +#undef F_STRUCT +#define F_STRUCT(args...) args -#undef TP_CMD -#define TP_CMD(cmd...) cmd +#undef F_printk +#define F_printk(fmt, args...) fmt, args -#undef TRACE_ENTRY -#define TRACE_ENTRY entry +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ +} -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - cmd; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(void); \ -static int ftrace_raw_init_event_##call(void); \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ -}; \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ - return 0; \ -} \ - -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .show_format = ftrace_format_##call, \ -}; +#include "trace_entries.h" -#include "trace_event_types.h" - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ +#undef __field +#define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + do { \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + mutex_lock(&event_storage_mutex); \ + snprintf(event_storage, sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0); \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ + mutex_unlock(&event_storage_mutex); \ + if (ret) \ + return ret; \ + } while (0); + +#undef __array_desc +#define __array_desc(type, container, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ +#undef __dynamic_array +#define __dynamic_array(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), is_signed); \ + 0, is_signed_type(type), FILTER_OTHER);\ if (ret) \ return ret; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ int \ -ftrace_define_fields_##call(void) \ +ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ - struct ftrace_event_call *event_call = &event_##call; \ - struct args field; \ + struct struct_name field; \ int ret; \ \ - __common_field(unsigned char, type, 0); \ - __common_field(unsigned char, flags, 0); \ - __common_field(unsigned char, preempt_count, 0); \ - __common_field(int, pid, 1); \ - __common_field(int, tgid, 1); \ - \ tstruct; \ \ return ret; \ } -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) +#include "trace_entries.h" + +#undef __entry +#define __entry REC + +#undef __field +#define __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, len) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef F_printk +#define F_printk(fmt, args...) #fmt ", " __stringify(args) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + \ +struct ftrace_event_class event_class_ftrace_##call = { \ + .system = __stringify(TRACE_SYSTEM), \ + .define_fields = ftrace_define_fields_##call, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ +}; \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .event.type = etype, \ + .class = &event_class_ftrace_##call, \ + .print_fmt = print, \ +}; \ -#include "trace_event_types.h" +#include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 90f13476483..16aee4d44e8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu, resched; + int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) trace_function(tr, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static void @@ -288,11 +288,9 @@ static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - char str[KSYM_SYMBOL_LEN]; long count = (long)data; - kallsyms_lookup(ip, NULL, NULL, NULL, str); - seq_printf(m, "%s:", str); + seq_printf(m, "%ps:", (void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); @@ -302,8 +300,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, if (count == -1) seq_printf(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld", count); - seq_putc(m, '\n'); + seq_printf(m, ":count=%ld\n", count); return 0; } @@ -364,7 +361,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) out_reg: ret = register_ftrace_function_probe(glob, ops, count); - return ret; + return ret < 0 ? ret : 0; } static struct ftrace_func_command ftrace_traceon_cmd = { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d2249abafb5..76b05980225 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -9,14 +9,31 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace.h" #include "trace_output.h" -struct fgraph_data { +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + +struct fgraph_cpu_data { pid_t last_pid; int depth; + int depth_irq; + int ignore; + unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; +}; + +struct fgraph_data { + struct fgraph_cpu_data __percpu *cpu_data; + + /* Place to preserve last processed entry. */ + struct ftrace_graph_ent_entry ent; + struct ftrace_graph_ret_entry ret; + int failed; + int cpu; }; #define TRACE_GRAPH_INDENT 2 @@ -27,7 +44,8 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -42,17 +60,19 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, /* Display absolute time of an entry */ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts }; -/* pid on the last trace processed */ +static struct trace_array *graph_array; /* Add a function return address to the trace stack on thread info.*/ @@ -124,7 +144,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); WARN(1, "Bad frame pointer: expected %lx, received %lx\n" - " from func %pF return to %lx\n", + " from func %ps return to %lx\n", current->ret_stack[index].fp, frame_pointer, (void *)current->ret_stack[index].func, @@ -166,10 +186,183 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->buffer; + struct ftrace_graph_ent_entry *entry; + + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) + return 0; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + if (!filter_current_check_discard(buffer, call, entry, event)) + ring_buffer_unlock_commit(buffer, event); + + return 1; +} + +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs) + return 0; + + return in_irq(); +} + +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + if (!ftrace_trace_task(current)) + return 0; + + /* trace it when it is-nested-in or is a function enabled. */ + if (!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return ret; +} + +int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +{ + if (tracing_thresh) + return 1; + else + return trace_graph_entry(trace); +} + +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, ip, flags, pc); +} + +void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->buffer; + struct ftrace_graph_ret_entry *entry; + + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) + return; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ret = *trace; + if (!filter_current_check_discard(buffer, call, entry, event)) + ring_buffer_unlock_commit(buffer, event); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; + + /* Make graph_array visible before we start tracing */ + + smp_mb(); +} + +void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + static int graph_trace_init(struct trace_array *tr) { - int ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + int ret; + + set_graph_array(tr); + if (tracing_thresh) + ret = register_ftrace_graph(&trace_graph_thresh_return, + &trace_graph_thresh_entry); + else + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); @@ -183,43 +376,19 @@ static void graph_trace_reset(struct trace_array *tr) unregister_ftrace_graph(); } -static inline int log10_cpu(int nb) -{ - if (nb / 100) - return 3; - if (nb / 10) - return 2; - return 1; -} +static int max_bytes_for_cpu; static enum print_line_t print_graph_cpu(struct trace_seq *s, int cpu) { - int i; int ret; - int log10_this = log10_cpu(cpu); - int log10_all = log10_cpu(cpumask_weight(cpu_online_mask)); - /* * Start with a space character - to make it stand out * to the right a bit when trace output is pasted into * email: */ - ret = trace_seq_printf(s, " "); - - /* - * Tricky - we space the CPU field according to the max - * number of online CPUs. On a 2-cpu system it would take - * a maximum of 1 digit - on a 128 cpu system it would - * take up to 3 digits: - */ - for (i = 0; i < log10_all - log10_this; i++) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_printf(s, "%d) ", cpu); + ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -270,6 +439,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid) } +static enum print_line_t +print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + if (!trace_seq_putc(s, ' ')) + return 0; + + return trace_print_lat_fmt(s, entry); +} + /* If the pid changed since the last trace, output this event */ static enum print_line_t verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) @@ -281,7 +459,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (!data) return TRACE_TYPE_HANDLED; - last_pid = &(per_cpu_ptr(data, cpu)->last_pid); + last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); if (*last_pid == pid) return TRACE_TYPE_HANDLED; @@ -332,27 +510,59 @@ static struct ftrace_graph_ret_entry * get_return_for_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *curr) { - struct ring_buffer_iter *ring_iter; + struct fgraph_data *data = iter->private; + struct ring_buffer_iter *ring_iter = NULL; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *next; - ring_iter = iter->buffer_iter[iter->cpu]; + /* + * If the previous output failed to write to the seq buffer, + * then we just reuse the data from before. + */ + if (data && data->failed) { + curr = &data->ent; + next = &data->ret; + } else { + + ring_iter = iter->buffer_iter[iter->cpu]; + + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* + * We need to consume the current entry to see + * the next one. + */ + ring_buffer_consume(iter->tr->buffer, iter->cpu, + NULL, NULL); + event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + NULL, NULL); + } - /* First peek to compare current entry and the next one */ - if (ring_iter) - event = ring_buffer_iter_peek(ring_iter, NULL); - else { - /* We need to consume the current entry to see the next one */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); + if (!event) + return NULL; + + next = ring_buffer_event_data(event); + + if (data) { + /* + * Save current and next entries for later reference + * if the output fails. + */ + data->ent = *curr; + /* + * If the next event is not a return type, then + * we only care about what type it is. Otherwise we can + * safely copy the entire event. + */ + if (next->ent.type == TRACE_GRAPH_RET) + data->ret = *next; + else + data->ret.ent.type = next->ent.type; + } } - if (!event) - return NULL; - - next = ring_buffer_event_data(event); - if (next->ent.type != TRACE_GRAPH_RET) return NULL; @@ -369,17 +579,18 @@ get_return_for_leaf(struct trace_iterator *iter, /* Signal a overhead of time execution to the output */ static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s) +print_graph_overhead(unsigned long long duration, struct trace_seq *s, + u32 flags) { /* If duration disappear, we don't need anything */ - if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) return 1; /* Non nested entry or return */ if (duration == -1) return trace_seq_printf(s, " "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { /* Duration exceeded 100 msecs */ if (duration > 100000ULL) return trace_seq_printf(s, "! "); @@ -405,7 +616,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s) static enum print_line_t print_graph_irq(struct trace_iterator *iter, unsigned long addr, - enum trace_type type, int cpu, pid_t pid) + enum trace_type type, int cpu, pid_t pid, u32 flags) { int ret; struct trace_seq *s = &iter->seq; @@ -415,20 +626,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_UNHANDLED; /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + if (flags & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } + /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + if (flags & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -438,7 +650,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -451,7 +663,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_PARTIAL_LINE; /* Don't close the duration column if haven't one */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) trace_seq_printf(s, " |"); ret = trace_seq_printf(s, "\n"); @@ -481,7 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -521,7 +735,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, - struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) + struct ftrace_graph_ret_entry *ret_entry, + struct trace_seq *s, u32 flags) { struct fgraph_data *data = iter->private; struct ftrace_graph_ret *graph_ret; @@ -535,24 +750,30 @@ print_graph_entry_leaf(struct trace_iterator *iter, duration = graph_ret->rettime - graph_ret->calltime; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments * equal to this depth. */ - *depth = call->depth - 1; + cpu_data->depth = call->depth - 1; + + /* No need to keep this function around for this depth */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = 0; } /* Overhead */ - ret = print_graph_overhead(duration, s); + ret = print_graph_overhead(duration, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = print_graph_duration(duration, s); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -565,11 +786,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "();\n"); + ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -579,7 +796,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, static enum print_line_t print_graph_entry_nested(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, - struct trace_seq *s, int cpu) + struct trace_seq *s, int cpu, u32 flags) { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; @@ -587,19 +804,24 @@ print_graph_entry_nested(struct trace_iterator *iter, int i; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); - *depth = call->depth; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + cpu_data->depth = call->depth; + + /* Save this function pointer to see if the exit matches */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = call->func; } /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* No time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -612,11 +834,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, call->func, 0); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, "() {\n"); + ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -629,7 +847,7 @@ print_graph_entry_nested(struct trace_iterator *iter, static enum print_line_t print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, - int type, unsigned long addr) + int type, unsigned long addr, u32 flags) { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; @@ -642,27 +860,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (type) { /* Interrupt */ - ret = print_graph_irq(iter, addr, type, cpu, ent->pid); + ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + if (flags & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + if (flags & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, ent->pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -672,61 +890,201 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + ret = print_graph_lat_fmt(s, ent); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + return 0; } +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just extered irq code + * + * retunns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. + */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are not inside the irq code. + */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. + */ + return 1; +} + static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter) + struct trace_iterator *iter, u32 flags) { - int cpu = iter->cpu; + struct fgraph_data *data = iter->private; struct ftrace_graph_ent *call = &field->graph_ent; struct ftrace_graph_ret_entry *leaf_ret; + static enum print_line_t ret; + int cpu = iter->cpu; + + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; - if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) - return print_graph_entry_leaf(iter, field, leaf_ret, s); + ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); else - return print_graph_entry_nested(iter, field, s, cpu); + ret = print_graph_entry_nested(iter, field, s, cpu, flags); + + if (data) { + /* + * If we failed to write our output, then we need to make + * note of it. Because we already consumed our entry. + */ + if (s->full) { + data->failed = 1; + data->cpu = cpu; + } else + data->failed = 0; + } + return ret; } static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, struct trace_iterator *iter) + struct trace_entry *ent, struct trace_iterator *iter, + u32 flags) { unsigned long long duration = trace->rettime - trace->calltime; struct fgraph_data *data = iter->private; pid_t pid = ent->pid; int cpu = iter->cpu; + int func_match = 1; int ret; int i; + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. This is the * return from a function, we now want the comments * to display at the same level of the bracket. */ - *depth = trace->depth - 1; + cpu_data->depth = trace->depth - 1; + + if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (cpu_data->enter_funcs[trace->depth] != trace->func) + func_match = 0; + cpu_data->enter_funcs[trace->depth] = 0; + } } - if (print_graph_prologue(iter, s, 0, 0)) + if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; /* Overhead */ - ret = print_graph_overhead(duration, s); + ret = print_graph_overhead(duration, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = print_graph_duration(duration, s); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -739,19 +1097,32 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. + */ + if (func_match) { + ret = trace_seq_printf(s, "}\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Overrun */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + if (flags & TRACE_GRAPH_PRINT_OVERRUN) { ret = trace_seq_printf(s, " (Overruns: %lu)\n", trace->overrun); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } - ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); + ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -759,8 +1130,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct trace_seq *s, struct trace_entry *ent, - struct trace_iterator *iter) +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter, u32 flags) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; @@ -770,18 +1141,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, int i; if (data) - depth = per_cpu_ptr(data, iter->cpu)->depth; + depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; - if (print_graph_prologue(iter, s, 0, 0)) + if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* No time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -816,7 +1187,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (!event) return TRACE_TYPE_UNHANDLED; - ret = event->trace(iter, sym_flags); + ret = event->funcs->trace(iter, sym_flags, event); if (ret != TRACE_TYPE_HANDLED) return ret; } @@ -836,90 +1207,253 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function(struct trace_iterator *iter) +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) { + struct ftrace_graph_ent_entry *field; + struct fgraph_data *data = iter->private; struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; + int cpu = iter->cpu; + int ret; + + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { + per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; + return TRACE_TYPE_HANDLED; + } + + /* + * If the last output failed, there's a possibility we need + * to print out the missing entry which would never go out. + */ + if (data && data->failed) { + field = &data->ent; + iter->cpu = data->cpu; + ret = print_graph_entry(field, s, iter, flags); + if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { + per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; + ret = TRACE_TYPE_NO_CONSUME; + } + iter->cpu = cpu; + return ret; + } switch (entry->type) { case TRACE_GRAPH_ENT: { - struct ftrace_graph_ent_entry *field; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter); + saved = *field; + return print_graph_entry(&saved, s, iter, flags); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter); + return print_graph_return(&field->ret, s, entry, iter, flags); } + case TRACE_STACK: + case TRACE_FN: + /* dont trace stack and functions as comments */ + return TRACE_TYPE_UNHANDLED; + default: - return print_graph_comment(s, entry, iter); + return print_graph_comment(s, entry, iter, flags); } return TRACE_TYPE_HANDLED; } -static void print_graph_headers(struct seq_file *s) +static enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); +} + +static enum print_line_t +print_graph_function_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return print_graph_function(iter); +} + +static void print_lat_header(struct seq_file *s, u32 flags) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (flags & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (flags & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); + seq_printf(s, "#%.*s|||| / \n", size, spaces); +} + +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) { + int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) + print_lat_header(s, flags); + /* 1st line */ - seq_printf(s, "# "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, "#"); + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, " CPU"); + if (flags & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, " TASK/PID "); + if (lat) + seq_printf(s, "|||||"); + if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "# "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, "#"); + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, " | "); + if (flags & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, " | | "); + if (lat) + seq_printf(s, "|||||"); + if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); } -static void graph_trace_open(struct trace_iterator *iter) +void print_graph_headers(struct seq_file *s) +{ + print_graph_headers_flags(s, tracer_flags.val); +} + +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + +void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ - struct fgraph_data *data = alloc_percpu(struct fgraph_data); + struct fgraph_data *data; int cpu; + iter->private = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - pr_warning("function graph tracer: not enough memory\n"); - else - for_each_possible_cpu(cpu) { - pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); - int *depth = &(per_cpu_ptr(data, cpu)->depth); - *pid = -1; - *depth = 0; - } + goto out_err; + + data->cpu_data = alloc_percpu(struct fgraph_cpu_data); + if (!data->cpu_data) + goto out_err_free; + + for_each_possible_cpu(cpu) { + pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + *pid = -1; + *depth = 0; + *ignore = 0; + *depth_irq = -1; + } iter->private = data; + + return; + + out_err_free: + kfree(data); + out_err: + pr_warning("function graph tracer: not enough memory\n"); +} + +void graph_trace_close(struct trace_iterator *iter) +{ + struct fgraph_data *data = iter->private; + + if (data) { + free_percpu(data->cpu_data); + kfree(data); + } } -static void graph_trace_close(struct trace_iterator *iter) +static int func_graph_set_flag(u32 old_flags, u32 bit, int set) { - free_percpu(iter->private); + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + return 0; } +static struct trace_event_functions graph_functions = { + .trace = print_graph_function_event, +}; + +static struct trace_event graph_trace_entry_event = { + .type = TRACE_GRAPH_ENT, + .funcs = &graph_functions, +}; + +static struct trace_event graph_trace_ret_event = { + .type = TRACE_GRAPH_RET, + .funcs = &graph_functions +}; + static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, + .pipe_open = graph_trace_open, .close = graph_trace_close, + .pipe_close = graph_trace_close, .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, + .set_flag = func_graph_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function_graph, #endif @@ -927,6 +1461,18 @@ static struct tracer graph_trace __read_mostly = { static __init int init_graph_trace(void) { + max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + + if (!register_ftrace_event(&graph_trace_entry_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + + if (!register_ftrace_event(&graph_trace_ret_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + return register_tracer(&graph_trace); } diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c deleted file mode 100644 index ca7d7c4d0c2..00000000000 --- a/kernel/trace/trace_hw_branches.c +++ /dev/null @@ -1,309 +0,0 @@ -/* - * h/w branch tracer for x86 based on BTS - * - * Copyright (C) 2008-2009 Intel Corporation. - * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/fs.h> - -#include <asm/ds.h> - -#include "trace_output.h" -#include "trace.h" - - -#define BTS_BUFFER_SIZE (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); - -#define this_tracer per_cpu(tracer, smp_processor_id()) - -static int trace_hw_branches_enabled __read_mostly; -static int trace_hw_branches_suspended __read_mostly; -static struct trace_array *hw_branch_trace __read_mostly; - - -static void bts_trace_init_cpu(int cpu) -{ - per_cpu(tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - - if (IS_ERR(per_cpu(tracer, cpu))) - per_cpu(tracer, cpu) = NULL; -} - -static int bts_trace_init(struct trace_array *tr) -{ - int cpu; - - hw_branch_trace = tr; - trace_hw_branches_enabled = 0; - - get_online_cpus(); - for_each_online_cpu(cpu) { - bts_trace_init_cpu(cpu); - - if (likely(per_cpu(tracer, cpu))) - trace_hw_branches_enabled = 1; - } - trace_hw_branches_suspended = 0; - put_online_cpus(); - - /* If we could not enable tracing on a single cpu, we fail. */ - return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if (likely(per_cpu(tracer, cpu))) { - ds_release_bts(per_cpu(tracer, cpu)); - per_cpu(tracer, cpu) = NULL; - } - } - trace_hw_branches_enabled = 0; - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_resume_bts(per_cpu(tracer, cpu)); - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); - trace_hw_branches_suspended = 1; - put_online_cpus(); -} - -static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - /* The notification is sent with interrupts enabled. */ - if (trace_hw_branches_enabled) { - bts_trace_init_cpu(cpu); - - if (trace_hw_branches_suspended && - likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); - } - break; - - case CPU_DOWN_PREPARE: - /* The notification is sent with interrupts enabled. */ - if (likely(per_cpu(tracer, cpu))) { - ds_release_bts(per_cpu(tracer, cpu)); - per_cpu(tracer, cpu) = NULL; - } - } - - return NOTIFY_DONE; -} - -static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { - .notifier_call = bts_hotcpu_handler -}; - -static void bts_trace_print_header(struct seq_file *m) -{ - seq_puts(m, "# CPU# TO <- FROM\n"); -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - unsigned long symflags = TRACE_ITER_SYM_OFFSET; - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct hw_branch_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", iter->cpu) && - seq_print_ip_sym(seq, it->to, symflags) && - trace_seq_printf(seq, "\t <- ") && - seq_print_ip_sym(seq, it->from, symflags) && - trace_seq_printf(seq, "\n")) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE;; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_hw_branch(u64 from, u64 to) -{ - struct ftrace_event_call *call = &event_hw_branch; - struct trace_array *tr = hw_branch_trace; - struct ring_buffer_event *event; - struct hw_branch_entry *entry; - unsigned long irq1; - int cpu; - - if (unlikely(!tr)) - return; - - if (unlikely(!trace_hw_branches_enabled)) - return; - - local_irq_save(irq1); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; - entry->from = from; - entry->to = to; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(irq1); -} - -static void trace_bts_at(const struct bts_trace *trace, void *at) -{ - struct bts_struct bts; - int err = 0; - - WARN_ON_ONCE(!trace->read); - if (!trace->read) - return; - - err = trace->read(this_tracer, at, &bts); - if (err < 0) - return; - - switch (bts.qualifier) { - case BTS_BRANCH: - trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); - break; - } -} - -/* - * Collect the trace on the current cpu and write it into the ftrace buffer. - * - * pre: tracing must be suspended on the current cpu - */ -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *)arg; - const struct bts_trace *trace; - unsigned char *at; - - if (unlikely(!tr)) - return; - - if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) - return; - - if (unlikely(!this_tracer)) - return; - - trace = ds_read_bts(this_tracer); - if (!trace) - return; - - for (at = trace->ds.top; (void *)at < trace->ds.end; - at += trace->ds.size) - trace_bts_at(trace, at); - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - at += trace->ds.size) - trace_bts_at(trace, at); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); - /* - * We need to collect the trace on the respective cpu since ftrace - * implicitly adds the record for the current cpu. - * Once that is more flexible, we could collect the data from any cpu. - */ - on_each_cpu(trace_bts_cpu, iter->tr, 1); - - for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_resume_bts(per_cpu(tracer, cpu)); - put_online_cpus(); -} - -static void trace_bts_close(struct trace_iterator *iter) -{ - tracing_reset_online_cpus(iter->tr); -} - -void trace_hw_branch_oops(void) -{ - if (this_tracer) { - ds_suspend_bts_noirq(this_tracer); - trace_bts_cpu(hw_branch_trace); - ds_resume_bts_noirq(this_tracer); - } -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "hw-branch-tracer", - .init = bts_trace_init, - .reset = bts_trace_reset, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare, - .close = trace_bts_close, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_hw_branches, -#endif /* CONFIG_FTRACE_SELFTEST */ -}; - -__init static int init_bts_trace(void) -{ - register_hotcpu_notifier(&bts_hotcpu_notifier); - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b923d13e2fa..5cf8c602b88 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -34,6 +34,9 @@ static int trace_type __read_mostly; static int save_lat_flag; +static void stop_irqsoff_tracer(struct trace_array *tr, int graph); +static int start_irqsoff_tracer(struct trace_array *tr, int graph); + #ifdef CONFIG_PREEMPT_TRACER static inline int preempt_trace(void) @@ -55,6 +58,23 @@ irq_trace(void) # define irq_trace() (0) #endif +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -67,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) { - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; @@ -86,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) */ cpu = raw_smp_processor_id(); if (likely(!per_cpu(tracing_cpu, cpu))) - return; + return 0; - local_save_flags(flags); + local_save_flags(*flags); /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; + if (!irqs_disabled_flags(*flags)) + return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, preempt_count()); + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -108,6 +156,132 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +{ + int cpu; + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_irqsoff_tracer(irqsoff_trace, !set); + + for_each_possible_cpu(cpu) + per_cpu(tracing_cpu, cpu) = 0; + + tracing_max_latency = 0; + tracing_reset_online_cpus(irqsoff_trace); + + return start_irqsoff_tracer(irqsoff_trace, set); +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + int ret; + int pc; + + if (!func_prolog_dec(tr, &data, &flags)) + return 0; + + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + return ret; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); +} + +static void irqsoff_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); + +} + +static void irqsoff_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} + +#else +#define __trace_function trace_function + +static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } +static void irqsoff_print_header(struct seq_file *s) { } +static void irqsoff_trace_open(struct trace_iterator *iter) { } +static void irqsoff_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -129,15 +303,10 @@ check_critical_timing(struct trace_array *tr, unsigned long parent_ip, int cpu) { - unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; int pc; - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -155,20 +324,19 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); - - latency = nsecs_to_usecs(delta); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + /* Skip 5 functions to get to the irq/preempt enable function */ + __trace_stack(tr, flags, 5, pc); if (data->critical_sequence != max_sequence) goto out_unlock; - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - data->critical_end = parent_ip; - update_max_tr_single(tr, current, cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr_single(tr, current, cpu); + } max_sequence++; @@ -178,8 +346,7 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - tracing_reset(tr, cpu); - trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -208,11 +375,10 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? : ip; - tracing_reset(tr, cpu); local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -246,7 +412,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -355,19 +521,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static void start_irqsoff_tracer(struct trace_array *tr) +static int start_irqsoff_tracer(struct trace_array *tr, int graph) { - register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) + int ret = 0; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&irqsoff_graph_return, + &irqsoff_graph_entry); + + if (!ret && tracing_is_enabled()) tracer_enabled = 1; else tracer_enabled = 0; + + return ret; } -static void stop_irqsoff_tracer(struct trace_array *tr) +static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -379,12 +558,15 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - start_irqsoff_tracer(tr); + tracing_reset_online_cpus(tr); + + if (start_irqsoff_tracer(tr, is_graph())) + printk(KERN_ERR "failed to start irqsoff tracer\n"); } static void irqsoff_tracer_reset(struct trace_array *tr) { - stop_irqsoff_tracer(tr); + stop_irqsoff_tracer(tr, is_graph()); if (!save_lat_flag) trace_flags &= ~TRACE_ITER_LATENCY_FMT; @@ -416,9 +598,16 @@ static struct tracer irqsoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -442,9 +631,16 @@ static struct tracer preemptoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -470,9 +666,16 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 00000000000..3c5c5dfea0b --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,135 @@ +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + */ +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/ftrace.h> + +#include "trace.h" +#include "trace_output.h" + +static void ftrace_dump_buf(int skip_lines, long cpu_file) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + unsigned int old_userobj; + int cnt = 0, cpu; + + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + old_userobj = trace_flags; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + if (!trace_empty(&iter)) + trace_find_next_entry_inc(&iter); + while (!trace_empty(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + print_trace_line(&iter); + if (!skip_lines) + trace_printk_seq(&iter.seq); + else + skip_lines--; + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + for_each_tracing_cpu(cpu) + if (iter.buffer_iter[cpu]) + ring_buffer_read_finish(iter.buffer_iter[cpu]); +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_lines = 0; + long cpu_file; + char *cp; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_lines = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); + if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = TRACE_PIPE_ALL_CPU; + } + + kdb_trap_printk++; + ftrace_dump_buf(skip_lines, cpu_file); + kdb_trap_printk--; + + return 0; +} + +static __init int kdb_ftrace_register(void) +{ + kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_REPEAT_NONE); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 00000000000..2dec9bcde8b --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1847 @@ +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> +#include <linux/stringify.h> +#include <linux/limits.h> +#include <asm/bitsperlong.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX +#define KPROBE_EVENT_SYSTEM "kprobes" + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + "common_lock_depth", + FIELD_STRING_IP, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, + void *); +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, \ + void *data, void *ent)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from data_rloc itself, but data_loc + * stores the offset from event entry. + */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +/* Print type function for string type */ +static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, + const char *name, + void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + return trace_seq_printf(s, " %s=(fault)", name); + else + return trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); +} +static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); + +struct fetch_param { + fetch_func_t fn; + void *data; +}; + +static __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) +{ + return fprm->fn(regs, fprm->data, dest); +} + +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ +} +DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + if (!maxlen) + return; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); +} +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int ret, len = 0; + u8 c; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + if (sc->addr) + sc->addr += sc->offset; + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + + update_symbol_cache(sc); + return sc; +} + +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ +} +DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) + +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; + long offset; +}; + +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ +} +DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) +DEFINE_FETCH_deref(string_size) + +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_END, +}; + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } \ + } + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + +/* Fetch type information table */ +static const struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ + /* Fetch functions */ + fetch_func_t fetch[FETCH_MTD_END]; +} fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn) +{ + int i; + + if (type != &fetch_type_table[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + return NULL; +} + +/** + * Kprobe event core functions + */ + +struct probe_arg { + struct fetch_param fetch; + struct fetch_param fetch_size; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ +}; + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 + +struct trace_probe { + struct list_head list; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long nhit; + unsigned int flags; /* For TP_FLAG_* */ + const char *symbol; /* symbol name */ + struct ftrace_event_class class; + struct ftrace_event_call call; + ssize_t size; /* trace entry size */ + unsigned int nr_args; + struct probe_arg args[]; +}; + +#define SIZEOF_TRACE_PROBE(n) \ + (offsetof(struct trace_probe, args) + \ + (sizeof(struct probe_arg) * (n))) + + +static __kprobes int probe_is_return(struct trace_probe *tp) +{ + return tp->rp.handler != NULL; +} + +static __kprobes const char *probe_symbol(struct trace_probe *tp) +{ + return tp->symbol ? tp->symbol : "unknown"; +} + +static int register_probe_event(struct trace_probe *tp); +static void unregister_probe_event(struct trace_probe *tp); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); + +/* Check the name is good for event/group/fields */ +static int is_good_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return 0; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return 0; + } + return 1; +} + +/* + * Allocate new trace_probe and initialize it (including kprobes). + */ +static struct trace_probe *alloc_trace_probe(const char *group, + const char *event, + void *addr, + const char *symbol, + unsigned long offs, + int nargs, int is_return) +{ + struct trace_probe *tp; + int ret = -ENOMEM; + + tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); + if (!tp) + return ERR_PTR(ret); + + if (symbol) { + tp->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tp->symbol) + goto error; + tp->rp.kp.symbol_name = tp->symbol; + tp->rp.kp.offset = offs; + } else + tp->rp.kp.addr = addr; + + if (is_return) + tp->rp.handler = kretprobe_dispatcher; + else + tp->rp.kp.pre_handler = kprobe_dispatcher; + + if (!event || !is_good_name(event)) { + ret = -EINVAL; + goto error; + } + + tp->call.class = &tp->class; + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + goto error; + + if (!group || !is_good_name(group)) { + ret = -EINVAL; + goto error; + } + + tp->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->class.system) + goto error; + + INIT_LIST_HEAD(&tp->list); + return tp; +error: + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); + return ERR_PTR(ret); +} + +static void free_probe_arg(struct probe_arg *arg) +{ + if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) + free_symbol_cache(arg->fetch.data); + kfree(arg->name); + kfree(arg->comm); +} + +static void free_trace_probe(struct trace_probe *tp) +{ + int i; + + for (i = 0; i < tp->nr_args; i++) + free_probe_arg(&tp->args[i]); + + kfree(tp->call.class->system); + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); +} + +static struct trace_probe *find_probe_event(const char *event, + const char *group) +{ + struct trace_probe *tp; + + list_for_each_entry(tp, &probe_list, list) + if (strcmp(tp->call.name, event) == 0 && + strcmp(tp->call.class->system, group) == 0) + return tp; + return NULL; +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + if (probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + list_del(&tp->list); + unregister_probe_event(tp); +} + +/* Register a trace_probe and probe_event */ +static int register_trace_probe(struct trace_probe *tp) +{ + struct trace_probe *old_tp; + int ret; + + mutex_lock(&probe_lock); + + /* register as an event */ + old_tp = find_probe_event(tp->call.name, tp->call.class->system); + if (old_tp) { + /* delete old event */ + unregister_trace_probe(old_tp); + free_trace_probe(old_tp); + } + ret = register_probe_event(tp); + if (ret) { + pr_warning("Failed to register probe event(%d)\n", ret); + goto end; + } + + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (probe_is_return(tp)) + ret = register_kretprobe(&tp->rp); + else + ret = register_kprobe(&tp->rp.kp); + + if (ret) { + pr_warning("Could not insert probe(%d)\n", ret); + if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + unregister_probe_event(tp); + } else + list_add_tail(&tp->list, &probe_list); +end: + mutex_unlock(&probe_lock); + return ret; +} + +/* Split symbol and offset. */ +static int split_symbol_offset(char *symbol, unsigned long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtoul(tmp + 1, 0, offset); + if (ret) + return ret; + *tmp = '\0'; + } else + *offset = 0; + return 0; +} + +#define PARAM_MAX_ARGS 16 +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) +{ + int ret = 0; + unsigned long param; + + if (strcmp(arg, "retval") == 0) { + if (is_return) + f->fn = t->fetch[FETCH_MTD_retval]; + else + ret = -EINVAL; + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; + } else if (isdigit(arg[5])) { + ret = strict_strtoul(arg + 5, 10, ¶m); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + f->fn = t->fetch[FETCH_MTD_stack]; + f->data = (void *)param; + } + } else + ret = -EINVAL; + } else + ret = -EINVAL; + return ret; +} + +/* Recursive argument parser */ +static int __parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, t, f, is_return); + break; + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + f->fn = t->fetch[FETCH_MTD_reg]; + f->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, ¶m); + if (ret) + break; + f->fn = t->fetch[FETCH_MTD_memory]; + f->data = (void *)param; + } else { + ret = split_symbol_offset(arg + 1, &offset); + if (ret) + break; + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->fetch[FETCH_MTD_symbol]; + } + break; + case '+': /* deref memory */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) + break; + *tmp = '\0'; + ret = strict_strtol(arg + 1, 0, &offset); + if (ret) + break; + if (arg[0] == '-') + offset = -offset; + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (tmp) { + struct deref_fetch_param *dprm; + const struct fetch_type *t2 = find_fetch_type(NULL); + *tmp = '\0'; + dprm = kzalloc(sizeof(struct deref_fetch_param), + GFP_KERNEL); + if (!dprm) + return -ENOMEM; + dprm->offset = offset; + ret = __parse_probe_arg(arg, t2, &dprm->orig, + is_return); + if (ret) + kfree(dprm); + else { + f->fn = t->fetch[FETCH_MTD_deref]; + f->data = (void *)dprm; + } + } + break; + } + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", + t->name); + ret = -EINVAL; + } + return ret; +} + +/* String length checking wrapper */ +static int parse_probe_arg(char *arg, struct trace_probe *tp, + struct probe_arg *parg, int is_return) +{ + const char *t; + int ret; + + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument is too long.: %s\n", arg); + return -ENOSPC; + } + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = tp->size; + tp->size += parg->type->size; + ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn); + parg->fetch_size.data = parg->fetch.data; + } + return ret; +} + +/* Return 1 if name is reserved or already used by another argument */ +static int conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + return 0; +} + +static int create_trace_probe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * Fetch args: + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Dereferencing memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. + */ + struct trace_probe *tp; + int i, ret = 0; + int is_return = 0, is_delete = 0; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg; + unsigned long offset = 0; + void *addr = NULL; + char buf[MAX_EVENT_NAME_LEN]; + + /* argc must be >= 1 */ + if (argv[0][0] == 'p') + is_return = 0; + else if (argv[0][0] == 'r') + is_return = 1; + else if (argv[0][0] == '-') + is_delete = 1; + else { + pr_info("Probe definition must be started with 'p', 'r' or" + " '-'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + if (strchr(event, '/')) { + group = event; + event = strchr(group, '/') + 1; + event[-1] = '\0'; + if (strlen(group) == 0) { + pr_info("Group name is not specified\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specified\n"); + return -EINVAL; + } + } + if (!group) + group = KPROBE_EVENT_SYSTEM; + + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + mutex_lock(&probe_lock); + tp = find_probe_event(event, group); + if (!tp) { + mutex_unlock(&probe_lock); + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + unregister_trace_probe(tp); + free_trace_probe(tp); + mutex_unlock(&probe_lock); + return 0; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + if (isdigit(argv[1][0])) { + if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; + } + /* an address specified */ + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); + if (ret) { + pr_info("Failed to parse address.\n"); + return ret; + } + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = split_symbol_offset(symbol, &offset); + if (ret) { + pr_info("Failed to parse symbol.\n"); + return ret; + } + if (offset && is_return) { + pr_info("Return probe must be used without offset.\n"); + return -EINVAL; + } + } + argc -= 2; argv += 2; + + /* setup a probe */ + if (!event) { + /* Make a new event name */ + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", + is_return ? 'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", + is_return ? 'r' : 'p', addr); + event = buf; + } + tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + is_return); + if (IS_ERR(tp)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tp)); + return PTR_ERR(tp); + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Increment count for freeing args in error case */ + tp->nr_args++; + + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) { + *arg++ = '\0'; + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + } else { + arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + tp->args[i].name = kstrdup(buf, GFP_KERNEL); + } + + if (!tp->args[i].name) { + pr_info("Failed to allocate argument[%d] name.\n", i); + ret = -ENOMEM; + goto error; + } + + if (!is_good_name(tp->args[i].name)) { + pr_info("Invalid argument[%d] name: %s\n", + i, tp->args[i].name); + ret = -EINVAL; + goto error; + } + + if (conflict_field_name(tp->args[i].name, tp->args, i)) { + pr_info("Argument[%d] name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + /* Parse fetch argument */ + ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); + if (ret) { + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); + goto error; + } + } + + ret = register_trace_probe(tp); + if (ret) + goto error; + return 0; + +error: + free_trace_probe(tp); + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_probe *tp; + + mutex_lock(&probe_lock); + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tp = list_entry(probe_list.next, struct trace_probe, list); + unregister_trace_probe(tp); + free_trace_probe(tp); + } + mutex_unlock(&probe_lock); +} + + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + int i; + + seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); + + if (!tp->symbol) + seq_printf(m, " 0x%p", tp->rp.kp.addr); + else if (tp->rp.kp.offset) + seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + else + seq_printf(m, " %s", probe_symbol(tp)); + + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); + seq_printf(m, "\n"); + + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 128 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + while (done < count) { + size = count - done; + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + } + ret = done; +out: + kfree(kbuf); + return ret; +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + + seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, + tp->rp.kp.nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Sum up total data length for dynamic arraies (strings) */ +static __kprobes int __get_data_size(struct trace_probe *tp, + struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, + struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + +/* Kprobe handler */ +static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct kprobe_trace_entry_head *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, dsize, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + tp->nhit++; + + local_save_flags(irq_flags); + pc = preempt_count(); + + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; + + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->ip = (unsigned long)kp->addr; + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); +} + +/* Kretprobe handler */ +static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct kretprobe_trace_entry_head *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, pc, dsize; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + local_save_flags(irq_flags); + pc = preempt_count(); + + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; + + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); +} + +/* Event entry printers */ +enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct kprobe_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + u8 *data; + int i; + + field = (struct kprobe_trace_entry_head *)iter->ent; + tp = container_of(event, struct trace_probe, call.event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset, field)) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct kretprobe_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + u8 *data; + int i; + + field = (struct kretprobe_trace_entry_head *)iter->ent; + tp = container_of(event, struct trace_probe, call.event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, " <- ")) + goto partial; + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset, field)) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int probe_event_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_TRACE; + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_event_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_TRACE; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + +static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kprobe_trace_entry_head field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->fmttype, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} + +static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kretprobe_trace_entry_head field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->fmttype, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} + +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) +{ + int i; + int pos = 0; + + const char *fmt, *arg; + + if (!probe_is_return(tp)) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tp->nr_args; i++) { + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_print_fmt(struct trace_probe *tp) +{ + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1); + tp->call.print_fmt = print_fmt; + + return 0; +} + +#ifdef CONFIG_PERF_EVENTS + +/* Kprobe profile handler */ +static __kprobes void kprobe_perf_func(struct kprobe *kp, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct ftrace_event_call *call = &tp->call; + struct kprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "profile buffer not large enough")) + return; + + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + if (!entry) + return; + + entry->ip = (unsigned long)kp->addr; + memset(&entry[1], 0, dsize); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); +} + +/* Kretprobe profile handler */ +static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct ftrace_event_call *call = &tp->call; + struct kretprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "profile buffer not large enough")) + return; + + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + if (!entry) + return; + + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); +} + +static int probe_perf_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_PROFILE; + + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_perf_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_PROFILE; + + if (!(tp->flags & TP_FLAG_TRACE)) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} +#endif /* CONFIG_PERF_EVENTS */ + +static __kprobes +int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return probe_event_enable(event); + case TRACE_REG_UNREGISTER: + probe_event_disable(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_perf_enable(event); + case TRACE_REG_PERF_UNREGISTER: + probe_perf_disable(event); + return 0; +#endif + } + return 0; +} + +static __kprobes +int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + + if (tp->flags & TP_FLAG_TRACE) + kprobe_trace_func(kp, regs); +#ifdef CONFIG_PERF_EVENTS + if (tp->flags & TP_FLAG_PROFILE) + kprobe_perf_func(kp, regs); +#endif + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static __kprobes +int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + + if (tp->flags & TP_FLAG_TRACE) + kretprobe_trace_func(ri, regs); +#ifdef CONFIG_PERF_EVENTS + if (tp->flags & TP_FLAG_PROFILE) + kretprobe_perf_func(ri, regs); +#endif + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static struct trace_event_functions kretprobe_funcs = { + .trace = print_kretprobe_event +}; + +static struct trace_event_functions kprobe_funcs = { + .trace = print_kprobe_event +}; + +static int register_probe_event(struct trace_probe *tp) +{ + struct ftrace_event_call *call = &tp->call; + int ret; + + /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); + if (probe_is_return(tp)) { + call->event.funcs = &kretprobe_funcs; + call->class->define_fields = kretprobe_event_define_fields; + } else { + call->event.funcs = &kprobe_funcs; + call->class->define_fields = kprobe_event_define_fields; + } + if (set_print_fmt(tp) < 0) + return -ENOMEM; + ret = register_ftrace_event(&call->event); + if (!ret) { + kfree(call->print_fmt); + return -ENODEV; + } + call->flags = 0; + call->class->reg = kprobe_register; + call->data = tp; + ret = trace_add_event_call(call); + if (ret) { + pr_info("Failed to register kprobe event: %s\n", call->name); + kfree(call->print_fmt); + unregister_ftrace_event(&call->event); + } + return ret; +} + +static void unregister_probe_event(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + trace_remove_event_call(&tp->call); + kfree(tp->call.print_fmt); +} + +/* Make a debugfs interface for controling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + /* Event list interface */ + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_profile' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret, warn = 0; + int (*target)(int, int, int, int, int, int); + struct trace_probe *tp; + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " + "$stack $stack0 +0($stack)"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on probing function entry.\n"); + warn++; + } else { + /* Enable trace point */ + tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting new probe.\n"); + warn++; + } else + probe_event_enable(&tp->call); + } + + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " + "$retval"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on probing function return.\n"); + warn++; + } else { + /* Enable trace point */ + tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting new probe.\n"); + warn++; + } else + probe_event_enable(&tp->call); + } + + if (warn) + goto end; + + ret = target(1, 2, 3, 4, 5, 6); + + ret = command_trace_probe("-:testprobe"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on deleting a probe.\n"); + warn++; + } + + ret = command_trace_probe("-:testprobe2"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on deleting a probe.\n"); + warn++; + } + +end: + cleanup_all_probes(); + if (warn) + pr_cont("NG: Some tests are failed. Please check them.\n"); + else + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index d53b45ed080..017fa376505 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/mmiotrace.h> #include <linux/pci.h> +#include <linux/slab.h> #include <linux/time.h> #include <asm/atomic.h> @@ -307,11 +308,13 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { + struct ftrace_event_call *call = &event_mmiotrace_rw; + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); @@ -319,7 +322,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->rw = *rw; - trace_buffer_unlock_commit(tr, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -333,11 +338,13 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { + struct ftrace_event_call *call = &event_mmiotrace_map; + struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); @@ -345,7 +352,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->map = *map; - trace_buffer_unlock_commit(tr, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7938f3ae93e..02272baa220 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -16,21 +16,25 @@ DECLARE_RWSEM(trace_event_mutex); -DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); -EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); - static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -void trace_print_seq(struct seq_file *m, struct trace_seq *s) +int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + int ret; - s->buffer[len] = 0; - seq_puts(m, s->buffer); + ret = seq_write(m, s->buffer, len); - trace_seq_init(s); + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. + */ + if (!ret) + trace_seq_init(s); + + return ret; } enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) @@ -70,6 +74,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) * @s: trace sequence descriptor * @fmt: printf format string * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formating of a trace * trace_seq_printf is used to store strings into a special @@ -83,7 +90,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_list ap; int ret; - if (!len) + if (s->full || !len) return 0; va_start(ap, fmt); @@ -91,12 +98,14 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; - return len; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); @@ -117,14 +126,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = vsnprintf(s->buffer + s->len, len, fmt, args); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -137,14 +148,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = bstr_printf(s->buffer + s->len, len, fmt, binary); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -165,9 +178,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str) { int len = strlen(str); - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) return 0; + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + memcpy(s->buffer + s->len, str, len); s->len += len; @@ -176,19 +194,30 @@ int trace_seq_puts(struct trace_seq *s, const char *str) int trace_seq_putc(struct trace_seq *s, unsigned char c) { - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) + return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; return 0; + } s->buffer[s->len++] = c; return 1; } +EXPORT_SYMBOL(trace_seq_putc); int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) return 0; + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + memcpy(s->buffer + s->len, mem, len); s->len += len; @@ -201,6 +230,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) const unsigned char *data = mem; int i, j; + if (s->full) + return 0; + #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else @@ -218,9 +250,14 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) { void *ret; - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) return NULL; + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return NULL; + } + ret = s->buffer + s->len; s->len += len; @@ -231,8 +268,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path) { unsigned char *p; - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) + return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; return 0; + } + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); if (!IS_ERR(p)) { p = mangle_path(s->buffer + s->len, p, "\n"); @@ -245,6 +288,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 1; } + s->full = 1; return 0; } @@ -309,6 +353,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, } EXPORT_SYMBOL(ftrace_print_symbols_seq); +const char * +ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + p->len; + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_hex_seq); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { @@ -371,6 +430,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, unsigned long vmstart = 0; int ret = 1; + if (s->full) + return 0; + if (mm) { const struct vm_area_struct *vma; @@ -408,7 +470,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, * since individual threads might have already quit! */ rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); + task = find_task_by_vpid(entry->tgid); if (task) mm = get_task_mm(task); rcu_read_unlock(); @@ -461,18 +523,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static int -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count and lock depth. + */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { int hardirq, softirq; - char comm[TASK_COMM_LEN]; + int ret; - trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", - comm, entry->pid, cpu, + if (!trace_seq_printf(s, "%c%c%c", (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', @@ -483,8 +550,31 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) return 0; if (entry->preempt_count) - return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_puts(s, "."); + ret = trace_seq_printf(s, "%x", entry->preempt_count); + else + ret = trace_seq_putc(s, '.'); + + if (!ret) + return 0; + + if (entry->lock_depth < 0) + return trace_seq_putc(s, '.'); + + return trace_seq_printf(s, "%d", entry->lock_depth); +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu)) + return 0; + + return trace_print_lat_fmt(s, entry); } static unsigned long preempt_mark_thresh = 100; @@ -649,6 +739,9 @@ int register_ftrace_event(struct trace_event *event) if (WARN_ON(!event)) goto out; + if (WARN_ON(!event->funcs)) + goto out; + INIT_LIST_HEAD(&event->list); if (!event->type) { @@ -681,14 +774,14 @@ int register_ftrace_event(struct trace_event *event) goto out; } - if (event->trace == NULL) - event->trace = trace_nop_print; - if (event->raw == NULL) - event->raw = trace_nop_print; - if (event->hex == NULL) - event->hex = trace_nop_print; - if (event->binary == NULL) - event->binary = trace_nop_print; + if (event->funcs->trace == NULL) + event->funcs->trace = trace_nop_print; + if (event->funcs->raw == NULL) + event->funcs->raw = trace_nop_print; + if (event->funcs->hex == NULL) + event->funcs->hex = trace_nop_print; + if (event->funcs->binary == NULL) + event->funcs->binary = trace_nop_print; key = event->type & (EVENT_HASHSIZE - 1); @@ -730,13 +823,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); * Standard events */ -enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { return TRACE_TYPE_HANDLED; } /* TRACE_FN */ -static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -763,7 +858,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; @@ -777,7 +873,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -790,7 +887,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -803,14 +901,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static struct trace_event trace_fn_event = { - .type = TRACE_FN, +static struct trace_event_functions trace_fn_funcs = { .trace = trace_fn_trace, .raw = trace_fn_raw, .hex = trace_fn_hex, .binary = trace_fn_bin, }; +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .funcs = &trace_fn_funcs, +}; + /* TRACE_CTX an TRACE_WAKE */ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) @@ -839,13 +941,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_print(iter, "==>"); } static enum print_line_t trace_wake_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { return trace_ctxwake_print(iter, " +"); } @@ -858,7 +961,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, @@ -873,12 +976,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_raw(iter, 0); } -static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_raw(iter, '+'); } @@ -893,7 +998,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); @@ -907,18 +1012,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_hex(iter, 0); } -static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_hex(iter, '+'); } static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct ctx_switch_entry *field; struct trace_seq *s = &iter->seq; @@ -935,81 +1042,34 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, +static struct trace_event_functions trace_ctx_funcs = { .trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, .binary = trace_ctxwake_bin, }; -static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .funcs = &trace_ctx_funcs, +}; + +static struct trace_event_functions trace_wake_funcs = { .trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, .binary = trace_ctxwake_bin, }; -/* TRACE_SPECIAL */ -static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags) -{ - struct special_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, - .trace = trace_special_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .funcs = &trace_wake_funcs, }; /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct stack_entry *field; struct trace_seq *s = &iter->seq; @@ -1037,17 +1097,18 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } +static struct trace_event_functions trace_stack_funcs = { + .trace = trace_stack_print, +}; + static struct trace_event trace_stack_event = { .type = TRACE_STACK, - .trace = trace_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, + .funcs = &trace_stack_funcs, }; /* TRACE_USER_STACK */ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct userstack_entry *field; struct trace_seq *s = &iter->seq; @@ -1066,17 +1127,19 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } +static struct trace_event_functions trace_user_stack_funcs = { + .trace = trace_user_stack_print, +}; + static struct trace_event trace_user_stack_event = { .type = TRACE_USER_STACK, - .trace = trace_user_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, + .funcs = &trace_user_stack_funcs, }; /* TRACE_BPRINT */ static enum print_line_t -trace_bprint_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; @@ -1101,7 +1164,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags) static enum print_line_t -trace_bprint_raw(struct trace_iterator *iter, int flags) +trace_bprint_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct bprint_entry *field; struct trace_seq *s = &iter->seq; @@ -1120,16 +1184,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } +static struct trace_event_functions trace_bprint_funcs = { + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; static struct trace_event trace_bprint_event = { .type = TRACE_BPRINT, - .trace = trace_bprint_print, - .raw = trace_bprint_raw, + .funcs = &trace_bprint_funcs, }; /* TRACE_PRINT */ static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct print_entry *field; struct trace_seq *s = &iter->seq; @@ -1148,7 +1215,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct print_entry *field; @@ -1163,18 +1231,21 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, +static struct trace_event_functions trace_print_funcs = { .trace = trace_print_print, .raw = trace_print_raw, }; +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .funcs = &trace_print_funcs, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, &trace_wake_event, - &trace_special_event, &trace_stack_event, &trace_user_stack_event, &trace_bprint_event, diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index d38bec4a9c3..c038eba0492 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -25,7 +25,9 @@ extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, - int flags); + int flags, struct trace_event *event); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c deleted file mode 100644 index 8a30d9874cd..00000000000 --- a/kernel/trace/trace_power.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - * ring buffer based C-state tracer - * - * Arjan van de Ven <arjan@linux.intel.com> - * Copyright (C) 2008 Intel Corporation - * - * Much is borrowed from trace_boot.c which is - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <trace/power.h> -#include <linux/kallsyms.h> -#include <linux/module.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *power_trace; -static int __read_mostly trace_power_enabled; - -static void probe_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} - - -static void probe_power_end(struct power_trace *it) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(tr, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} - -static void probe_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(tr, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} - -static int tracing_power_register(void) -{ - int ret; - - ret = register_trace_power_start(probe_power_start); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_start\n"); - return ret; - } - ret = register_trace_power_end(probe_power_end); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_end\n"); - goto fail_start; - } - ret = register_trace_power_mark(probe_power_mark); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_mark\n"); - goto fail_end; - } - return ret; -fail_end: - unregister_trace_power_end(probe_power_end); -fail_start: - unregister_trace_power_start(probe_power_start); - return ret; -} - -static void start_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 1; -} - -static void stop_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 0; -} - -static void power_trace_reset(struct trace_array *tr) -{ - trace_power_enabled = 0; - unregister_trace_power_start(probe_power_start); - unregister_trace_power_end(probe_power_end); - unregister_trace_power_mark(probe_power_mark); -} - - -static int power_trace_init(struct trace_array *tr) -{ - int cpu; - power_trace = tr; - - trace_power_enabled = 1; - tracing_power_register(); - - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); - return 0; -} - -static enum print_line_t power_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_power *field ; - struct power_trace *it; - struct trace_seq *s = &iter->seq; - struct timespec stamp; - struct timespec duration; - - trace_assign_type(field, entry); - it = &field->state_data; - stamp = ktime_to_timespec(it->stamp); - duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); - - if (entry->type == TRACE_POWER) { - if (it->type == POWER_CSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu, - duration.tv_sec, - duration.tv_nsec); - if (it->type == POWER_PSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static void power_print_header(struct seq_file *s) -{ - seq_puts(s, "# TIMESTAMP STATE EVENT\n"); - seq_puts(s, "# | | |\n"); -} - -static struct tracer power_tracer __read_mostly = -{ - .name = "power", - .init = power_trace_init, - .start = start_power_trace, - .stop = stop_power_trace, - .reset = power_trace_reset, - .print_line = power_print_line, - .print_header = power_print_header, -}; - -static int init_power_trace(void) -{ - return register_tracer(&power_tracer); -} -device_initcall(init_power_trace); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 9bece9687b6..2547d8813cf 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -11,7 +11,6 @@ #include <linux/ftrace.h> #include <linux/string.h> #include <linux/module.h> -#include <linux/marker.h> #include <linux/mutex.h> #include <linux/ctype.h> #include <linux/list.h> @@ -155,25 +154,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) EXPORT_SYMBOL_GPL(__ftrace_vprintk); static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +t_start(struct seq_file *m, loff_t *pos) { - const char **fmt = m->private; - const char **next = fmt; - - (*pos)++; + const char **fmt = __start___trace_bprintk_fmt + *pos; if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) return NULL; - - next = fmt; - m->private = ++next; - return fmt; } -static void *t_start(struct seq_file *m, loff_t *pos) +static void *t_next(struct seq_file *m, void * v, loff_t *pos) { - return t_next(m, NULL, pos); + (*pos)++; + return t_start(m, pos); } static int t_show(struct seq_file *m, void *v) @@ -182,7 +175,7 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; - seq_printf(m, "0x%lx : \"", (unsigned long)fmt); + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* * Tabs and new lines need to be converted. @@ -224,15 +217,7 @@ static const struct seq_operations show_format_seq_ops = { static int ftrace_formats_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &show_format_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = __start___trace_bprintk_fmt; - } - return ret; + return seq_open(file, &show_format_seq_ops); } static const struct file_operations ftrace_formats_fops = { diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a98106dd979..8f758d070c4 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -20,9 +20,37 @@ static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); static int sched_stopped; + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer *buffer = tr->buffer; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + static void -probe_sched_switch(struct rq *__rq, struct task_struct *prev, - struct task_struct *next) +probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; unsigned long flags; @@ -49,8 +77,38 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, local_irq_restore(flags); } +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + struct ring_buffer *buffer = tr->buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr->buffer, flags, 6, pc); + ftrace_trace_userstack(tr->buffer, flags, pc); +} + static void -probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; @@ -80,21 +138,21 @@ static int tracing_sched_register(void) { int ret; - ret = register_trace_sched_wakeup(probe_sched_wakeup); + ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } - ret = register_trace_sched_wakeup_new(probe_sched_wakeup); + ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = register_trace_sched_switch(probe_sched_switch); + ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); @@ -103,17 +161,17 @@ static int tracing_sched_register(void) return ret; fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_sched_wakeup); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: - unregister_trace_sched_wakeup(probe_sched_wakeup); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { - unregister_trace_sched_switch(probe_sched_switch); - unregister_trace_sched_wakeup_new(probe_sched_wakeup); - unregister_trace_sched_wakeup(probe_sched_wakeup); + unregister_trace_sched_switch(probe_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index eacb2722517..7319559ed59 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -24,66 +24,106 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; +static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; -static raw_spinlock_t wakeup_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t wakeup_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_lat_flag; +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + #ifdef CONFIG_FUNCTION_TRACER + /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) { - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; - int resched; int cpu; - int pc; if (likely(!wakeup_task)) - return; + return 0; - pc = preempt_count(); - resched = ftrace_preempt_disable(); + *pc = preempt_count(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + if (cpu != wakeup_current_cpu) + goto out_enable; + + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; - local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + return 1; - if (unlikely(!wakeup_task)) - goto unlock; +out: + atomic_dec(&(*data)->disabled); - /* - * The task can't disappear because it needs to - * wake up first, and we have the wakeup_lock. - */ - if (task_cpu(wakeup_task) != cpu) - goto unlock; +out_enable: + preempt_enable_notrace(); + return 0; +} - trace_function(tr, ip, parent_ip, flags, pc); +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; - unlock: - __raw_spin_unlock(&wakeup_lock); + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); local_irq_restore(flags); - out: atomic_dec(&data->disabled); - - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -92,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc, ret = 0; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return 0; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -107,11 +297,19 @@ static int report_latency(cycle_t delta) return 1; } +static void +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + static void notrace -probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) +probe_wakeup_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) { - unsigned long latency = 0, t0 = 0, t1 = 0; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -145,7 +343,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); /* We could race with grabbing wakeup_lock */ if (unlikely(!tracer_enabled || next != wakeup_task)) @@ -154,13 +352,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; - trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -168,17 +362,14 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (!report_latency(delta)) goto out_unlock; - latency = nsecs_to_usecs(delta); - - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + } out_unlock: __wakeup_reset(wakeup_trace); - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&wakeup_trace->data[cpu]->disabled); @@ -186,11 +377,6 @@ out: static void __wakeup_reset(struct trace_array *tr) { - int cpu; - - for_each_possible_cpu(cpu) - tracing_reset(tr, cpu); - wakeup_cpu = -1; wakeup_prio = -1; @@ -204,15 +390,17 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; + tracing_reset_online_cpus(tr); + local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); __wakeup_reset(tr); - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); } static void -probe_wakeup(struct rq *rq, struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p, int success) { struct trace_array_cpu *data; int cpu = smp_processor_id(); @@ -237,7 +425,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) goto out; /* interrupts should be off from try_to_wake_up */ - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); /* check for races. */ if (!tracer_enabled || p->prio >= wakeup_prio) @@ -247,6 +435,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; wakeup_task = p; @@ -263,10 +452,10 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) * is not called by an assembly function (where as schedule is) * it should be safe to use it here. */ - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } @@ -275,27 +464,34 @@ static void start_wakeup_tracer(struct trace_array *tr) { int ret; - ret = register_trace_sched_wakeup(probe_wakeup); + ret = register_trace_sched_wakeup(probe_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return; } - ret = register_trace_sched_wakeup_new(probe_wakeup); + ret = register_trace_sched_wakeup_new(probe_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = register_trace_sched_switch(probe_wakeup_sched_switch); + ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_migrate_task\n"); + return; + } + wakeup_reset(tr); /* @@ -307,27 +503,24 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - register_ftrace_function(&trace_ops); - - if (tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); return; fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_wakeup); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); fail_deprobe: - unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_wakeup(probe_wakeup, NULL); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); - unregister_trace_sched_switch(probe_wakeup_sched_switch); - unregister_trace_sched_wakeup_new(probe_wakeup); - unregister_trace_sched_wakeup(probe_wakeup); + stop_func_tracer(is_graph()); + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); + unregister_trace_sched_wakeup(probe_wakeup, NULL); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } static int __wakeup_tracer_init(struct trace_array *tr) @@ -382,9 +575,16 @@ static struct tracer wakeup_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = 1, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -396,9 +596,16 @@ static struct tracer wakeup_rt_tracer __read_mostly = .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = 1, }; __init static int init_wakeup_tracer(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 00dd6485bdd..659732eba07 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -3,6 +3,7 @@ #include <linux/stringify.h> #include <linux/kthread.h> #include <linux/delay.h> +#include <linux/slab.h> static inline int trace_valid_entry(struct trace_entry *entry) { @@ -12,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_WAKE: case TRACE_STACK: case TRACE_PRINT: - case TRACE_SPECIAL: case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_HW_BRANCHES: return 1; } return 0; @@ -28,7 +27,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* @@ -66,7 +65,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) /* Don't allow flipping of max traces now */ local_irq_save(flags); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); cnt = ring_buffer_entries(tr->buffer); @@ -84,7 +83,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) break; } tracing_on(); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); local_irq_restore(flags); if (count) @@ -254,7 +253,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void __ftrace_dump(bool disable_tracing); +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -265,7 +265,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); if (ftrace_dump_on_oops) - __ftrace_dump(false); + __ftrace_dump(false, DUMP_ALL); return 0; } @@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * to detect and recover from possible hangs */ tracing_reset_online_cpus(tr); + set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); if (ret) { @@ -557,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) static int trace_wakeup_test_thread(void *data) { /* Make this a RT thread, doesn't need to be too high */ - struct sched_param param = { .sched_priority = 5 }; + static const struct sched_param param = { .sched_priority = 5 }; struct completion *x = data; sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -688,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -#ifdef CONFIG_SYSPROF_TRACER -int -trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_SYSPROF_TRACER */ - #ifdef CONFIG_BRANCH_TRACER int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) @@ -752,58 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_HW_BRANCH_TRACER -int -trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr) -{ - struct trace_iterator *iter; - struct tracer tracer; - unsigned long count; - int ret; - - if (!trace->open) { - printk(KERN_CONT "missing open function..."); - return -1; - } - - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* - * The hw-branch tracer needs to collect the trace from the various - * cpu trace buffers - before tracing is stopped. - */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - memcpy(&tracer, trace, sizeof(tracer)); - - iter->trace = &tracer; - iter->tr = tr; - iter->pos = -1; - mutex_init(&iter->mutex); - - trace->open(iter); - - mutex_destroy(&iter->mutex); - kfree(iter); - - tracing_stop(); - - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT "no entries found.."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_HW_BRANCH_TRACER */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2d7aebd71db..4c5dead0c23 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = { }; static unsigned long max_stack_size; -static raw_spinlock_t max_stack_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t max_stack_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); @@ -54,7 +54,7 @@ static inline void check_stack(void) return; local_irq_save(flags); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); /* a race could have already updated it */ if (this_size <= max_stack_size) @@ -103,19 +103,19 @@ static inline void check_stack(void) } out: - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); } static void stack_trace_call(unsigned long ip, unsigned long parent_ip) { - int cpu, resched; + int cpu; if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); - __raw_spin_lock(&max_stack_lock); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. + */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + + arch_spin_lock(&max_stack_lock); *ptr = val; - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -183,66 +195,62 @@ static const struct file_operations stack_max_size_fops = { .open = tracing_open_generic, .read = stack_max_size_read, .write = stack_max_size_write, + .llseek = default_llseek, }; static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +__next(struct seq_file *m, loff_t *pos) { - long i; + long n = *pos - 1; - (*pos)++; - - if (v == SEQ_START_TOKEN) - i = 0; - else { - i = *(long *)v; - i++; - } - - if (i >= max_stack_trace.nr_entries || - stack_dump_trace[i] == ULONG_MAX) + if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; - m->private = (void *)i; - + m->private = (void *)n; return &m->private; } +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __next(m, pos); +} + static void *t_start(struct seq_file *m, loff_t *pos) { - void *t = SEQ_START_TOKEN; - loff_t l = 0; + int cpu; local_irq_disable(); - __raw_spin_lock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + + arch_spin_lock(&max_stack_lock); if (*pos == 0) return SEQ_START_TOKEN; - for (; t && l < *pos; t = t_next(m, t, &l)) - ; - - return t; + return __next(m, pos); } static void t_stop(struct seq_file *m, void *p) { - __raw_spin_unlock(&max_stack_lock); + int cpu; + + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } static int trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - - sprint_symbol(str, addr); - return seq_printf(m, "%s\n", str); -#else - return seq_printf(m, "%p\n", (void*)addr); -#endif + return seq_printf(m, "%pS\n", (void *)addr); } static void print_disabled(struct seq_file *m) @@ -301,35 +309,32 @@ static const struct seq_operations stack_trace_seq_ops = { static int stack_trace_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &stack_trace_seq_ops); - - return ret; + return seq_open(file, &stack_trace_seq_ops); } static const struct file_operations stack_trace_fops = { .open = stack_trace_open, .read = seq_read, .llseek = seq_lseek, + .release = seq_release, }; int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || - (last_stack_tracer_enabled == stack_tracer_enabled)) + (last_stack_tracer_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = stack_tracer_enabled; + last_stack_tracer_enabled = !!stack_tracer_enabled; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index c00643733f4..96cffb269e7 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -10,6 +10,7 @@ #include <linux/list.h> +#include <linux/slab.h> #include <linux/rbtree.h> #include <linux/debugfs.h> #include "trace_stat.h" @@ -49,7 +50,8 @@ static struct dentry *stat_dir; * but it will at least advance closer to the next one * to be released. */ -static struct rb_node *release_next(struct rb_node *node) +static struct rb_node *release_next(struct tracer_stat *ts, + struct rb_node *node) { struct stat_node *snode; struct rb_node *parent = rb_parent(node); @@ -67,26 +69,35 @@ static struct rb_node *release_next(struct rb_node *node) parent->rb_right = NULL; snode = container_of(node, struct stat_node, node); + if (ts->stat_release) + ts->stat_release(snode->stat); kfree(snode); return parent; } } -static void reset_stat_session(struct stat_session *session) +static void __reset_stat_session(struct stat_session *session) { struct rb_node *node = session->stat_root.rb_node; while (node) - node = release_next(node); + node = release_next(session->ts, node); session->stat_root = RB_ROOT; } +static void reset_stat_session(struct stat_session *session) +{ + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); +} + static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); - reset_stat_session(session); + __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); } @@ -150,7 +161,7 @@ static int stat_seq_init(struct stat_session *session) int i; mutex_lock(&session->stat_mutex); - reset_stat_session(session); + __reset_stat_session(session); if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; @@ -183,7 +194,7 @@ exit: return ret; exit_free_rbtree: - reset_stat_session(session); + __reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; } @@ -193,23 +204,23 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) { struct stat_session *session = s->private; struct rb_node *node; + int n = *pos; int i; /* Prevent from tracer switch or rbtree modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) { - (*pos)++; - return SEQ_START_TOKEN; + if (session->ts->stat_headers) { + if (n == 0) + return SEQ_START_TOKEN; + n--; } node = rb_first(&session->stat_root); - for (i = 0; node && i < *pos; i++) + for (i = 0; node && i < n; i++) node = rb_next(node); - (*pos)++; - return node; } @@ -254,16 +265,21 @@ static const struct seq_operations trace_stat_seq_ops = { static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - + struct seq_file *m; struct stat_session *session = inode->i_private; + ret = stat_seq_init(session); + if (ret) + return ret; + ret = seq_open(file, &trace_stat_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = session; - ret = stat_seq_init(session); + if (ret) { + reset_stat_session(session); + return ret; } + m = file->private_data; + m->private = session; return ret; } @@ -274,11 +290,9 @@ static int tracing_stat_release(struct inode *i, struct file *f) { struct stat_session *session = i->i_private; - mutex_lock(&session->stat_mutex); reset_stat_session(session); - mutex_unlock(&session->stat_mutex); - return 0; + return seq_release(i, f); } static const struct file_operations tracing_stat_fops = { diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index f3546a2cd82..8f03914b9a6 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -18,6 +18,8 @@ struct tracer_stat { int (*stat_cmp)(void *p1, void *p2); /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); + /* Release an entry */ + void (*stat_release)(void *stat); /* Print the headers of your stat entries */ int (*stat_headers)(struct seq_file *s); }; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e579645ac8..bac752f0cfb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,33 +1,109 @@ #include <trace/syscall.h> +#include <trace/events/syscalls.h> +#include <linux/slab.h> #include <linux/kernel.h> +#include <linux/ftrace.h> +#include <linux/perf_event.h> #include <asm/syscall.h> #include "trace_output.h" #include "trace.h" -/* Keep a counter of the syscall tracing users */ -static int refcount; - -/* Prevent from races on thread flags toggling */ static DEFINE_MUTEX(syscall_trace_lock); +static int sys_refcount_enter; +static int sys_refcount_exit; +static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); + +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type); +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type); + +static int syscall_enter_define_fields(struct ftrace_event_call *call); +static int syscall_exit_define_fields(struct ftrace_event_call *call); + +/* All syscall exit events have the same fields */ +static LIST_HEAD(syscall_exit_fields); + +static struct list_head * +syscall_get_enter_fields(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->enter_fields; +} + +static struct list_head * +syscall_get_exit_fields(struct ftrace_event_call *call) +{ + return &syscall_exit_fields; +} + +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; -/* Option to display the parameters types */ -enum { - TRACE_SYSCALLS_OPT_TYPES = 0x1, +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, }; -static struct tracer_opt syscalls_opts[] = { - { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, - { } +struct ftrace_event_class event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, }; -static struct tracer_flags syscalls_flags = { - .val = 0, /* By default: no parameters types */ - .opts = syscalls_opts +struct ftrace_event_class event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .get_fields = syscall_get_exit_fields, + .raw_init = init_syscall_trace, }; +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + if (start->name && !strcmp(start->name + 3, str + 3)) + return start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + enum print_line_t -print_syscall_enter(struct trace_iterator *iter, int flags) +print_syscall_enter(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -35,40 +111,52 @@ print_syscall_enter(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int i, ret, syscall; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) goto end; + if (entry->enter_event->event.type != ent->type) { + WARN_ON_ONCE(1); + goto end; + } + ret = trace_seq_printf(s, "%s(", entry->name); if (!ret) return TRACE_TYPE_PARTIAL_LINE; for (i = 0; i < entry->nb_args; i++) { /* parameter types */ - if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + if (trace_flags & TRACE_ITER_VERBOSE) { ret = trace_seq_printf(s, "%s ", entry->types[i]); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* parameter values */ - ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], trace->args[i], - i == entry->nb_args - 1 ? ")" : ","); + i == entry->nb_args - 1 ? "" : ", "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + ret = trace_seq_putc(s, ')'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + end: - trace_seq_printf(s, "\n"); + ret = trace_seq_putc(s, '\n'); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } enum print_line_t -print_syscall_exit(struct trace_iterator *iter, int flags) +print_syscall_exit(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -77,16 +165,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int ret; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) { trace_seq_printf(s, "\n"); return TRACE_TYPE_HANDLED; } + if (entry->exit_event->event.type != ent->type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, trace->ret); if (!ret) @@ -95,62 +187,127 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -void start_ftrace_syscalls(void) +extern char *__bad_type_size(void); + +#define SYSCALL_FIELD(type, name) \ + sizeof(type) != sizeof(trace.name) ? \ + __bad_type_size() : \ + #type, #name, offsetof(typeof(trace), name), \ + sizeof(trace.name), is_signed_type(type) + +static +int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { - unsigned long flags; - struct task_struct *g, *t; + int i; + int pos = 0; - mutex_lock(&syscall_trace_lock); + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", + entry->args[i], sizeof(unsigned long), + i == entry->nb_args - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", ((unsigned long)(REC->%s))", entry->args[i]); + } - /* Don't enable the flag on the tasks twice */ - if (++refcount != 1) - goto unlock; +#undef LEN_OR_ZERO - arch_init_ftrace_syscalls(); - read_lock_irqsave(&tasklist_lock, flags); + /* return the length of print_fmt */ + return pos; +} - do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); - } while_each_thread(g, t); +static int set_syscall_print_fmt(struct ftrace_event_call *call) +{ + char *print_fmt; + int len; + struct syscall_metadata *entry = call->data; - read_unlock_irqrestore(&tasklist_lock, flags); + if (entry->enter_event != call) { + call->print_fmt = "\"0x%lx\", REC->ret"; + return 0; + } -unlock: - mutex_unlock(&syscall_trace_lock); + /* First: called with 0 length to calculate the needed length */ + len = __set_enter_print_fmt(entry, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_enter_print_fmt(entry, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; } -void stop_ftrace_syscalls(void) +static void free_syscall_print_fmt(struct ftrace_event_call *call) { - unsigned long flags; - struct task_struct *g, *t; + struct syscall_metadata *entry = call->data; - mutex_lock(&syscall_trace_lock); + if (entry->enter_event == call) + kfree(call->print_fmt); +} - /* There are perhaps still some users */ - if (--refcount) - goto unlock; +static int syscall_enter_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_enter trace; + struct syscall_metadata *meta = call->data; + int ret; + int i; + int offset = offsetof(typeof(trace), args); + + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + + for (i = 0; i < meta->nb_args; i++) { + ret = trace_define_field(call, meta->types[i], + meta->args[i], offset, + sizeof(unsigned long), 0, + FILTER_OTHER); + offset += sizeof(unsigned long); + } - read_lock_irqsave(&tasklist_lock, flags); + return ret; +} - do_each_thread(g, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); - } while_each_thread(g, t); +static int syscall_exit_define_fields(struct ftrace_event_call *call) +{ + struct syscall_trace_exit trace; + int ret; - read_unlock_irqrestore(&tasklist_lock, flags); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; -unlock: - mutex_unlock(&syscall_trace_lock); + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), + FILTER_OTHER); + + return ret; } -void ftrace_syscall_enter(struct pt_regs *regs) +void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; + struct ring_buffer *buffer; int size; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; + if (!test_bit(syscall_nr, enabled_enter_syscalls)) + return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) @@ -158,8 +315,8 @@ void ftrace_syscall_enter(struct pt_regs *regs) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, - 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->enter_event->event.type, size, 0, 0); if (!event) return; @@ -167,25 +324,31 @@ void ftrace_syscall_enter(struct pt_regs *regs) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - trace_current_buffer_unlock_commit(event, 0, 0); - trace_wake_up(); + if (!filter_current_check_discard(buffer, sys_data->enter_event, + entry, event)) + trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(struct pt_regs *regs) +void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; + struct ring_buffer *buffer; int syscall_nr; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; + if (!test_bit(syscall_nr, enabled_exit_syscalls)) + return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, - sizeof(*entry), 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) return; @@ -193,58 +356,325 @@ void ftrace_syscall_exit(struct pt_regs *regs) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - trace_current_buffer_unlock_commit(event, 0, 0); - trace_wake_up(); + if (!filter_current_check_discard(buffer, sys_data->exit_event, + entry, event)) + trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -static int init_syscall_tracer(struct trace_array *tr) +int reg_event_syscall_enter(struct ftrace_event_call *call) { - start_ftrace_syscalls(); + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!sys_refcount_enter) + ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); + if (!ret) { + set_bit(num, enabled_enter_syscalls); + sys_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +void unreg_event_syscall_enter(struct ftrace_event_call *call) +{ + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) + return; + mutex_lock(&syscall_trace_lock); + sys_refcount_enter--; + clear_bit(num, enabled_enter_syscalls); + if (!sys_refcount_enter) + unregister_trace_sys_enter(ftrace_syscall_enter, NULL); + mutex_unlock(&syscall_trace_lock); +} + +int reg_event_syscall_exit(struct ftrace_event_call *call) +{ + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!sys_refcount_exit) + ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); + if (!ret) { + set_bit(num, enabled_exit_syscalls); + sys_refcount_exit++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +void unreg_event_syscall_exit(struct ftrace_event_call *call) +{ + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) + return; + mutex_lock(&syscall_trace_lock); + sys_refcount_exit--; + clear_bit(num, enabled_exit_syscalls); + if (!sys_refcount_exit) + unregister_trace_sys_exit(ftrace_syscall_exit, NULL); + mutex_unlock(&syscall_trace_lock); +} + +int init_syscall_trace(struct ftrace_event_call *call) +{ + int id; + + if (set_syscall_print_fmt(call) < 0) + return -ENOMEM; + + id = trace_event_raw_init(call); + + if (id < 0) { + free_syscall_print_fmt(call); + return id; + } + + return id; +} + +unsigned long __init arch_syscall_addr(int nr) +{ + return (unsigned long)sys_call_table[nr]; +} + +int __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + NR_syscalls, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return -ENOMEM; + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + if (!meta) + continue; + + meta->syscall_nr = i; + syscalls_metadata[i] = meta; + } return 0; } +core_initcall(init_ftrace_syscalls); + +#ifdef CONFIG_PERF_EVENTS + +static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); +static int sys_perf_refcount_enter; +static int sys_perf_refcount_exit; -static void reset_syscall_tracer(struct trace_array *tr) +static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { - stop_ftrace_syscalls(); - tracing_reset_online_cpus(tr); + struct syscall_metadata *sys_data; + struct syscall_trace_enter *rec; + struct hlist_head *head; + int syscall_nr; + int rctx; + int size; + + syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + /* get the size after alignment with the u32 buffer size field */ + size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); + size = ALIGN(size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) + return; + + rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, + sys_data->enter_event->event.type, regs, &rctx); + if (!rec) + return; + + rec->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, + (unsigned long *)&rec->args); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } -static struct trace_event syscall_enter_event = { - .type = TRACE_SYSCALL_ENTER, - .trace = print_syscall_enter, -}; +int perf_sysenter_enable(struct ftrace_event_call *call) +{ + int ret = 0; + int num; -static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, - .trace = print_syscall_exit, -}; + num = ((struct syscall_metadata *)call->data)->syscall_nr; -static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", - .init = init_syscall_tracer, - .reset = reset_syscall_tracer, - .flags = &syscalls_flags, -}; + mutex_lock(&syscall_trace_lock); + if (!sys_perf_refcount_enter) + ret = register_trace_sys_enter(perf_syscall_enter, NULL); + if (ret) { + pr_info("event trace: Could not activate" + "syscall entry trace point"); + } else { + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} -__init int register_ftrace_syscalls(void) +void perf_sysenter_disable(struct ftrace_event_call *call) { - int ret; + int num; - ret = register_ftrace_event(&syscall_enter_event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - syscall_enter_event.type); - WARN_ON_ONCE(1); + num = ((struct syscall_metadata *)call->data)->syscall_nr; + + mutex_lock(&syscall_trace_lock); + sys_perf_refcount_enter--; + clear_bit(num, enabled_perf_enter_syscalls); + if (!sys_perf_refcount_enter) + unregister_trace_sys_enter(perf_syscall_enter, NULL); + mutex_unlock(&syscall_trace_lock); +} + +static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +{ + struct syscall_metadata *sys_data; + struct syscall_trace_exit *rec; + struct hlist_head *head; + int syscall_nr; + int rctx; + int size; + + syscall_nr = syscall_get_nr(current, regs); + if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + /* We can probably do that at build time */ + size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + /* + * Impossible, but be paranoid with the future + * How to put this check outside runtime? + */ + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "exit event has grown above perf buffer size")) + return; + + rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, + sys_data->exit_event->event.type, regs, &rctx); + if (!rec) + return; + + rec->nr = syscall_nr; + rec->ret = syscall_get_return_value(current, regs); + + head = this_cpu_ptr(sys_data->exit_event->perf_events); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); +} + +int perf_sysexit_enable(struct ftrace_event_call *call) +{ + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + + mutex_lock(&syscall_trace_lock); + if (!sys_perf_refcount_exit) + ret = register_trace_sys_exit(perf_syscall_exit, NULL); + if (ret) { + pr_info("event trace: Could not activate" + "syscall exit trace point"); + } else { + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; } + mutex_unlock(&syscall_trace_lock); + return ret; +} - ret = register_ftrace_event(&syscall_exit_event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - syscall_exit_event.type); - WARN_ON_ONCE(1); +void perf_sysexit_disable(struct ftrace_event_call *call) +{ + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + + mutex_lock(&syscall_trace_lock); + sys_perf_refcount_exit--; + clear_bit(num, enabled_perf_exit_syscalls); + if (!sys_perf_refcount_exit) + unregister_trace_sys_exit(perf_syscall_exit, NULL); + mutex_unlock(&syscall_trace_lock); +} + +#endif /* CONFIG_PERF_EVENTS */ + +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_enter(event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_enter(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysenter_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysenter_disable(event); + return 0; +#endif } + return 0; +} - return register_tracer(&syscall_tracer); +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_exit(event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_exit(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysexit_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysexit_disable(event); + return 0; +#endif + } + return 0; } -device_initcall(register_ftrace_syscalls); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index f6693969287..00000000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * trace stack traces - * - * Copyright (C) 2004-2008, Soeren Sandmann - * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> - * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/hrtimer.h> -#include <linux/uaccess.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/irq.h> -#include <linux/fs.h> - -#include <asm/stacktrace.h> - -#include "trace.h" - -static struct trace_array *sysprof_trace; -static int __read_mostly tracer_enabled; - -/* - * 1 msec sample interval by default: - */ -static unsigned long sample_period = 1000000; -static const unsigned int sample_max_depth = 512; - -static DEFINE_MUTEX(sample_timer_lock); -/* - * Per CPU hrtimers that do the profiling: - */ -static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); - -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - int ret; - - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; - - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); - - return ret; -} - -struct backtrace_info { - struct trace_array_cpu *data; - struct trace_array *tr; - int pos; -}; - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - /* Don't bother with IRQ stacks for now */ - return -1; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct backtrace_info *info = data; - - if (info->pos < sample_max_depth && reliable) { - __trace_special(info->tr, info->data, 1, addr, 0); - - info->pos++; - } -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, -}; - -static int -trace_kernel(struct pt_regs *regs, struct trace_array *tr, - struct trace_array_cpu *data) -{ - struct backtrace_info info; - unsigned long bp; - char *stack; - - info.tr = tr; - info.data = data; - info.pos = 1; - - __trace_special(info.tr, info.data, 1, regs->ip, 0); - - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - bp = regs->bp; -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - - return info.pos; -} - -static void timer_notify(struct pt_regs *regs, int cpu) -{ - struct trace_array_cpu *data; - struct stack_frame frame; - struct trace_array *tr; - const void __user *fp; - int is_user; - int i; - - if (!regs) - return; - - tr = sysprof_trace; - data = tr->data[cpu]; - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - __trace_special(tr, data, 0, 0, current->pid); - - if (!is_user) - i = trace_kernel(regs, tr, data); - else - i = 0; - - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm && i < sample_max_depth) { - regs = (struct pt_regs *)current->thread.sp0 - 1; - - fp = (void __user *)regs->bp; - - __trace_special(tr, data, 2, regs->ip, 0); - - while (i < sample_max_depth) { - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - - i++; - } - - } - - /* - * Special trace entry if we overflow the max depth: - */ - if (i == sample_max_depth) - __trace_special(tr, data, -1, -1, -1); - - __trace_special(tr, data, 3, current->pid, i); -} - -static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -{ - /* trace here */ - timer_notify(get_irq_regs(), smp_processor_id()); - - hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - - return HRTIMER_RESTART; -} - -static void start_stack_timer(void *unused) -{ - struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); - - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = stack_trace_timer_fn; - - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); -} - -static void start_stack_timers(void) -{ - on_each_cpu(start_stack_timer, NULL, 1); -} - -static void stop_stack_timer(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); - - hrtimer_cancel(hrtimer); -} - -static void stop_stack_timers(void) -{ - int cpu; - - for_each_online_cpu(cpu) - stop_stack_timer(cpu); -} - -static void stop_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - tracer_enabled = 0; - mutex_unlock(&sample_timer_lock); -} - -static int stack_trace_init(struct trace_array *tr) -{ - sysprof_trace = tr; - - tracing_start_cmdline_record(); - - mutex_lock(&sample_timer_lock); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); - return 0; -} - -static void stack_trace_reset(struct trace_array *tr) -{ - tracing_stop_cmdline_record(); - stop_stack_trace(tr); -} - -static struct tracer stack_trace __read_mostly = -{ - .name = "sysprof", - .init = stack_trace_init, - .reset = stack_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sysprof, -#endif -}; - -__init static int init_stack_trace(void) -{ - return register_tracer(&stack_trace); -} -device_initcall(init_stack_trace); - -#define MAX_LONG_DIGITS 22 - -static ssize_t -sysprof_sample_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - int r; - - r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -sysprof_sample_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - unsigned long val; - - if (cnt > MAX_LONG_DIGITS-1) - cnt = MAX_LONG_DIGITS-1; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - val = simple_strtoul(buf, NULL, 10); - /* - * Enforce a minimum sample period of 100 usecs: - */ - if (val < 100) - val = 100; - - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - sample_period = val * 1000; - start_stack_timers(); - mutex_unlock(&sample_timer_lock); - - return cnt; -} - -static const struct file_operations sysprof_sample_fops = { - .read = sysprof_sample_read, - .write = sysprof_sample_write, -}; - -void init_tracer_sysprof_debugfs(struct dentry *d_tracer) -{ - - trace_create_file("sysprof_sample_period", 0644, - d_tracer, NULL, &sysprof_sample_fops); -} diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 97fcea4acce..209b379a472 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -9,6 +9,8 @@ #include <trace/events/workqueue.h> #include <linux/list.h> #include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/kref.h> #include "trace_stat.h" #include "trace.h" @@ -16,6 +18,7 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; + struct kref kref; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -39,9 +42,15 @@ struct workqueue_global_stats { static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) +static void cpu_workqueue_stat_free(struct kref *kref) +{ + kfree(container_of(kref, struct cpu_workqueue_stats, kref)); +} + /* Insertion of a work */ static void -probe_workqueue_insertion(struct task_struct *wq_thread, +probe_workqueue_insertion(void *ignore, + struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -62,7 +71,8 @@ found: /* Execution of a work */ static void -probe_workqueue_execution(struct task_struct *wq_thread, +probe_workqueue_execution(void *ignore, + struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -82,7 +92,8 @@ found: } /* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) +static void probe_workqueue_creation(void *ignore, + struct task_struct *wq_thread, int cpu) { struct cpu_workqueue_stats *cws; unsigned long flags; @@ -96,8 +107,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) return; } INIT_LIST_HEAD(&cws->list); + kref_init(&cws->kref); cws->cpu = cpu; - cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); @@ -106,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) } /* Destruction of a cpu workqueue thread */ -static void probe_workqueue_destruction(struct task_struct *wq_thread) +static void +probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) { /* Workqueue only execute on one cpu */ int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -118,7 +130,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) list) { if (node->pid == wq_thread->pid) { list_del(&node->list); - kfree(node); + kref_put(&node->kref, cpu_workqueue_stat_free); goto found; } } @@ -137,9 +149,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (!list_empty(&workqueue_cpu_stat(cpu)->list)) + if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { ret = list_entry(workqueue_cpu_stat(cpu)->list.next, struct cpu_workqueue_stats, list); + kref_get(&ret->kref); + } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); @@ -162,9 +176,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace) static void *workqueue_stat_next(void *prev, int idx) { struct cpu_workqueue_stats *prev_cws = prev; + struct cpu_workqueue_stats *ret; int cpu = prev_cws->cpu; unsigned long flags; - void *ret = NULL; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { @@ -175,11 +189,14 @@ static void *workqueue_stat_next(void *prev, int idx) return NULL; } while (!(ret = workqueue_stat_start_cpu(cpu))); return ret; + } else { + ret = list_entry(prev_cws->list.next, + struct cpu_workqueue_stats, list); + kref_get(&ret->kref); } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, - list); + return ret; } static int workqueue_stat_show(struct seq_file *s, void *p) @@ -203,6 +220,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) return 0; } +static void workqueue_stat_release(void *stat) +{ + struct cpu_workqueue_stats *node = stat; + + kref_put(&node->kref, cpu_workqueue_stat_free); +} + static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); @@ -215,6 +239,7 @@ struct tracer_stat workqueue_stats __read_mostly = { .stat_start = workqueue_stat_start, .stat_next = workqueue_stat_next, .stat_show = workqueue_stat_show, + .stat_release = workqueue_stat_release, .stat_headers = workqueue_stat_headers }; @@ -238,35 +263,35 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; - ret = register_trace_workqueue_insertion(probe_workqueue_insertion); + for_each_possible_cpu(cpu) { + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); + } + + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; - ret = register_trace_workqueue_execution(probe_workqueue_execution); + ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); if (ret) goto no_insertion; - ret = register_trace_workqueue_creation(probe_workqueue_creation); + ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); if (ret) goto no_execution; - ret = register_trace_workqueue_destruction(probe_workqueue_destruction); + ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); if (ret) goto no_creation; - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - return 0; no_creation: - unregister_trace_workqueue_creation(probe_workqueue_creation); + unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); no_execution: - unregister_trace_workqueue_execution(probe_workqueue_execution); + unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); no_insertion: - unregister_trace_workqueue_insertion(probe_workqueue_insertion); + unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); out: pr_warning("trace_workqueue: unable to trace workqueues\n"); |
