aboutsummaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
authorPaul Mundt <lethal@linux-sh.org>2011-01-13 15:06:28 +0900
committerPaul Mundt <lethal@linux-sh.org>2011-01-13 15:06:28 +0900
commitf43dc23d5ea91fca257be02138a255f02d98e806 (patch)
treeb29722f6e965316e90ac97abf79923ced250dc21 /kernel/trace
parentf8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff)
parent4162cf64973df51fc885825bc9ca4d055891c49f (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework
Conflicts: arch/sh/kernel/cpu/sh2/setup-sh7619.c arch/sh/kernel/cpu/sh2a/setup-mxg.c arch/sh/kernel/cpu/sh2a/setup-sh7201.c arch/sh/kernel/cpu/sh2a/setup-sh7203.c arch/sh/kernel/cpu/sh2a/setup-sh7206.c arch/sh/kernel/cpu/sh3/setup-sh7705.c arch/sh/kernel/cpu/sh3/setup-sh770x.c arch/sh/kernel/cpu/sh3/setup-sh7710.c arch/sh/kernel/cpu/sh3/setup-sh7720.c arch/sh/kernel/cpu/sh4/setup-sh4-202.c arch/sh/kernel/cpu/sh4/setup-sh7750.c arch/sh/kernel/cpu/sh4/setup-sh7760.c arch/sh/kernel/cpu/sh4a/setup-sh7343.c arch/sh/kernel/cpu/sh4a/setup-sh7366.c arch/sh/kernel/cpu/sh4a/setup-sh7722.c arch/sh/kernel/cpu/sh4a/setup-sh7723.c arch/sh/kernel/cpu/sh4a/setup-sh7724.c arch/sh/kernel/cpu/sh4a/setup-sh7763.c arch/sh/kernel/cpu/sh4a/setup-sh7770.c arch/sh/kernel/cpu/sh4a/setup-sh7780.c arch/sh/kernel/cpu/sh4a/setup-sh7785.c arch/sh/kernel/cpu/sh4a/setup-sh7786.c arch/sh/kernel/cpu/sh4a/setup-shx3.c arch/sh/kernel/cpu/sh5/setup-sh5.c drivers/serial/sh-sci.c drivers/serial/sh-sci.h include/linux/serial_sci.h
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig238
-rw-r--r--kernel/trace/Makefile14
-rw-r--r--kernel/trace/blktrace.c300
-rw-r--r--kernel/trace/ftrace.c1105
-rw-r--r--kernel/trace/kmemtrace.c468
-rw-r--r--kernel/trace/power-traces.c20
-rw-r--r--kernel/trace/ring_buffer.c1849
-rw-r--r--kernel/trace/ring_buffer_benchmark.c91
-rw-r--r--kernel/trace/trace.c1762
-rw-r--r--kernel/trace/trace.h583
-rw-r--r--kernel/trace/trace_boot.c179
-rw-r--r--kernel/trace/trace_branch.c35
-rw-r--r--kernel/trace/trace_clock.c38
-rw-r--r--kernel/trace/trace_entries.h276
-rw-r--r--kernel/trace/trace_event_perf.c216
-rw-r--r--kernel/trace/trace_event_profile.c39
-rw-r--r--kernel/trace/trace_event_types.h175
-rw-r--r--kernel/trace/trace_events.c803
-rw-r--r--kernel/trace/trace_events_filter.c611
-rw-r--r--kernel/trace/trace_export.c273
-rw-r--r--kernel/trace/trace_functions.c15
-rw-r--r--kernel/trace/trace_functions_graph.c824
-rw-r--r--kernel/trace/trace_hw_branches.c309
-rw-r--r--kernel/trace/trace_irqsoff.c279
-rw-r--r--kernel/trace/trace_kdb.c135
-rw-r--r--kernel/trace/trace_kprobe.c1847
-rw-r--r--kernel/trace/trace_mmiotrace.c17
-rw-r--r--kernel/trace/trace_output.c327
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_power.c214
-rw-r--r--kernel/trace/trace_printk.c29
-rw-r--r--kernel/trace/trace_sched_switch.c80
-rw-r--r--kernel/trace/trace_sched_wakeup.c355
-rw-r--r--kernel/trace/trace_selftest.c104
-rw-r--r--kernel/trace/trace_stack.c103
-rw-r--r--kernel/trace/trace_stat.c54
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_syscalls.c624
-rw-r--r--kernel/trace/trace_sysprof.c328
-rw-r--r--kernel/trace/trace_workqueue.c69
40 files changed, 9416 insertions, 5378 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1551f47e766..14674dce77a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,38 +11,48 @@ config NOP_TRACER
config HAVE_FTRACE_NMI_ENTER
bool
+ help
+ See Documentation/trace/ftrace-design.txt
config HAVE_FUNCTION_TRACER
bool
+ help
+ See Documentation/trace/ftrace-design.txt
config HAVE_FUNCTION_GRAPH_TRACER
bool
+ help
+ See Documentation/trace/ftrace-design.txt
config HAVE_FUNCTION_GRAPH_FP_TEST
bool
help
- An arch may pass in a unique value (frame pointer) to both the
- entering and exiting of a function. On exit, the value is compared
- and if it does not match, then it will panic the kernel.
+ See Documentation/trace/ftrace-design.txt
config HAVE_FUNCTION_TRACE_MCOUNT_TEST
bool
help
- This gets selected when the arch tests the function_trace_stop
- variable at the mcount call site. Otherwise, this variable
- is tested by the called function.
+ See Documentation/trace/ftrace-design.txt
config HAVE_DYNAMIC_FTRACE
bool
+ help
+ See Documentation/trace/ftrace-design.txt
config HAVE_FTRACE_MCOUNT_RECORD
bool
+ help
+ See Documentation/trace/ftrace-design.txt
-config HAVE_HW_BRANCH_TRACER
+config HAVE_SYSCALL_TRACEPOINTS
bool
+ help
+ See Documentation/trace/ftrace-design.txt
-config HAVE_FTRACE_SYSCALLS
+config HAVE_C_RECORDMCOUNT
bool
+ help
+ C version of recordmcount available?
config TRACER_MAX_TRACE
bool
@@ -59,16 +69,36 @@ config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
bool
+config EVENT_POWER_TRACING_DEPRECATED
+ depends on EVENT_TRACING
+ bool "Deprecated power event trace API, to be removed"
+ default y
+ help
+ Provides old power event types:
+ C-state/idle accounting events:
+ power:power_start
+ power:power_end
+ and old cpufreq accounting event:
+ power:power_frequency
+ This is for userspace compatibility
+ and will vanish after 5 kernel iterations,
+ namely 2.6.41.
+
config CONTEXT_SWITCH_TRACER
- select MARKERS
bool
+config RING_BUFFER_ALLOW_SWAP
+ bool
+ help
+ Allow the use of ring_buffer_swap_cpu.
+ Adds a very slight overhead to tracing when enabled.
+
# All tracer options should select GENERIC_TRACER. For those options that are
# enabled by all tracers (context switch and event tracer) they select TRACING.
# This allows those options to appear when no other tracer is selected. But the
# options do not appear when something else selects it. We need the two options
# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
-# hidding of the automatic options options.
+# hiding of the automatic options.
config TRACING
bool
@@ -104,21 +134,21 @@ menuconfig FTRACE
bool "Tracers"
default y if DEBUG_KERNEL
help
- Enable the kernel tracing infrastructure.
+ Enable the kernel tracing infrastructure.
if FTRACE
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
- select FRAME_POINTER
+ select FRAME_POINTER if !ARM_UNWIND && !S390
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
help
Enable the kernel to trace every kernel function. This is done
by using a compiler feature to insert a small, 5-byte No-Operation
- instruction to the beginning of every kernel function, which NOP
+ instruction at the beginning of every kernel function, which NOP
sequence is then dynamically patched into a tracer call when
tracing is enabled by the administrator. If it's runtime disabled
(the bootup default), then the overhead of the instructions is very
@@ -135,7 +165,7 @@ config FUNCTION_GRAPH_TRACER
and its entry.
Its first purpose is to trace the duration of functions and
draw a call graph for each thread with some information like
- the return value. This is done by setting the current return
+ the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
@@ -143,10 +173,11 @@ config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
- depends on GENERIC_TIME
+ depends on !ARCH_USES_GETTIMEOFFSET
select TRACE_IRQFLAGS
select GENERIC_TRACER
select TRACER_MAX_TRACE
+ select RING_BUFFER_ALLOW_SWAP
help
This option measures the time spent in irqs-off critical
sections, with microsecond accuracy.
@@ -157,19 +188,20 @@ config IRQSOFF_TRACER
echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
- (Note that kernel size and overhead increases with this option
+ (Note that kernel size and overhead increase with this option
enabled. This option and the preempt-off timing option can be
used together or separately.)
config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
- depends on GENERIC_TIME
+ depends on !ARCH_USES_GETTIMEOFFSET
depends on PREEMPT
select GENERIC_TRACER
select TRACER_MAX_TRACE
+ select RING_BUFFER_ALLOW_SWAP
help
- This option measures the time spent in preemption off critical
+ This option measures the time spent in preemption-off critical
sections, with microsecond accuracy.
The default measurement method is a maximum search, which is
@@ -178,19 +210,10 @@ config PREEMPT_TRACER
echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
- (Note that kernel size and overhead increases with this option
+ (Note that kernel size and overhead increase with this option
enabled. This option and the irqs-off timing option can be
used together or separately.)
-config SYSPROF_TRACER
- bool "Sysprof Tracer"
- depends on X86
- select GENERIC_TRACER
- select CONTEXT_SWITCH_TRACER
- help
- This tracer provides the trace needed by the 'Sysprof' userspace
- tool.
-
config SCHED_TRACER
bool "Scheduling Latency Tracer"
select GENERIC_TRACER
@@ -205,35 +228,18 @@ config ENABLE_DEFAULT_TRACERS
depends on !GENERIC_TRACER
select TRACING
help
- This tracer hooks to various trace points in the kernel
+ This tracer hooks to various trace points in the kernel,
allowing the user to pick and choose which trace point they
want to trace. It also includes the sched_switch tracer plugin.
config FTRACE_SYSCALLS
bool "Trace syscalls"
- depends on HAVE_FTRACE_SYSCALLS
+ depends on HAVE_SYSCALL_TRACEPOINTS
select GENERIC_TRACER
select KALLSYMS
help
Basic tracer to catch the syscall entry and exit events.
-config BOOT_TRACER
- bool "Trace boot initcalls"
- select GENERIC_TRACER
- select CONTEXT_SWITCH_TRACER
- help
- This tracer helps developers to optimize boot times: it records
- the timings of the initcalls and traces key events and the identity
- of tasks that can cause boot delays, such as context-switches.
-
- Its aim is to be parsed by the /scripts/bootgraph.pl tool to
- produce pretty graphics about boot inefficiencies, giving a visual
- representation of the delays during initcalls - but the raw
- /debug/tracing/trace text output is readable too.
-
- You must pass in ftrace=initcall to the kernel command line
- to enable this on bootup.
-
config TRACE_BRANCH_PROFILING
bool
select GENERIC_TRACER
@@ -248,19 +254,19 @@ choice
The likely/unlikely profiler only looks at the conditions that
are annotated with a likely or unlikely macro.
- The "all branch" profiler will profile every if statement in the
+ The "all branch" profiler will profile every if-statement in the
kernel. This profiler will also enable the likely/unlikely
- profiler as well.
+ profiler.
- Either of the above profilers add a bit of overhead to the system.
- If unsure choose "No branch profiling".
+ Either of the above profilers adds a bit of overhead to the system.
+ If unsure, choose "No branch profiling".
config BRANCH_PROFILE_NONE
bool "No branch profiling"
help
- No branch profiling. Branch profiling adds a bit of overhead.
- Only enable it if you want to analyse the branching behavior.
- Otherwise keep it disabled.
+ No branch profiling. Branch profiling adds a bit of overhead.
+ Only enable it if you want to analyse the branching behavior.
+ Otherwise keep it disabled.
config PROFILE_ANNOTATED_BRANCHES
bool "Trace likely/unlikely profiler"
@@ -271,7 +277,7 @@ config PROFILE_ANNOTATED_BRANCHES
/sys/kernel/debug/tracing/profile_annotated_branch
- Note: this will add a significant overhead, only turn this
+ Note: this will add a significant overhead; only turn this
on if you need to profile the system's use of these macros.
config PROFILE_ALL_BRANCHES
@@ -288,7 +294,7 @@ config PROFILE_ALL_BRANCHES
This configuration, when enabled, will impose a great overhead
on the system. This should only be enabled when the system
- is to be analyzed
+ is to be analyzed in much detail.
endchoice
config TRACING_BRANCHES
@@ -313,16 +319,6 @@ config BRANCH_TRACER
Say N if unsure.
-config POWER_TRACER
- bool "Trace power consumption behavior"
- depends on X86
- select GENERIC_TRACER
- help
- This tracer helps developers to analyze and optimize the kernels
- power management decisions, specifically the C-state and P-state
- behavior.
-
-
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
@@ -347,47 +343,8 @@ config STACK_TRACER
Say N if unsure.
-config HW_BRANCH_TRACER
- depends on HAVE_HW_BRANCH_TRACER
- bool "Trace hw branches"
- select GENERIC_TRACER
- help
- This tracer records all branches on the system in a circular
- buffer giving access to the last N branches for each cpu.
-
-config KMEMTRACE
- bool "Trace SLAB allocations"
- select GENERIC_TRACER
- help
- kmemtrace provides tracing for slab allocator functions, such as
- kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
- data is then fed to the userspace application in order to analyse
- allocation hotspots, internal fragmentation and so on, making it
- possible to see how well an allocator performs, as well as debug
- and profile kernel code.
-
- This requires an userspace application to use. See
- Documentation/trace/kmemtrace.txt for more information.
-
- Saying Y will make the kernel somewhat larger and slower. However,
- if you disable kmemtrace at run-time or boot-time, the performance
- impact is minimal (depending on the arch the kernel is built for).
-
- If unsure, say N.
-
-config WORKQUEUE_TRACER
- bool "Trace workqueues"
- select GENERIC_TRACER
- help
- The workqueue tracer provides some statistical informations
- about each cpu workqueue thread such as the number of the
- works inserted and executed since their creation. It can help
- to evaluate the amount of work each of them have to perform.
- For example it can help a developer to decide whether he should
- choose a per cpu workqueue instead of a singlethreaded one.
-
config BLK_DEV_IO_TRACE
- bool "Support for tracing block io actions"
+ bool "Support for tracing block IO actions"
depends on SYSFS
depends on BLOCK
select RELAY
@@ -411,38 +368,55 @@ config BLK_DEV_IO_TRACE
If unsure, say N.
+config KPROBE_EVENT
+ depends on KPROBES
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ bool "Enable kprobes-based dynamic events"
+ select TRACING
+ default y
+ help
+ This allows the user to add tracing events (similar to tracepoints)
+ on the fly via the ftrace interface. See
+ Documentation/trace/kprobetrace.txt for more details.
+
+ Those events can be inserted wherever kprobes can probe, and record
+ various register and memory values.
+
+ This option is also required by perf-probe subcommand of perf tools.
+ If you want to use perf tools, this option is strongly recommended.
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
help
- This option will modify all the calls to ftrace dynamically
- (will patch them out of the binary image and replaces them
- with a No-Op instruction) as they are called. A table is
- created to dynamically enable them again.
+ This option will modify all the calls to ftrace dynamically
+ (will patch them out of the binary image and replace them
+ with a No-Op instruction) as they are called. A table is
+ created to dynamically enable them again.
- This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise
- has native performance as long as no tracing is active.
+ This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
+ otherwise has native performance as long as no tracing is active.
- The changes to the code are done by a kernel thread that
- wakes up once a second and checks to see if any ftrace calls
- were made. If so, it runs stop_machine (stops all CPUS)
- and modifies the code to jump over the call to ftrace.
+ The changes to the code are done by a kernel thread that
+ wakes up once a second and checks to see if any ftrace calls
+ were made. If so, it runs stop_machine (stops all CPUS)
+ and modifies the code to jump over the call to ftrace.
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
default n
help
- This option enables the kernel function profiler. A file is created
- in debugfs called function_profile_enabled which defaults to zero.
- When a 1 is echoed into this file profiling begins, and when a
- zero is entered, profiling stops. A file in the trace_stats
- directory called functions, that show the list of functions that
- have been hit and their counters.
+ This option enables the kernel function profiler. A file is created
+ in debugfs called function_profile_enabled which defaults to zero.
+ When a 1 is echoed into this file profiling begins, and when a
+ zero is entered, profiling stops. A "functions" file is created in
+ the trace_stats directory; this file shows the list of functions that
+ have been hit and their counters.
- If in doubt, say N
+ If in doubt, say N.
config FTRACE_MCOUNT_RECORD
def_bool y
@@ -462,6 +436,18 @@ config FTRACE_STARTUP_TEST
functioning properly. It will do tests on all the configured
tracers of ftrace.
+config EVENT_TRACE_TEST_SYSCALLS
+ bool "Run selftest on syscall events"
+ depends on FTRACE_STARTUP_TEST
+ help
+ This option will also enable testing every syscall event.
+ It only enables the event and disables it and runs various loads
+ with the event enabled. This adds a bit more time for kernel boot
+ up since it runs this on every system call defined.
+
+ TBD - enable a way to actually call the syscalls as we test their
+ events
+
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on HAVE_MMIOTRACE_SUPPORT && PCI
@@ -489,8 +475,8 @@ config RING_BUFFER_BENCHMARK
tristate "Ring buffer benchmark stress tester"
depends on RING_BUFFER
help
- This option creates a test to stress the ring buffer and bench mark it.
- It creates its own ring buffer such that it will not interfer with
+ This option creates a test to stress the ring buffer and benchmark it.
+ It creates its own ring buffer such that it will not interfere with
any other users of the ring buffer (such as ftrace). It then creates
a producer and consumer that will run for 10 seconds and sleep for
10 seconds. Each interval it will print out the number of events
@@ -499,7 +485,7 @@ config RING_BUFFER_BENCHMARK
It does not disable interrupts or raise its priority, so it may be
affected by processes that are running.
- If unsure, say N
+ If unsure, say N.
endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90..761c510a06c 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
-obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
@@ -38,12 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
obj-$(CONFIG_NOP_TRACER) += trace_nop.o
obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
-obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
-obj-$(CONFIG_POWER_TRACER) += trace_power.o
-obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
ifeq ($(CONFIG_BLOCK),y)
@@ -52,7 +47,14 @@ endif
obj-$(CONFIG_EVENT_TRACING) += trace_events.o
obj-$(CONFIG_EVENT_TRACING) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
-obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
+endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
+ifeq ($(CONFIG_TRACING),y)
+obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
+endif
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc3..7b8ec028154 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/time.h>
#include <linux/uaccess.h>
@@ -64,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
+ struct ring_buffer *buffer = NULL;
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
if (blk_tracer) {
+ buffer = blk_tr->buffer;
pc = preempt_count();
- event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
+ event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len,
0, pc);
if (!event)
@@ -95,7 +98,7 @@ record_it:
memcpy((void *) t + sizeof(*t), data, len);
if (blk_tracer)
- trace_buffer_unlock_commit(blk_tr, event, 0, pc);
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
}
}
@@ -165,9 +168,11 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
BLK_TC_ACT(BLK_TC_WRITE) };
+#define BLK_TC_RAHEAD BLK_TC_AHEAD
+
/* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
- (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
+#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
/*
* The worker for the various blk_add_trace*() types. Fills out a
@@ -178,6 +183,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
+ struct ring_buffer *buffer = NULL;
struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
@@ -189,9 +195,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
return;
what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, BARRIER);
- what |= MASK_TC_BIT(rw, SYNCIO);
- what |= MASK_TC_BIT(rw, AHEAD);
+ what |= MASK_TC_BIT(rw, SYNC);
+ what |= MASK_TC_BIT(rw, RAHEAD);
what |= MASK_TC_BIT(rw, META);
what |= MASK_TC_BIT(rw, DISCARD);
@@ -203,8 +208,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (blk_tracer) {
tracing_record_cmdline(current);
+ buffer = blk_tr->buffer;
pc = preempt_count();
- event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
+ event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len,
0, pc);
if (!event)
@@ -251,7 +257,7 @@ record_it:
memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
if (blk_tracer) {
- trace_buffer_unlock_commit(blk_tr, event, 0, pc);
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
return;
}
}
@@ -266,8 +272,8 @@ static void blk_trace_free(struct blk_trace *bt)
{
debugfs_remove(bt->msg_file);
debugfs_remove(bt->dropped_file);
- debugfs_remove(bt->dir);
relay_close(bt->rchan);
+ debugfs_remove(bt->dir);
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -317,6 +323,7 @@ static const struct file_operations blk_dropped_fops = {
.owner = THIS_MODULE,
.open = blk_dropped_open,
.read = blk_dropped_read,
+ .llseek = default_llseek,
};
static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -356,6 +363,7 @@ static const struct file_operations blk_msg_fops = {
.owner = THIS_MODULE,
.open = blk_msg_open,
.write = blk_msg_write,
+ .llseek = noop_llseek,
};
/*
@@ -377,18 +385,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
- struct dentry *parent = dentry->d_parent;
debugfs_remove(dentry);
- /*
- * this will fail for all but the last file, but that is ok. what we
- * care about is the top level buts->name directory going away, when
- * the last trace file is gone. Then we don't have to rmdir() that
- * manually on trace stop, so it nicely solves the issue with
- * force killing of running traces.
- */
-
- debugfs_remove(parent);
return 0;
}
@@ -545,13 +543,49 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (ret)
return ret;
- if (copy_to_user(arg, &buts, sizeof(buts)))
+ if (copy_to_user(arg, &buts, sizeof(buts))) {
+ blk_trace_remove(q);
return -EFAULT;
-
+ }
return 0;
}
EXPORT_SYMBOL_GPL(blk_trace_setup);
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+static int compat_blk_trace_setup(struct request_queue *q, char *name,
+ dev_t dev, struct block_device *bdev,
+ char __user *arg)
+{
+ struct blk_user_trace_setup buts;
+ struct compat_blk_user_trace_setup cbuts;
+ int ret;
+
+ if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
+ return -EFAULT;
+
+ buts = (struct blk_user_trace_setup) {
+ .act_mask = cbuts.act_mask,
+ .buf_size = cbuts.buf_size,
+ .buf_nr = cbuts.buf_nr,
+ .start_lba = cbuts.start_lba,
+ .end_lba = cbuts.end_lba,
+ .pid = cbuts.pid,
+ };
+ memcpy(&buts.name, &cbuts.name, 32);
+
+ ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, &buts.name, 32)) {
+ blk_trace_remove(q);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+#endif
+
int blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
@@ -611,6 +645,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
bdevname(bdev, b);
ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+ case BLKTRACESETUP32:
+ bdevname(bdev, b);
+ ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ break;
+#endif
case BLKTRACESTART:
start = 1;
case BLKTRACESTOP:
@@ -664,10 +704,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (likely(!bt))
return;
- if (blk_discard_rq(rq))
- rw |= (1 << BIO_RW_DISCARD);
+ if (rq->cmd_flags & REQ_DISCARD)
+ rw |= REQ_DISCARD;
+
+ if (rq->cmd_flags & REQ_SECURE)
+ rw |= REQ_SECURE;
- if (blk_pc_request(rq)) {
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
what, rq->errors, rq->cmd_len, rq->cmd);
@@ -678,28 +721,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
}
}
-static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_abort(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_ABORT);
}
-static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_insert(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_INSERT);
}
-static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_issue(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
}
-static void blk_add_trace_rq_requeue(struct request_queue *q,
+static void blk_add_trace_rq_requeue(void *ignore,
+ struct request_queue *q,
struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
}
-static void blk_add_trace_rq_complete(struct request_queue *q,
+static void blk_add_trace_rq_complete(void *ignore,
+ struct request_queue *q,
struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -727,34 +775,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
}
-static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_bounce(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
}
-static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_complete(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
}
-static void blk_add_trace_bio_backmerge(struct request_queue *q,
+static void blk_add_trace_bio_backmerge(void *ignore,
+ struct request_queue *q,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
}
-static void blk_add_trace_bio_frontmerge(struct request_queue *q,
+static void blk_add_trace_bio_frontmerge(void *ignore,
+ struct request_queue *q,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
}
-static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_queue(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
}
-static void blk_add_trace_getrq(struct request_queue *q,
+static void blk_add_trace_getrq(void *ignore,
+ struct request_queue *q,
struct bio *bio, int rw)
{
if (bio)
@@ -768,7 +822,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
}
-static void blk_add_trace_sleeprq(struct request_queue *q,
+static void blk_add_trace_sleeprq(void *ignore,
+ struct request_queue *q,
struct bio *bio, int rw)
{
if (bio)
@@ -782,7 +837,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
}
}
-static void blk_add_trace_plug(struct request_queue *q)
+static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -790,7 +845,7 @@ static void blk_add_trace_plug(struct request_queue *q)
__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
-static void blk_add_trace_unplug_io(struct request_queue *q)
+static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -803,7 +858,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
}
}
-static void blk_add_trace_unplug_timer(struct request_queue *q)
+static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -816,7 +871,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
}
}
-static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+static void blk_add_trace_split(void *ignore,
+ struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
struct blk_trace *bt = q->blk_trace;
@@ -832,6 +888,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
/**
* blk_add_trace_remap - Add a trace for a remap operation
+ * @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @bio: the source bio
* @dev: target device
@@ -842,8 +899,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
* it spans a stripe (or similar). Add a trace for that action.
*
**/
-static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_remap(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ dev_t dev, sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
@@ -861,6 +919,39 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
}
/**
+ * blk_add_trace_rq_remap - Add a trace for a request-remap operation
+ * @ignore: trace callback data parameter (not used)
+ * @q: queue the io is for
+ * @rq: the source request
+ * @dev: target device
+ * @from: source sector
+ *
+ * Description:
+ * Device mapper remaps request to other devices.
+ * Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_rq_remap(void *ignore,
+ struct request_queue *q,
+ struct request *rq, dev_t dev,
+ sector_t from)
+{
+ struct blk_trace *bt = q->blk_trace;
+ struct blk_io_trace_remap r;
+
+ if (likely(!bt))
+ return;
+
+ r.device_from = cpu_to_be32(dev);
+ r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
+ r.sector_from = cpu_to_be64(from);
+
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+ rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+ sizeof(r), &r);
+}
+
+/**
* blk_add_driver_data - Add binary message with driver-specific data
* @q: queue the io is for
* @rq: io request
@@ -880,7 +971,7 @@ void blk_add_driver_data(struct request_queue *q,
if (likely(!bt))
return;
- if (blk_pc_request(rq))
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
else
@@ -893,61 +984,64 @@ static void blk_register_tracepoints(void)
{
int ret;
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+ ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+ ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+ ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+ ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+ ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+ ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+ ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+ ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+ ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+ ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
- ret = register_trace_block_getrq(blk_add_trace_getrq);
+ ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+ ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
- ret = register_trace_block_plug(blk_add_trace_plug);
+ ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+ ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+ ret = register_trace_block_split(blk_add_trace_split, NULL);
WARN_ON(ret);
- ret = register_trace_block_split(blk_add_trace_split);
+ ret = register_trace_block_remap(blk_add_trace_remap, NULL);
WARN_ON(ret);
- ret = register_trace_block_remap(blk_add_trace_remap);
+ ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
WARN_ON(ret);
}
static void blk_unregister_tracepoints(void)
{
- unregister_trace_block_remap(blk_add_trace_remap);
- unregister_trace_block_split(blk_add_trace_split);
- unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
- unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
- unregister_trace_block_plug(blk_add_trace_plug);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
- unregister_trace_block_getrq(blk_add_trace_getrq);
- unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
- unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
- unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
- unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
- unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
- unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
- unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
- unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+ unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
+ unregister_trace_block_remap(blk_add_trace_remap, NULL);
+ unregister_trace_block_split(blk_add_trace_split, NULL);
+ unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+ unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
+ unregister_trace_block_plug(blk_add_trace_plug, NULL);
+ unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
+ unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
+ unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
+ unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
+ unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
+ unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
+ unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
+ unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
+ unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
+ unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
+ unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
tracepoint_synchronize_unregister();
}
@@ -1290,7 +1384,7 @@ out:
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
return print_one_line(iter, false);
}
@@ -1312,7 +1406,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
}
static enum print_line_t
-blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return blk_trace_synthesize_old_trace(iter) ?
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1350,12 +1445,16 @@ static struct tracer blk_tracer __read_mostly = {
.set_flag = blk_tracer_set_flag,
};
-static struct trace_event trace_blk_event = {
- .type = TRACE_BLK,
+static struct trace_event_functions trace_blk_event_funcs = {
.trace = blk_trace_event_print,
.binary = blk_trace_event_print_binary,
};
+static struct trace_event trace_blk_event = {
+ .type = TRACE_BLK,
+ .funcs = &trace_blk_event_funcs,
+};
+
static int __init init_blk_tracer(void)
{
if (!register_ftrace_event(&trace_blk_event)) {
@@ -1550,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct block_device *bdev;
ssize_t ret = -ENXIO;
- lock_kernel();
bdev = bdget(part_devt(p));
if (bdev == NULL)
- goto out_unlock_kernel;
+ goto out;
q = blk_trace_get_queue(bdev);
if (q == NULL)
@@ -1581,8 +1679,7 @@ out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
out_bdput:
bdput(bdev);
-out_unlock_kernel:
- unlock_kernel();
+out:
return ret;
}
@@ -1612,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
ret = -ENXIO;
- lock_kernel();
p = dev_to_part(dev);
bdev = bdget(part_devt(p));
if (bdev == NULL)
- goto out_unlock_kernel;
+ goto out;
q = blk_trace_get_queue(bdev);
if (q == NULL)
@@ -1651,8 +1747,6 @@ out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
out_bdput:
bdput(bdev);
-out_unlock_kernel:
- unlock_kernel();
out:
return ret ? ret : count;
}
@@ -1662,6 +1756,11 @@ int blk_trace_init_sysfs(struct device *dev)
return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
}
+void blk_trace_remove_sysfs(struct device *dev)
+{
+ sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
+}
+
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#ifdef CONFIG_EVENT_TRACING
@@ -1672,7 +1771,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
int len = rq->cmd_len;
unsigned char *cmd = rq->cmd;
- if (!blk_pc_request(rq)) {
+ if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
buf[0] = '\0';
return;
}
@@ -1697,21 +1796,21 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
if (rw & WRITE)
rwbs[i++] = 'W';
- else if (rw & 1 << BIO_RW_DISCARD)
+ else if (rw & REQ_DISCARD)
rwbs[i++] = 'D';
else if (bytes)
rwbs[i++] = 'R';
else
rwbs[i++] = 'N';
- if (rw & 1 << BIO_RW_AHEAD)
+ if (rw & REQ_RAHEAD)
rwbs[i++] = 'A';
- if (rw & 1 << BIO_RW_BARRIER)
- rwbs[i++] = 'B';
- if (rw & 1 << BIO_RW_SYNCIO)
+ if (rw & REQ_SYNC)
rwbs[i++] = 'S';
- if (rw & 1 << BIO_RW_META)
+ if (rw & REQ_META)
rwbs[i++] = 'M';
+ if (rw & REQ_SECURE)
+ rwbs[i++] = 'E';
rwbs[i] = '\0';
}
@@ -1721,8 +1820,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
int rw = rq->cmd_flags & 0x03;
int bytes;
- if (blk_discard_rq(rq))
- rw |= (1 << BIO_RW_DISCARD);
+ if (rq->cmd_flags & REQ_DISCARD)
+ rw |= REQ_DISCARD;
+
+ if (rq->cmd_flags & REQ_SECURE)
+ rw |= REQ_SECURE;
bytes = blk_rq_bytes(rq);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3718d55fb4c..f3dadae8388 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,12 +22,13 @@
#include <linux/hardirq.h>
#include <linux/kthread.h>
#include <linux/uaccess.h>
-#include <linux/kprobes.h>
#include <linux/ftrace.h>
#include <linux/sysctl.h>
+#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/hash.h>
+#include <linux/rcupdate.h>
#include <trace/events/sched.h>
@@ -60,6 +61,13 @@ static int last_ftrace_enabled;
/* Quick disabling of function tracer. */
int function_trace_stop;
+/* List for set_ftrace_pid's pids. */
+LIST_HEAD(ftrace_pids);
+struct ftrace_pid {
+ struct list_head list;
+ struct pid *pid;
+};
+
/*
* ftrace_disabled is set when an anomaly is discovered.
* ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,18 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
+/*
+ * Traverse the ftrace_list, invoking all entries. The reason that we
+ * can use rcu_dereference_raw() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism. The rcu_dereference_raw() calls are needed to handle
+ * concurrent insertions into the ftrace_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
{
- struct ftrace_ops *op = ftrace_list;
-
- /* in case someone actually ports this to alpha! */
- read_barrier_depends();
+ struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
while (op != &ftrace_list_end) {
- /* silly alpha */
- read_barrier_depends();
op->func(ip, parent_ip);
- op = op->next;
+ op = rcu_dereference_raw(op->next); /*see above*/
};
}
@@ -144,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
* the ops->next pointer is valid before another CPU sees
* the ops pointer included into the ftrace_list.
*/
- smp_wmb();
- ftrace_list = ops;
+ rcu_assign_pointer(ftrace_list, ops);
if (ftrace_enabled) {
ftrace_func_t func;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
else
func = ftrace_list_func;
- if (ftrace_pid_trace) {
+ if (!list_empty(&ftrace_pids)) {
set_ftrace_pid_function(func);
func = ftrace_pid_func;
}
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (ftrace_list->next == &ftrace_list_end) {
ftrace_func_t func = ftrace_list->func;
- if (ftrace_pid_trace) {
+ if (!list_empty(&ftrace_pids)) {
set_ftrace_pid_function(func);
func = ftrace_pid_func;
}
@@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void)
if (ftrace_trace_function == ftrace_stub)
return;
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
func = ftrace_trace_function;
+#else
+ func = __ftrace_trace_function;
+#endif
- if (ftrace_pid_trace) {
+ if (!list_empty(&ftrace_pids)) {
set_ftrace_pid_function(func);
func = ftrace_pid_func;
} else {
@@ -249,6 +264,7 @@ struct ftrace_profile {
unsigned long counter;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
unsigned long long time;
+ unsigned long long time_squared;
#endif
};
@@ -291,7 +307,9 @@ function_stat_next(void *v, int idx)
pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
again:
- rec++;
+ if (idx != 0)
+ rec++;
+
if ((void *)rec >= (void *)&pg->records[pg->index]) {
pg = pg->next;
if (!pg)
@@ -349,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
{
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_printf(m, " Function "
- "Hit Time Avg\n"
+ "Hit Time Avg s^2\n"
" -------- "
- "--- ---- ---\n");
+ "--- ---- --- ---\n");
#else
seq_printf(m, " Function Hit\n"
" -------- ---\n");
@@ -363,11 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
{
struct ftrace_profile *rec = v;
char str[KSYM_SYMBOL_LEN];
+ int ret = 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- static DEFINE_MUTEX(mutex);
static struct trace_seq s;
unsigned long long avg;
+ unsigned long long stddev;
#endif
+ mutex_lock(&ftrace_profile_lock);
+
+ /* we raced with function_profile_reset() */
+ if (unlikely(rec->counter == 0)) {
+ ret = -EBUSY;
+ goto out;
+ }
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
@@ -377,17 +403,31 @@ static int function_stat_show(struct seq_file *m, void *v)
avg = rec->time;
do_div(avg, rec->counter);
- mutex_lock(&mutex);
+ /* Sample standard deviation (s^2) */
+ if (rec->counter <= 1)
+ stddev = 0;
+ else {
+ stddev = rec->time_squared - rec->counter * avg * avg;
+ /*
+ * Divide only 1000 for ns^2 -> us^2 conversion.
+ * trace_print_graph_duration will divide 1000 again.
+ */
+ do_div(stddev, (rec->counter - 1) * 1000);
+ }
+
trace_seq_init(&s);
trace_print_graph_duration(rec->time, &s);
trace_seq_puts(&s, " ");
trace_print_graph_duration(avg, &s);
+ trace_seq_puts(&s, " ");
+ trace_print_graph_duration(stddev, &s);
trace_print_seq(m, &s);
- mutex_unlock(&mutex);
#endif
seq_putc(m, '\n');
+out:
+ mutex_unlock(&ftrace_profile_lock);
- return 0;
+ return ret;
}
static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -633,6 +673,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
if (!stat->hash || !ftrace_profile_enabled)
goto out;
+ /* If the calltime was zero'd ignore it */
+ if (!trace->calltime)
+ goto out;
+
calltime = trace->rettime - trace->calltime;
if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -651,8 +695,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
}
rec = ftrace_find_profiled_func(stat, trace->func);
- if (rec)
+ if (rec) {
rec->time += calltime;
+ rec->time_squared += calltime * calltime;
+ }
out:
local_irq_restore(flags);
@@ -734,7 +780,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
out:
mutex_unlock(&ftrace_profile_lock);
- filp->f_pos += cnt;
+ *ppos += cnt;
return cnt;
}
@@ -754,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
.open = tracing_open_generic,
.read = ftrace_profile_read,
.write = ftrace_profile_write,
+ .llseek = default_llseek,
};
/* used to initialize the real stat files */
@@ -766,7 +813,7 @@ static struct tracer_stat function_stats __initdata = {
.stat_show = function_stat_show
};
-static void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
{
struct ftrace_profile_stat *stat;
struct dentry *entry;
@@ -784,7 +831,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
* The files created are permanent, if something happens
* we still do not free memory.
*/
- kfree(stat);
WARN(1,
"Could not allocate stat file for cpu %d\n",
cpu);
@@ -811,13 +857,11 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
}
#else /* CONFIG_FUNCTION_PROFILER */
-static void ftrace_profile_debugfs(struct dentry *d_tracer)
+static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
{
}
#endif /* CONFIG_FUNCTION_PROFILER */
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -841,10 +885,8 @@ enum {
FTRACE_ENABLE_CALLS = (1 << 0),
FTRACE_DISABLE_CALLS = (1 << 1),
FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
- FTRACE_ENABLE_MCOUNT = (1 << 3),
- FTRACE_DISABLE_MCOUNT = (1 << 4),
- FTRACE_START_FUNC_RET = (1 << 5),
- FTRACE_STOP_FUNC_RET = (1 << 6),
+ FTRACE_START_FUNC_RET = (1 << 3),
+ FTRACE_STOP_FUNC_RET = (1 << 4),
};
static int ftrace_filtered;
@@ -884,36 +926,6 @@ static struct dyn_ftrace *ftrace_free_records;
} \
}
-#ifdef CONFIG_KPROBES
-
-static int frozen_record_count;
-
-static inline void freeze_record(struct dyn_ftrace *rec)
-{
- if (!(rec->flags & FTRACE_FL_FROZEN)) {
- rec->flags |= FTRACE_FL_FROZEN;
- frozen_record_count++;
- }
-}
-
-static inline void unfreeze_record(struct dyn_ftrace *rec)
-{
- if (rec->flags & FTRACE_FL_FROZEN) {
- rec->flags &= ~FTRACE_FL_FROZEN;
- frozen_record_count--;
- }
-}
-
-static inline int record_frozen(struct dyn_ftrace *rec)
-{
- return rec->flags & FTRACE_FL_FROZEN;
-}
-#else
-# define freeze_record(rec) ({ 0; })
-# define unfreeze_record(rec) ({ 0; })
-# define record_frozen(rec) ({ 0; })
-#endif /* CONFIG_KPROBES */
-
static void ftrace_free_rec(struct dyn_ftrace *rec)
{
rec->freelist = ftrace_free_records;
@@ -1011,75 +1023,54 @@ static void ftrace_bug(int failed, unsigned long ip)
}
+/* Return 1 if the address range is reserved for ftrace */
+int ftrace_text_reserved(void *start, void *end)
+{
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (rec->ip <= (unsigned long)end &&
+ rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
+ return 1;
+ } while_for_each_ftrace_rec();
+ return 0;
+}
+
+
static int
__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
- unsigned long ip, fl;
+ unsigned long flag = 0UL;
ftrace_addr = (unsigned long)FTRACE_ADDR;
- ip = rec->ip;
-
/*
- * If this record is not to be traced and
- * it is not enabled then do nothing.
+ * If this record is not to be traced or we want to disable it,
+ * then disable it.
*
- * If this record is not to be traced and
- * it is enabled then disable it.
+ * If we want to enable it and filtering is off, then enable it.
*
+ * If we want to enable it and filtering is on, enable it only if
+ * it's filtered
*/
- if (rec->flags & FTRACE_FL_NOTRACE) {
- if (rec->flags & FTRACE_FL_ENABLED)
- rec->flags &= ~FTRACE_FL_ENABLED;
- else
- return 0;
-
- } else if (ftrace_filtered && enable) {
- /*
- * Filtering is on:
- */
-
- fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
-
- /* Record is filtered and enabled, do nothing */
- if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
- return 0;
-
- /* Record is not filtered or enabled, do nothing */
- if (!fl)
- return 0;
-
- /* Record is not filtered but enabled, disable it */
- if (fl == FTRACE_FL_ENABLED)
- rec->flags &= ~FTRACE_FL_ENABLED;
- else
- /* Otherwise record is filtered but not enabled, enable it */
- rec->flags |= FTRACE_FL_ENABLED;
- } else {
- /* Disable or not filtered */
-
- if (enable) {
- /* if record is enabled, do nothing */
- if (rec->flags & FTRACE_FL_ENABLED)
- return 0;
-
- rec->flags |= FTRACE_FL_ENABLED;
-
- } else {
+ if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
+ if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
+ flag = FTRACE_FL_ENABLED;
+ }
- /* if record is not enabled, do nothing */
- if (!(rec->flags & FTRACE_FL_ENABLED))
- return 0;
+ /* If the state of this record hasn't changed, then do nothing */
+ if ((rec->flags & FTRACE_FL_ENABLED) == flag)
+ return 0;
- rec->flags &= ~FTRACE_FL_ENABLED;
- }
+ if (flag) {
+ rec->flags |= FTRACE_FL_ENABLED;
+ return ftrace_make_call(rec, ftrace_addr);
}
- if (rec->flags & FTRACE_FL_ENABLED)
- return ftrace_make_call(rec, ftrace_addr);
- else
- return ftrace_make_nop(NULL, rec, ftrace_addr);
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ return ftrace_make_nop(NULL, rec, ftrace_addr);
}
static void ftrace_replace_code(int enable)
@@ -1098,25 +1089,12 @@ static void ftrace_replace_code(int enable)
!(rec->flags & FTRACE_FL_CONVERTED))
continue;
- /* ignore updates to this record's mcount site */
- if (get_kprobe((void *)rec->ip)) {
- freeze_record(rec);
- continue;
- } else {
- unfreeze_record(rec);
- }
-
failed = __ftrace_replace_code(rec, enable);
if (failed) {
rec->flags |= FTRACE_FL_FAILED;
- if ((system_state == SYSTEM_BOOTING) ||
- !core_kernel_text(rec->ip)) {
- ftrace_free_rec(rec);
- } else {
- ftrace_bug(failed, rec->ip);
- /* Stop processing */
- return;
- }
+ ftrace_bug(failed, rec->ip);
+ /* Stop processing */
+ return;
}
} while_for_each_ftrace_rec();
}
@@ -1247,8 +1225,6 @@ static void ftrace_shutdown(int command)
static void ftrace_startup_sysctl(void)
{
- int command = FTRACE_ENABLE_MCOUNT;
-
if (unlikely(ftrace_disabled))
return;
@@ -1256,23 +1232,17 @@ static void ftrace_startup_sysctl(void)
saved_ftrace_func = NULL;
/* ftrace_start_up is true if we want ftrace running */
if (ftrace_start_up)
- command |= FTRACE_ENABLE_CALLS;
-
- ftrace_run_update_code(command);
+ ftrace_run_update_code(FTRACE_ENABLE_CALLS);
}
static void ftrace_shutdown_sysctl(void)
{
- int command = FTRACE_DISABLE_MCOUNT;
-
if (unlikely(ftrace_disabled))
return;
/* ftrace_start_up is true if ftrace is running */
if (ftrace_start_up)
- command |= FTRACE_DISABLE_CALLS;
-
- ftrace_run_update_code(command);
+ ftrace_run_update_code(FTRACE_DISABLE_CALLS);
}
static cycle_t ftrace_update_time;
@@ -1297,12 +1267,34 @@ static int ftrace_update_code(struct module *mod)
ftrace_new_addrs = p->newlist;
p->flags = 0L;
- /* convert record (i.e, patch mcount-call with NOP) */
- if (ftrace_code_disable(mod, p)) {
- p->flags |= FTRACE_FL_CONVERTED;
- ftrace_update_cnt++;
- } else
+ /*
+ * Do the initial record convertion from mcount jump
+ * to the NOP instructions.
+ */
+ if (!ftrace_code_disable(mod, p)) {
ftrace_free_rec(p);
+ continue;
+ }
+
+ p->flags |= FTRACE_FL_CONVERTED;
+ ftrace_update_cnt++;
+
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediatelly is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ */
+ if (ftrace_start_up) {
+ int failed = __ftrace_replace_code(p, 1);
+ if (failed) {
+ ftrace_bug(failed, p->ip);
+ ftrace_free_rec(p);
+ }
+ }
}
stop = ftrace_now(raw_smp_processor_id());
@@ -1358,36 +1350,38 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
enum {
FTRACE_ITER_FILTER = (1 << 0),
- FTRACE_ITER_CONT = (1 << 1),
- FTRACE_ITER_NOTRACE = (1 << 2),
- FTRACE_ITER_FAILURES = (1 << 3),
- FTRACE_ITER_PRINTALL = (1 << 4),
- FTRACE_ITER_HASH = (1 << 5),
+ FTRACE_ITER_NOTRACE = (1 << 1),
+ FTRACE_ITER_FAILURES = (1 << 2),
+ FTRACE_ITER_PRINTALL = (1 << 3),
+ FTRACE_ITER_HASH = (1 << 4),
};
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
struct ftrace_iterator {
- struct ftrace_page *pg;
- int hidx;
- int idx;
- unsigned flags;
- unsigned char buffer[FTRACE_BUFF_MAX+1];
- unsigned buffer_idx;
- unsigned filtered;
+ loff_t pos;
+ loff_t func_pos;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *func;
+ struct ftrace_func_probe *probe;
+ struct trace_parser parser;
+ int hidx;
+ int idx;
+ unsigned flags;
};
static void *
-t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+t_hash_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- struct hlist_node *hnd = v;
+ struct hlist_node *hnd = NULL;
struct hlist_head *hhd;
- WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
-
(*pos)++;
+ iter->pos = *pos;
+ if (iter->probe)
+ hnd = &iter->probe->node;
retry:
if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
return NULL;
@@ -1410,35 +1404,51 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
}
}
- return hnd;
+ if (WARN_ON_ONCE(!hnd))
+ return NULL;
+
+ iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
+
+ return iter;
}
static void *t_hash_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
void *p = NULL;
+ loff_t l;
+ if (iter->func_pos > *pos)
+ return NULL;
+
+ iter->hidx = 0;
+ for (l = 0; l <= (*pos - iter->func_pos); ) {
+ p = t_hash_next(m, &l);
+ if (!p)
+ break;
+ }
+ if (!p)
+ return NULL;
+
+ /* Only set this if we have an item */
iter->flags |= FTRACE_ITER_HASH;
- return t_hash_next(m, p, pos);
+ return iter;
}
-static int t_hash_show(struct seq_file *m, void *v)
+static int
+t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
{
struct ftrace_func_probe *rec;
- struct hlist_node *hnd = v;
- char str[KSYM_SYMBOL_LEN];
- rec = hlist_entry(hnd, struct ftrace_func_probe, node);
+ rec = iter->probe;
+ if (WARN_ON_ONCE(!rec))
+ return -EIO;
if (rec->ops->print)
return rec->ops->print(m, rec->ip, rec->ops, rec->data);
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
- seq_printf(m, "%s:", str);
-
- kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
- seq_printf(m, "%s", str);
+ seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
if (rec->data)
seq_printf(m, ":%p", rec->data);
@@ -1454,12 +1464,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
struct dyn_ftrace *rec = NULL;
if (iter->flags & FTRACE_ITER_HASH)
- return t_hash_next(m, v, pos);
+ return t_hash_next(m, pos);
(*pos)++;
+ iter->pos = *pos;
if (iter->flags & FTRACE_ITER_PRINTALL)
- return NULL;
+ return t_hash_start(m, pos);
retry:
if (iter->idx >= iter->pg->index) {
@@ -1467,8 +1478,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
iter->pg = iter->pg->next;
iter->idx = 0;
goto retry;
- } else {
- iter->idx = -1;
}
} else {
rec = &iter->pg->records[iter->idx++];
@@ -1490,16 +1499,36 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
}
}
- return rec;
+ if (!rec)
+ return t_hash_start(m, pos);
+
+ iter->func_pos = *pos;
+ iter->func = rec;
+
+ return iter;
+}
+
+static void reset_iter_read(struct ftrace_iterator *iter)
+{
+ iter->pos = 0;
+ iter->func_pos = 0;
+ iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
void *p = NULL;
+ loff_t l;
mutex_lock(&ftrace_lock);
/*
+ * If an lseek was done, then reset and start from beginning.
+ */
+ if (*pos < iter->pos)
+ reset_iter_read(iter);
+
+ /*
* For set_ftrace_filter reading, if we have the filter
* off, we can short cut and just print out that all
* functions are enabled.
@@ -1508,26 +1537,35 @@ static void *t_start(struct seq_file *m, loff_t *pos)
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
- (*pos)++;
+ /* reset in case of seek/pread */
+ iter->flags &= ~FTRACE_ITER_HASH;
return iter;
}
if (iter->flags & FTRACE_ITER_HASH)
return t_hash_start(m, pos);
- if (*pos > 0) {
- if (iter->idx < 0)
- return p;
- (*pos)--;
- iter->idx--;
+ /*
+ * Unfortunately, we need to restart at ftrace_pages_start
+ * every time we let go of the ftrace_mutex. This is because
+ * those pointers can change without the lock.
+ */
+ iter->pg = ftrace_pages_start;
+ iter->idx = 0;
+ for (l = 0; l <= *pos; ) {
+ p = t_next(m, p, &l);
+ if (!p)
+ break;
}
- p = t_next(m, p, pos);
+ if (!p) {
+ if (iter->flags & FTRACE_ITER_FILTER)
+ return t_hash_start(m, pos);
- if (!p)
- return t_hash_start(m, pos);
+ return NULL;
+ }
- return p;
+ return iter;
}
static void t_stop(struct seq_file *m, void *p)
@@ -1538,28 +1576,27 @@ static void t_stop(struct seq_file *m, void *p)
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
- struct dyn_ftrace *rec = v;
- char str[KSYM_SYMBOL_LEN];
+ struct dyn_ftrace *rec;
if (iter->flags & FTRACE_ITER_HASH)
- return t_hash_show(m, v);
+ return t_hash_show(m, iter);
if (iter->flags & FTRACE_ITER_PRINTALL) {
seq_printf(m, "#### all functions enabled ####\n");
return 0;
}
+ rec = iter->func;
+
if (!rec)
return 0;
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%ps\n", (void *)rec->ip);
return 0;
}
-static struct seq_operations show_ftrace_seq_ops = {
+static const struct seq_operations show_ftrace_seq_ops = {
.start = t_start,
.next = t_next,
.stop = t_stop,
@@ -1593,17 +1630,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
return ret;
}
-int ftrace_avail_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = (struct seq_file *)file->private_data;
- struct ftrace_iterator *iter = m->private;
-
- seq_release(inode, file);
- kfree(iter);
-
- return 0;
-}
-
static int
ftrace_failures_open(struct inode *inode, struct file *file)
{
@@ -1613,8 +1639,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
ret = ftrace_avail_open(inode, file);
if (!ret) {
- m = (struct seq_file *)file->private_data;
- iter = (struct ftrace_iterator *)m->private;
+ m = file->private_data;
+ iter = m->private;
iter->flags = FTRACE_ITER_FAILURES;
}
@@ -1652,9 +1678,14 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
if (!iter)
return -ENOMEM;
+ if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
+ kfree(iter);
+ return -ENOMEM;
+ }
+
mutex_lock(&ftrace_regex_lock);
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND))
+ (file->f_flags & O_TRUNC))
ftrace_filter_reset(enable);
if (file->f_mode & FMODE_READ) {
@@ -1666,8 +1697,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
if (!ret) {
struct seq_file *m = file->private_data;
m->private = iter;
- } else
+ } else {
+ trace_parser_put(&iter->parser);
kfree(iter);
+ }
} else
file->private_data = iter;
mutex_unlock(&ftrace_regex_lock);
@@ -1700,64 +1733,10 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
return ret;
}
-enum {
- MATCH_FULL,
- MATCH_FRONT_ONLY,
- MATCH_MIDDLE_ONLY,
- MATCH_END_ONLY,
-};
-
-/*
- * (static function - no need for kernel doc)
- *
- * Pass in a buffer containing a glob and this function will
- * set search to point to the search part of the buffer and
- * return the type of search it is (see enum above).
- * This does modify buff.
- *
- * Returns enum type.
- * search returns the pointer to use for comparison.
- * not returns 1 if buff started with a '!'
- * 0 otherwise.
- */
-static int
-ftrace_setup_glob(char *buff, int len, char **search, int *not)
-{
- int type = MATCH_FULL;
- int i;
-
- if (buff[0] == '!') {
- *not = 1;
- buff++;
- len--;
- } else
- *not = 0;
-
- *search = buff;
-
- for (i = 0; i < len; i++) {
- if (buff[i] == '*') {
- if (!i) {
- *search = buff + 1;
- type = MATCH_END_ONLY;
- } else {
- if (type == MATCH_END_ONLY)
- type = MATCH_MIDDLE_ONLY;
- else
- type = MATCH_FRONT_ONLY;
- buff[i] = 0;
- break;
- }
- }
- }
-
- return type;
-}
-
static int ftrace_match(char *str, char *regex, int len, int type)
{
int matched = 0;
- char *ptr;
+ int slen;
switch (type) {
case MATCH_FULL:
@@ -1773,8 +1752,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
matched = 1;
break;
case MATCH_END_ONLY:
- ptr = strstr(str, regex);
- if (ptr && (ptr[len] == 0))
+ slen = strlen(str);
+ if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
matched = 1;
break;
}
@@ -1791,7 +1770,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
return ftrace_match(str, regex, len, type);
}
-static void ftrace_match_records(char *buff, int len, int enable)
+static int ftrace_match_records(char *buff, int len, int enable)
{
unsigned int search_len;
struct ftrace_page *pg;
@@ -1800,9 +1779,10 @@ static void ftrace_match_records(char *buff, int len, int enable)
char *search;
int type;
int not;
+ int found = 0;
flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
- type = ftrace_setup_glob(buff, len, &search, &not);
+ type = filter_parse_regex(buff, len, &search, &not);
search_len = strlen(search);
@@ -1817,6 +1797,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
rec->flags &= ~flag;
else
rec->flags |= flag;
+ found = 1;
}
/*
* Only enable filtering if we have a function that
@@ -1826,6 +1807,8 @@ static void ftrace_match_records(char *buff, int len, int enable)
ftrace_filtered = 1;
} while_for_each_ftrace_rec();
mutex_unlock(&ftrace_lock);
+
+ return found;
}
static int
@@ -1847,7 +1830,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
return 1;
}
-static void ftrace_match_module_records(char *buff, char *mod, int enable)
+static int ftrace_match_module_records(char *buff, char *mod, int enable)
{
unsigned search_len = 0;
struct ftrace_page *pg;
@@ -1856,6 +1839,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
char *search = buff;
unsigned long flag;
int not = 0;
+ int found = 0;
flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
@@ -1870,7 +1854,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
}
if (strlen(buff)) {
- type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
+ type = filter_parse_regex(buff, strlen(buff), &search, &not);
search_len = strlen(search);
}
@@ -1886,12 +1870,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
rec->flags &= ~flag;
else
rec->flags |= flag;
+ found = 1;
}
if (enable && (rec->flags & FTRACE_FL_FILTER))
ftrace_filtered = 1;
} while_for_each_ftrace_rec();
mutex_unlock(&ftrace_lock);
+
+ return found;
}
/*
@@ -1920,8 +1907,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
if (!strlen(mod))
return -EINVAL;
- ftrace_match_module_records(func, mod, enable);
- return 0;
+ if (ftrace_match_module_records(func, mod, enable))
+ return 0;
+ return -EINVAL;
}
static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1942,7 +1930,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
struct hlist_head *hhd;
struct hlist_node *n;
unsigned long key;
- int resched;
key = hash_long(ip, FTRACE_HASH_BITS);
@@ -1956,12 +1943,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
* period. This syncs the hash iteration and freeing of items
* on the hash. rcu_read_lock is too dangerous here.
*/
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
hlist_for_each_entry_rcu(entry, n, hhd, node) {
if (entry->ip == ip)
entry->ops->func(ip, parent_ip, &entry->data);
}
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_probe_ops __read_mostly =
@@ -2035,7 +2022,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
int count = 0;
char *search;
- type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+ type = filter_parse_regex(glob, strlen(glob), &search, &not);
len = strlen(search);
/* we do not support '!' for function probes */
@@ -2107,12 +2094,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
int i, len = 0;
char *search;
- if (glob && (strcmp(glob, "*") || !strlen(glob)))
+ if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
glob = NULL;
- else {
+ else if (glob) {
int not;
- type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+ type = filter_parse_regex(glob, strlen(glob), &search, &not);
len = strlen(search);
/* we do not support '!' for function probes */
@@ -2218,8 +2205,9 @@ static int ftrace_process_regex(char *buff, int len, int enable)
func = strsep(&next, ":");
if (!next) {
- ftrace_match_records(func, len, enable);
- return 0;
+ if (ftrace_match_records(func, len, enable))
+ return 0;
+ return ret;
}
/* command found */
@@ -2244,11 +2232,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos, int enable)
{
struct ftrace_iterator *iter;
- char ch;
- size_t read = 0;
- ssize_t ret;
+ struct trace_parser *parser;
+ ssize_t ret, read;
- if (!cnt || cnt < 0)
+ if (!cnt)
return 0;
mutex_lock(&ftrace_regex_lock);
@@ -2259,66 +2246,20 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
} else
iter = file->private_data;
- if (!*ppos) {
- iter->flags &= ~FTRACE_ITER_CONT;
- iter->buffer_idx = 0;
- }
-
- ret = get_user(ch, ubuf++);
- if (ret)
- goto out;
- read++;
- cnt--;
-
- if (!(iter->flags & ~FTRACE_ITER_CONT)) {
- /* skip white space */
- while (cnt && isspace(ch)) {
- ret = get_user(ch, ubuf++);
- if (ret)
- goto out;
- read++;
- cnt--;
- }
+ parser = &iter->parser;
+ read = trace_get_user(parser, ubuf, cnt, ppos);
- if (isspace(ch)) {
- file->f_pos += read;
- ret = read;
- goto out;
- }
-
- iter->buffer_idx = 0;
- }
-
- while (cnt && !isspace(ch)) {
- if (iter->buffer_idx < FTRACE_BUFF_MAX)
- iter->buffer[iter->buffer_idx++] = ch;
- else {
- ret = -EINVAL;
- goto out;
- }
- ret = get_user(ch, ubuf++);
+ if (read >= 0 && trace_parser_loaded(parser) &&
+ !trace_parser_cont(parser)) {
+ ret = ftrace_process_regex(parser->buffer,
+ parser->idx, enable);
+ trace_parser_clear(parser);
if (ret)
- goto out;
- read++;
- cnt--;
+ goto out_unlock;
}
- if (isspace(ch)) {
- iter->filtered++;
- iter->buffer[iter->buffer_idx] = 0;
- ret = ftrace_process_regex(iter->buffer,
- iter->buffer_idx, enable);
- if (ret)
- goto out;
- iter->buffer_idx = 0;
- } else
- iter->flags |= FTRACE_ITER_CONT;
-
-
- file->f_pos += read;
-
ret = read;
- out:
+out_unlock:
mutex_unlock(&ftrace_regex_lock);
return ret;
@@ -2402,6 +2343,34 @@ static int __init set_ftrace_filter(char *str)
}
__setup("ftrace_filter=", set_ftrace_filter);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
+
+static int __init set_graph_function(char *str)
+{
+ strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_graph_filter=", set_graph_function);
+
+static void __init set_ftrace_early_graph(char *buf)
+{
+ int ret;
+ char *func;
+
+ while (buf) {
+ func = strsep(&buf, ",");
+ /* we allow only one expression at a time */
+ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
+ func);
+ if (ret)
+ printk(KERN_DEBUG "ftrace: function %s not "
+ "traceable\n", func);
+ }
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
static void __init set_ftrace_early_filter(char *buf, int enable)
{
char *func;
@@ -2418,6 +2387,10 @@ static void __init set_ftrace_early_filters(void)
set_ftrace_early_filter(ftrace_filter_buf, 1);
if (ftrace_notrace_buf[0])
set_ftrace_early_filter(ftrace_notrace_buf, 0);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (ftrace_graph_buf[0])
+ set_ftrace_early_graph(ftrace_graph_buf);
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
}
static int
@@ -2425,6 +2398,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
{
struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_iterator *iter;
+ struct trace_parser *parser;
mutex_lock(&ftrace_regex_lock);
if (file->f_mode & FMODE_READ) {
@@ -2434,10 +2408,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
} else
iter = file->private_data;
- if (iter->buffer_idx) {
- iter->filtered++;
- iter->buffer[iter->buffer_idx] = 0;
- ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
+ parser = &iter->parser;
+ if (trace_parser_loaded(parser)) {
+ parser->buffer[parser->idx] = 0;
+ ftrace_match_records(parser->buffer, parser->idx, enable);
}
mutex_lock(&ftrace_lock);
@@ -2445,7 +2419,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
ftrace_run_update_code(FTRACE_ENABLE_CALLS);
mutex_unlock(&ftrace_lock);
+ trace_parser_put(parser);
kfree(iter);
+
mutex_unlock(&ftrace_regex_lock);
return 0;
}
@@ -2466,14 +2442,14 @@ static const struct file_operations ftrace_avail_fops = {
.open = ftrace_avail_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = ftrace_avail_release,
+ .release = seq_release_private,
};
static const struct file_operations ftrace_failures_fops = {
.open = ftrace_failures_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = ftrace_avail_release,
+ .release = seq_release_private,
};
static const struct file_operations ftrace_filter_fops = {
@@ -2497,35 +2473,33 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
int ftrace_graph_count;
+int ftrace_graph_filter_enabled;
unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
static void *
-g_next(struct seq_file *m, void *v, loff_t *pos)
+__g_next(struct seq_file *m, loff_t *pos)
{
- unsigned long *array = m->private;
- int index = *pos;
-
- (*pos)++;
-
- if (index >= ftrace_graph_count)
+ if (*pos >= ftrace_graph_count)
return NULL;
+ return &ftrace_graph_funcs[*pos];
+}
- return &array[index];
+static void *
+g_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return __g_next(m, pos);
}
static void *g_start(struct seq_file *m, loff_t *pos)
{
- void *p = NULL;
-
mutex_lock(&graph_lock);
/* Nothing, tell g_show to print all functions are enabled */
- if (!ftrace_graph_count && !*pos)
+ if (!ftrace_graph_filter_enabled && !*pos)
return (void *)1;
- p = g_next(m, p, pos);
-
- return p;
+ return __g_next(m, pos);
}
static void g_stop(struct seq_file *m, void *p)
@@ -2536,7 +2510,6 @@ static void g_stop(struct seq_file *m, void *p)
static int g_show(struct seq_file *m, void *v)
{
unsigned long *ptr = v;
- char str[KSYM_SYMBOL_LEN];
if (!ptr)
return 0;
@@ -2546,14 +2519,12 @@ static int g_show(struct seq_file *m, void *v)
return 0;
}
- kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
-
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%ps\n", (void *)*ptr);
return 0;
}
-static struct seq_operations ftrace_graph_seq_ops = {
+static const struct seq_operations ftrace_graph_seq_ops = {
.start = g_start,
.next = g_next,
.stop = g_stop,
@@ -2570,31 +2541,34 @@ ftrace_graph_open(struct inode *inode, struct file *file)
mutex_lock(&graph_lock);
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND)) {
+ (file->f_flags & O_TRUNC)) {
+ ftrace_graph_filter_enabled = 0;
ftrace_graph_count = 0;
memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
}
+ mutex_unlock(&graph_lock);
- if (file->f_mode & FMODE_READ) {
+ if (file->f_mode & FMODE_READ)
ret = seq_open(file, &ftrace_graph_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = ftrace_graph_funcs;
- }
- } else
- file->private_data = ftrace_graph_funcs;
- mutex_unlock(&graph_lock);
return ret;
}
static int
+ftrace_graph_release(struct inode *inode, struct file *file)
+{
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+ return 0;
+}
+
+static int
ftrace_set_func(unsigned long *array, int *idx, char *buffer)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
int search_len;
- int found = 0;
+ int fail = 1;
int type, not;
char *search;
bool exists;
@@ -2604,122 +2578,99 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
return -ENODEV;
/* decode regex */
- type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
- if (not)
- return -EINVAL;
+ type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
+ if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
+ return -EBUSY;
search_len = strlen(search);
mutex_lock(&ftrace_lock);
do_for_each_ftrace_rec(pg, rec) {
- if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
- break;
-
if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
continue;
if (ftrace_match_record(rec, search, search_len, type)) {
- /* ensure it is not already in the array */
+ /* if it is in the array */
exists = false;
- for (i = 0; i < *idx; i++)
+ for (i = 0; i < *idx; i++) {
if (array[i] == rec->ip) {
exists = true;
break;
}
- if (!exists) {
- array[(*idx)++] = rec->ip;
- found = 1;
+ }
+
+ if (!not) {
+ fail = 0;
+ if (!exists) {
+ array[(*idx)++] = rec->ip;
+ if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
+ goto out;
+ }
+ } else {
+ if (exists) {
+ array[i] = array[--(*idx)];
+ array[*idx] = 0;
+ fail = 0;
+ }
}
}
} while_for_each_ftrace_rec();
-
+out:
mutex_unlock(&ftrace_lock);
- return found ? 0 : -EINVAL;
+ if (fail)
+ return -EINVAL;
+
+ ftrace_graph_filter_enabled = 1;
+ return 0;
}
static ssize_t
ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- unsigned char buffer[FTRACE_BUFF_MAX+1];
- unsigned long *array;
- size_t read = 0;
- ssize_t ret;
- int index = 0;
- char ch;
+ struct trace_parser parser;
+ ssize_t read, ret;
- if (!cnt || cnt < 0)
+ if (!cnt)
return 0;
mutex_lock(&graph_lock);
- if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
- ret = -EBUSY;
- goto out;
+ if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- if (file->f_mode & FMODE_READ) {
- struct seq_file *m = file->private_data;
- array = m->private;
- } else
- array = file->private_data;
+ read = trace_get_user(&parser, ubuf, cnt, ppos);
- ret = get_user(ch, ubuf++);
- if (ret)
- goto out;
- read++;
- cnt--;
+ if (read >= 0 && trace_parser_loaded((&parser))) {
+ parser.buffer[parser.idx] = 0;
- /* skip white space */
- while (cnt && isspace(ch)) {
- ret = get_user(ch, ubuf++);
+ /* we allow only one expression at a time */
+ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
+ parser.buffer);
if (ret)
- goto out;
- read++;
- cnt--;
- }
-
- if (isspace(ch)) {
- *ppos += read;
- ret = read;
- goto out;
- }
-
- while (cnt && !isspace(ch)) {
- if (index < FTRACE_BUFF_MAX)
- buffer[index++] = ch;
- else {
- ret = -EINVAL;
- goto out;
- }
- ret = get_user(ch, ubuf++);
- if (ret)
- goto out;
- read++;
- cnt--;
+ goto out_free;
}
- buffer[index] = 0;
-
- /* we allow only one expression at a time */
- ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
- if (ret)
- goto out;
-
- file->f_pos += read;
ret = read;
- out:
+
+out_free:
+ trace_parser_put(&parser);
+out_unlock:
mutex_unlock(&graph_lock);
return ret;
}
static const struct file_operations ftrace_graph_fops = {
- .open = ftrace_graph_open,
- .read = seq_read,
- .write = ftrace_graph_write,
+ .open = ftrace_graph_open,
+ .read = seq_read,
+ .write = ftrace_graph_write,
+ .release = ftrace_graph_release,
+ .llseek = seq_lseek,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -2747,7 +2698,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
return 0;
}
-static int ftrace_convert_nops(struct module *mod,
+static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
@@ -2780,19 +2731,17 @@ static int ftrace_convert_nops(struct module *mod,
}
#ifdef CONFIG_MODULES
-void ftrace_release(void *start, void *end)
+void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
- unsigned long s = (unsigned long)start;
- unsigned long e = (unsigned long)end;
- if (ftrace_disabled || !start || start == end)
+ if (ftrace_disabled)
return;
mutex_lock(&ftrace_lock);
do_for_each_ftrace_rec(pg, rec) {
- if ((rec->ip >= s) && (rec->ip < e)) {
+ if (within_module_core(rec->ip, mod)) {
/*
* rec->ip is changed in ftrace_free_rec()
* It should not between s and e if record was freed.
@@ -2809,7 +2758,7 @@ static void ftrace_init_module(struct module *mod,
{
if (ftrace_disabled || start == end)
return;
- ftrace_convert_nops(mod, start, end);
+ ftrace_process_locs(mod, start, end);
}
static int ftrace_module_notify(struct notifier_block *self,
@@ -2824,9 +2773,7 @@ static int ftrace_module_notify(struct notifier_block *self,
mod->num_ftrace_callsites);
break;
case MODULE_STATE_GOING:
- ftrace_release(mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
+ ftrace_release_mod(mod);
break;
}
@@ -2872,7 +2819,7 @@ void __init ftrace_init(void)
last_ftrace_enabled = ftrace_enabled = 1;
- ret = ftrace_convert_nops(NULL,
+ ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
@@ -2905,23 +2852,6 @@ static inline void ftrace_startup_enable(int command) { }
# define ftrace_shutdown_sysctl() do { } while (0)
#endif /* CONFIG_DYNAMIC_FTRACE */
-static ssize_t
-ftrace_pid_read(struct file *file, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[64];
- int r;
-
- if (ftrace_pid_trace == ftrace_swapper_pid)
- r = sprintf(buf, "swapper tasks\n");
- else if (ftrace_pid_trace)
- r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
- else
- r = sprintf(buf, "no pid\n");
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
static void clear_ftrace_swapper(void)
{
struct task_struct *p;
@@ -2972,14 +2902,12 @@ static void set_ftrace_pid(struct pid *pid)
rcu_read_unlock();
}
-static void clear_ftrace_pid_task(struct pid **pid)
+static void clear_ftrace_pid_task(struct pid *pid)
{
- if (*pid == ftrace_swapper_pid)
+ if (pid == ftrace_swapper_pid)
clear_ftrace_swapper();
else
- clear_ftrace_pid(*pid);
-
- *pid = NULL;
+ clear_ftrace_pid(pid);
}
static void set_ftrace_pid_task(struct pid *pid)
@@ -2990,74 +2918,184 @@ static void set_ftrace_pid_task(struct pid *pid)
set_ftrace_pid(pid);
}
-static ssize_t
-ftrace_pid_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int ftrace_pid_add(int p)
{
struct pid *pid;
- char buf[64];
- long val;
- int ret;
+ struct ftrace_pid *fpid;
+ int ret = -EINVAL;
- if (cnt >= sizeof(buf))
- return -EINVAL;
+ mutex_lock(&ftrace_lock);
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
+ if (!p)
+ pid = ftrace_swapper_pid;
+ else
+ pid = find_get_pid(p);
- buf[cnt] = 0;
+ if (!pid)
+ goto out;
- ret = strict_strtol(buf, 10, &val);
- if (ret < 0)
- return ret;
+ ret = 0;
- mutex_lock(&ftrace_lock);
- if (val < 0) {
- /* disable pid tracing */
- if (!ftrace_pid_trace)
- goto out;
+ list_for_each_entry(fpid, &ftrace_pids, list)
+ if (fpid->pid == pid)
+ goto out_put;
- clear_ftrace_pid_task(&ftrace_pid_trace);
+ ret = -ENOMEM;
- } else {
- /* swapper task is special */
- if (!val) {
- pid = ftrace_swapper_pid;
- if (pid == ftrace_pid_trace)
- goto out;
- } else {
- pid = find_get_pid(val);
+ fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
+ if (!fpid)
+ goto out_put;
- if (pid == ftrace_pid_trace) {
- put_pid(pid);
- goto out;
- }
- }
+ list_add(&fpid->list, &ftrace_pids);
+ fpid->pid = pid;
- if (ftrace_pid_trace)
- clear_ftrace_pid_task(&ftrace_pid_trace);
+ set_ftrace_pid_task(pid);
- if (!pid)
- goto out;
+ ftrace_update_pid_func();
+ ftrace_startup_enable(0);
+
+ mutex_unlock(&ftrace_lock);
+ return 0;
+
+out_put:
+ if (pid != ftrace_swapper_pid)
+ put_pid(pid);
+
+out:
+ mutex_unlock(&ftrace_lock);
+ return ret;
+}
+
+static void ftrace_pid_reset(void)
+{
+ struct ftrace_pid *fpid, *safe;
+
+ mutex_lock(&ftrace_lock);
+ list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
+ struct pid *pid = fpid->pid;
- ftrace_pid_trace = pid;
+ clear_ftrace_pid_task(pid);
- set_ftrace_pid_task(ftrace_pid_trace);
+ list_del(&fpid->list);
+ kfree(fpid);
}
- /* update the function call */
ftrace_update_pid_func();
ftrace_startup_enable(0);
- out:
mutex_unlock(&ftrace_lock);
+}
- return cnt;
+static void *fpid_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&ftrace_lock);
+
+ if (list_empty(&ftrace_pids) && (!*pos))
+ return (void *) 1;
+
+ return seq_list_start(&ftrace_pids, *pos);
+}
+
+static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ if (v == (void *)1)
+ return NULL;
+
+ return seq_list_next(v, &ftrace_pids, pos);
+}
+
+static void fpid_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&ftrace_lock);
+}
+
+static int fpid_show(struct seq_file *m, void *v)
+{
+ const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
+
+ if (v == (void *)1) {
+ seq_printf(m, "no pid\n");
+ return 0;
+ }
+
+ if (fpid->pid == ftrace_swapper_pid)
+ seq_printf(m, "swapper tasks\n");
+ else
+ seq_printf(m, "%u\n", pid_vnr(fpid->pid));
+
+ return 0;
+}
+
+static const struct seq_operations ftrace_pid_sops = {
+ .start = fpid_start,
+ .next = fpid_next,
+ .stop = fpid_stop,
+ .show = fpid_show,
+};
+
+static int
+ftrace_pid_open(struct inode *inode, struct file *file)
+{
+ int ret = 0;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_pid_reset();
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_open(file, &ftrace_pid_sops);
+
+ return ret;
+}
+
+static ssize_t
+ftrace_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64], *tmp;
+ long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ /*
+ * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
+ * to clean the filter quietly.
+ */
+ tmp = strstrip(buf);
+ if (strlen(tmp) == 0)
+ return 1;
+
+ ret = strict_strtol(tmp, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ ret = ftrace_pid_add(val);
+
+ return ret ? ret : cnt;
+}
+
+static int
+ftrace_pid_release(struct inode *inode, struct file *file)
+{
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+
+ return 0;
}
static const struct file_operations ftrace_pid_fops = {
- .read = ftrace_pid_read,
- .write = ftrace_pid_write,
+ .open = ftrace_pid_open,
+ .write = ftrace_pid_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = ftrace_pid_release,
};
static __init int ftrace_init_debugfs(void)
@@ -3140,7 +3178,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
int
ftrace_enable_sysctl(struct ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
@@ -3150,12 +3188,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
mutex_lock(&ftrace_lock);
- ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+ if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
goto out;
- last_ftrace_enabled = ftrace_enabled;
+ last_ftrace_enabled = !!ftrace_enabled;
if (ftrace_enabled) {
@@ -3243,8 +3281,8 @@ free:
}
static void
-ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
- struct task_struct *next)
+ftrace_graph_probe_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
{
unsigned long long timestamp;
int index;
@@ -3298,7 +3336,7 @@ static int start_graph_tracing(void)
} while (ret == -EAGAIN);
if (!ret) {
- ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+ ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
if (ret)
pr_info("ftrace_graph: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
@@ -3370,11 +3408,11 @@ void unregister_ftrace_graph(void)
goto out;
ftrace_graph_active--;
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
out:
mutex_unlock(&ftrace_lock);
@@ -3385,6 +3423,7 @@ void ftrace_graph_init_task(struct task_struct *t)
{
/* Make sure we do not use the parent ret_stack */
t->ret_stack = NULL;
+ t->curr_ret_stack = -1;
if (ftrace_graph_active) {
struct ftrace_ret_stack *ret_stack;
@@ -3394,7 +3433,6 @@ void ftrace_graph_init_task(struct task_struct *t)
GFP_KERNEL);
if (!ret_stack)
return;
- t->curr_ret_stack = -1;
atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->ftrace_timestamp = 0;
@@ -3420,4 +3458,3 @@ void ftrace_graph_stop(void)
ftrace_stop();
}
#endif
-
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index 1edaa9516e8..00000000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * Memory allocator tracing
- *
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- */
-
-#include <linux/tracepoint.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-#include <linux/kmemtrace.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-/* Select an alternative, minimalistic output than the original one */
-#define TRACE_KMEM_OPT_MINIMAL 0x1
-
-static struct tracer_opt kmem_opts[] = {
- /* Default disable the minimalistic output */
- { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
- { }
-};
-
-static struct tracer_flags kmem_tracer_flags = {
- .val = 0,
- .opts = kmem_opts
-};
-
-static struct trace_array *kmemtrace_array;
-
-/* Trace allocations */
-static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
- unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- struct ftrace_event_call *call = &event_kmem_alloc;
- struct trace_array *tr = kmemtrace_array;
- struct kmemtrace_alloc_entry *entry;
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
- if (!event)
- return;
-
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
-
- entry->ent.type = TRACE_KMEM_ALLOC;
- entry->type_id = type_id;
- entry->call_site = call_site;
- entry->ptr = ptr;
- entry->bytes_req = bytes_req;
- entry->bytes_alloc = bytes_alloc;
- entry->gfp_flags = gfp_flags;
- entry->node = node;
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-
- trace_wake_up();
-}
-
-static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
- unsigned long call_site,
- const void *ptr)
-{
- struct ftrace_event_call *call = &event_kmem_free;
- struct trace_array *tr = kmemtrace_array;
- struct kmemtrace_free_entry *entry;
- struct ring_buffer_event *event;
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
-
- entry->ent.type = TRACE_KMEM_FREE;
- entry->type_id = type_id;
- entry->call_site = call_site;
- entry->ptr = ptr;
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-
- trace_wake_up();
-}
-
-static void kmemtrace_kmalloc(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmalloc_node(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
- const void *ptr,
- size_t bytes_req,
- size_t bytes_alloc,
- gfp_t gfp_flags,
- int node)
-{
- kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
- bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
-{
- kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
-}
-
-static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
-{
- kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
-}
-
-static int kmemtrace_start_probes(void)
-{
- int err;
-
- err = register_trace_kmalloc(kmemtrace_kmalloc);
- if (err)
- return err;
- err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
- if (err)
- return err;
- err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
- if (err)
- return err;
- err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
- if (err)
- return err;
- err = register_trace_kfree(kmemtrace_kfree);
- if (err)
- return err;
- err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
-
- return err;
-}
-
-static void kmemtrace_stop_probes(void)
-{
- unregister_trace_kmalloc(kmemtrace_kmalloc);
- unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
- unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
- unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
- unregister_trace_kfree(kmemtrace_kfree);
- unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
-}
-
-static int kmem_trace_init(struct trace_array *tr)
-{
- int cpu;
- kmemtrace_array = tr;
-
- for_each_cpu(cpu, cpu_possible_mask)
- tracing_reset(tr, cpu);
-
- kmemtrace_start_probes();
-
- return 0;
-}
-
-static void kmem_trace_reset(struct trace_array *tr)
-{
- kmemtrace_stop_probes();
-}
-
-static void kmemtrace_headers(struct seq_file *s)
-{
- /* Don't need headers for the original kmemtrace output */
- if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
- return;
-
- seq_printf(s, "#\n");
- seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
- " POINTER NODE CALLER\n");
- seq_printf(s, "# FREE | | | | "
- " | | | |\n");
- seq_printf(s, "# |\n\n");
-}
-
-/*
- * The following functions give the original output from kmemtrace,
- * plus the origin CPU, since reordering occurs in-kernel now.
- */
-
-#define KMEMTRACE_USER_ALLOC 0
-#define KMEMTRACE_USER_FREE 1
-
-struct kmemtrace_user_event {
- u8 event_id;
- u8 type_id;
- u16 event_size;
- u32 cpu;
- u64 timestamp;
- unsigned long call_site;
- unsigned long ptr;
-};
-
-struct kmemtrace_user_event_alloc {
- size_t bytes_req;
- size_t bytes_alloc;
- unsigned gfp_flags;
- int node;
-};
-
-static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter,
- struct kmemtrace_alloc_entry *entry)
-{
- struct kmemtrace_user_event_alloc *ev_alloc;
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_user_event *ev;
-
- ev = trace_seq_reserve(s, sizeof(*ev));
- if (!ev)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev->event_id = KMEMTRACE_USER_ALLOC;
- ev->type_id = entry->type_id;
- ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
- ev->cpu = iter->cpu;
- ev->timestamp = iter->ts;
- ev->call_site = entry->call_site;
- ev->ptr = (unsigned long)entry->ptr;
-
- ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
- if (!ev_alloc)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev_alloc->bytes_req = entry->bytes_req;
- ev_alloc->bytes_alloc = entry->bytes_alloc;
- ev_alloc->gfp_flags = entry->gfp_flags;
- ev_alloc->node = entry->node;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter,
- struct kmemtrace_free_entry *entry)
-{
- struct trace_seq *s = &iter->seq;
- struct kmemtrace_user_event *ev;
-
- ev = trace_seq_reserve(s, sizeof(*ev));
- if (!ev)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ev->event_id = KMEMTRACE_USER_FREE;
- ev->type_id = entry->type_id;
- ev->event_size = sizeof(*ev);
- ev->cpu = iter->cpu;
- ev->timestamp = iter->ts;
- ev->call_site = entry->call_site;
- ev->ptr = (unsigned long)entry->ptr;
-
- return TRACE_TYPE_HANDLED;
-}
-
-/* The two other following provide a more minimalistic output */
-static enum print_line_t
-kmemtrace_print_alloc_compress(struct trace_iterator *iter,
- struct kmemtrace_alloc_entry *entry)
-{
- struct trace_seq *s = &iter->seq;
- int ret;
-
- /* Alloc entry */
- ret = trace_seq_printf(s, " + ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Type */
- switch (entry->type_id) {
- case KMEMTRACE_TYPE_KMALLOC:
- ret = trace_seq_printf(s, "K ");
- break;
- case KMEMTRACE_TYPE_CACHE:
- ret = trace_seq_printf(s, "C ");
- break;
- case KMEMTRACE_TYPE_PAGES:
- ret = trace_seq_printf(s, "P ");
- break;
- default:
- ret = trace_seq_printf(s, "? ");
- }
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Requested */
- ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Allocated */
- ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Flags
- * TODO: would be better to see the name of the GFP flag names
- */
- ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Pointer to allocated */
- ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Node */
- ret = trace_seq_printf(s, "%4d ", entry->node);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Call site */
- ret = seq_print_ip_sym(s, entry->call_site, 0);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- if (!trace_seq_printf(s, "\n"))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_compress(struct trace_iterator *iter,
- struct kmemtrace_free_entry *entry)
-{
- struct trace_seq *s = &iter->seq;
- int ret;
-
- /* Free entry */
- ret = trace_seq_printf(s, " - ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Type */
- switch (entry->type_id) {
- case KMEMTRACE_TYPE_KMALLOC:
- ret = trace_seq_printf(s, "K ");
- break;
- case KMEMTRACE_TYPE_CACHE:
- ret = trace_seq_printf(s, "C ");
- break;
- case KMEMTRACE_TYPE_PAGES:
- ret = trace_seq_printf(s, "P ");
- break;
- default:
- ret = trace_seq_printf(s, "? ");
- }
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Skip requested/allocated/flags */
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Pointer to allocated */
- ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Skip node */
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Call site */
- ret = seq_print_ip_sym(s, entry->call_site, 0);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- if (!trace_seq_printf(s, "\n"))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
-
- switch (entry->type) {
- case TRACE_KMEM_ALLOC: {
- struct kmemtrace_alloc_entry *field;
-
- trace_assign_type(field, entry);
- if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
- return kmemtrace_print_alloc_compress(iter, field);
- else
- return kmemtrace_print_alloc_user(iter, field);
- }
-
- case TRACE_KMEM_FREE: {
- struct kmemtrace_free_entry *field;
-
- trace_assign_type(field, entry);
- if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
- return kmemtrace_print_free_compress(iter, field);
- else
- return kmemtrace_print_free_user(iter, field);
- }
-
- default:
- return TRACE_TYPE_UNHANDLED;
- }
-}
-
-static struct tracer kmem_tracer __read_mostly = {
- .name = "kmemtrace",
- .init = kmem_trace_init,
- .reset = kmem_trace_reset,
- .print_line = kmemtrace_print_line,
- .print_header = kmemtrace_headers,
- .flags = &kmem_tracer_flags
-};
-
-void kmemtrace_init(void)
-{
- /* earliest opportunity to start kmem tracing */
-}
-
-static int __init init_kmem_tracer(void)
-{
- return register_tracer(&kmem_tracer);
-}
-device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 00000000000..f55fcf61b22
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power.h>
+
+#ifdef EVENT_POWER_TRACING_DEPRECATED
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+#endif
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
+
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 04dac263825..bd1c35a4fbc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/fs.h>
+#include <asm/local.h>
#include "trace.h"
/*
@@ -201,13 +203,19 @@ int tracing_is_on(void)
}
EXPORT_SYMBOL_GPL(tracing_is_on);
-#include "trace.h"
-
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
+#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -216,19 +224,17 @@ enum {
RB_LEN_TIME_STAMP = 16,
};
-static inline int rb_null_event(struct ring_buffer_event *event)
-{
- return event->type_len == RINGBUF_TYPE_PADDING
- && event->time_delta == 0;
-}
+#define skip_time_extend(event) \
+ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
-static inline int rb_discarded_event(struct ring_buffer_event *event)
+static inline int rb_null_event(struct ring_buffer_event *event)
{
- return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
+ return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
}
static void rb_event_set_padding(struct ring_buffer_event *event)
{
+ /* padding has a NULL time_delta */
event->type_len = RINGBUF_TYPE_PADDING;
event->time_delta = 0;
}
@@ -245,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
return length + RB_EVNT_HDR_SIZE;
}
-/* inline for ring buffer fast paths */
-static unsigned
+/*
+ * Return the length of the given event. Will return
+ * the length of the time extend if the event is a
+ * time extend.
+ */
+static inline unsigned
rb_event_length(struct ring_buffer_event *event)
{
switch (event->type_len) {
@@ -271,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
return 0;
}
+/*
+ * Return total length of time extend and data,
+ * or just the event length for all other events.
+ */
+static inline unsigned
+rb_event_ts_length(struct ring_buffer_event *event)
+{
+ unsigned len = 0;
+
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ /* time extends include the data event after it */
+ len = RB_LEN_TIME_EXTEND;
+ event = skip_time_extend(event);
+ }
+ return len + rb_event_length(event);
+}
+
/**
* ring_buffer_event_length - return the length of the event
* @event: the event to get the length of
+ *
+ * Returns the size of the data load of a data event.
+ * If the event is something other than a data event, it
+ * returns the size of the event itself. With the exception
+ * of a TIME EXTEND, where it still returns the size of the
+ * data load of the data event after it.
*/
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
- unsigned length = rb_event_length(event);
+ unsigned length;
+
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+
+ length = rb_event_length(event);
if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
return length;
length -= RB_EVNT_HDR_SIZE;
@@ -291,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static void *
rb_event_data(struct ring_buffer_event *event)
{
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
if (event->type_len)
@@ -316,20 +356,49 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
+/* Flag when events were overwritten */
+#define RB_MISSED_EVENTS (1 << 31)
+/* Missed count stored at end */
+#define RB_MISSED_STORED (1 << 30)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
unsigned char data[]; /* data of buffer page */
};
+/*
+ * Note, the buffer_page list must be first. The buffer pages
+ * are allocated in cache lines, which means that each buffer
+ * page will be at the beginning of a cache line, and thus
+ * the least significant bits will be zero. We use this to
+ * add flags in the list struct pointers, to make the ring buffer
+ * lockless.
+ */
struct buffer_page {
struct list_head list; /* list of buffer pages */
local_t write; /* index for next write */
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
+ unsigned long real_end; /* real end of data */
struct buffer_data_page *page; /* Actual data page */
};
+/*
+ * The buffer page counters, write and entries, must be reset
+ * atomically when crossing page boundaries. To synchronize this
+ * update, two counters are inserted into the number. One is
+ * the actual counter for the write position or count on the page.
+ *
+ * The other is a counter of updaters. Before an update happens
+ * the update partition of the counter is incremented. This will
+ * allow the updater to update the counter atomically.
+ *
+ * The counter is 20 bits, and the state data is 12.
+ */
+#define RB_WRITE_MASK 0xfffff
+#define RB_WRITE_INTCNT (1 << 20)
+
static void rb_init_page(struct buffer_data_page *bpage)
{
local_set(&bpage->commit, 0);
@@ -372,27 +441,33 @@ static inline int test_time_stamp(u64 delta)
/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-/* Max number of timestamps that can fit on a page */
-#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
-
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
int ret;
ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\n",
- (unsigned int)sizeof(field.time_stamp));
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\n",
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
+
+ ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit));
+ 1,
+ (unsigned int)is_signed_type(long));
ret = trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\n",
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE);
+ (unsigned int)BUF_PAGE_SIZE,
+ (unsigned int)is_signed_type(char));
return ret;
}
@@ -402,25 +477,26 @@ int ring_buffer_print_page_header(struct trace_seq *s)
*/
struct ring_buffer_per_cpu {
int cpu;
+ atomic_t record_disabled;
struct ring_buffer *buffer;
- spinlock_t reader_lock; /* serialize readers */
- raw_spinlock_t lock;
+ spinlock_t reader_lock; /* serialize readers */
+ arch_spinlock_t lock;
struct lock_class_key lock_key;
- struct list_head pages;
+ struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
struct buffer_page *commit_page; /* committed pages */
struct buffer_page *reader_page;
- unsigned long nmi_dropped;
- unsigned long commit_overrun;
- unsigned long overrun;
- unsigned long read;
+ unsigned long lost_events;
+ unsigned long last_overrun;
+ local_t commit_overrun;
+ local_t overrun;
local_t entries;
local_t committing;
local_t commits;
+ unsigned long read;
u64 write_stamp;
u64 read_stamp;
- atomic_t record_disabled;
};
struct ring_buffer {
@@ -446,24 +522,31 @@ struct ring_buffer_iter {
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long head;
struct buffer_page *head_page;
+ struct buffer_page *cache_reader_page;
+ unsigned long cache_read;
u64 read_stamp;
};
/* buffer may be either ring_buffer or ring_buffer_per_cpu */
-#define RB_WARN_ON(buffer, cond) \
- ({ \
- int _____ret = unlikely(cond); \
- if (_____ret) { \
- atomic_inc(&buffer->record_disabled); \
- WARN_ON(1); \
- } \
- _____ret; \
+#define RB_WARN_ON(b, cond) \
+ ({ \
+ int _____ret = unlikely(cond); \
+ if (_____ret) { \
+ if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
+ struct ring_buffer_per_cpu *__b = \
+ (void *)b; \
+ atomic_inc(&__b->buffer->record_disabled); \
+ } else \
+ atomic_inc(&b->record_disabled); \
+ WARN_ON(1); \
+ } \
+ _____ret; \
})
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0
-static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+static inline u64 rb_time_stamp(struct ring_buffer *buffer)
{
/* shift to debug/test normalization and TIME_EXTENTS */
return buffer->clock() << DEBUG_SHIFT;
@@ -474,7 +557,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
u64 time;
preempt_disable_notrace();
- time = rb_time_stamp(buffer, cpu);
+ time = rb_time_stamp(buffer);
preempt_enable_no_resched_notrace();
return time;
@@ -489,6 +572,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
+/*
+ * Making the ring buffer lockless makes things tricky.
+ * Although writes only happen on the CPU that they are on,
+ * and they only need to worry about interrupts. Reads can
+ * happen on any CPU.
+ *
+ * The reader page is always off the ring buffer, but when the
+ * reader finishes with a page, it needs to swap its page with
+ * a new one from the buffer. The reader needs to take from
+ * the head (writes go to the tail). But if a writer is in overwrite
+ * mode and wraps, it must push the head page forward.
+ *
+ * Here lies the problem.
+ *
+ * The reader must be careful to replace only the head page, and
+ * not another one. As described at the top of the file in the
+ * ASCII art, the reader sets its old page to point to the next
+ * page after head. It then sets the page after head to point to
+ * the old reader page. But if the writer moves the head page
+ * during this operation, the reader could end up with the tail.
+ *
+ * We use cmpxchg to help prevent this race. We also do something
+ * special with the page before head. We set the LSB to 1.
+ *
+ * When the writer must push the page forward, it will clear the
+ * bit that points to the head page, move the head, and then set
+ * the bit that points to the new head page.
+ *
+ * We also don't want an interrupt coming in and moving the head
+ * page on another writer. Thus we use the second LSB to catch
+ * that too. Thus:
+ *
+ * head->list->prev->next bit 1 bit 0
+ * ------- -------
+ * Normal page 0 0
+ * Points to head page 0 1
+ * New head page 1 0
+ *
+ * Note we can not trust the prev pointer of the head page, because:
+ *
+ * +----+ +-----+ +-----+
+ * | |------>| T |---X--->| N |
+ * | |<------| | | |
+ * +----+ +-----+ +-----+
+ * ^ ^ |
+ * | +-----+ | |
+ * +----------| R |----------+ |
+ * | |<-----------+
+ * +-----+
+ *
+ * Key: ---X--> HEAD flag set in pointer
+ * T Tail page
+ * R Reader page
+ * N Next page
+ *
+ * (see __rb_reserve_next() to see where this happens)
+ *
+ * What the above shows is that the reader just swapped out
+ * the reader page with a page in the buffer, but before it
+ * could make the new header point back to the new page added
+ * it was preempted by a writer. The writer moved forward onto
+ * the new page added by the reader and is about to move forward
+ * again.
+ *
+ * You can see, it is legitimate for the previous pointer of
+ * the head (or any page) not to point back to itself. But only
+ * temporarially.
+ */
+
+#define RB_PAGE_NORMAL 0UL
+#define RB_PAGE_HEAD 1UL
+#define RB_PAGE_UPDATE 2UL
+
+
+#define RB_FLAG_MASK 3UL
+
+/* PAGE_MOVED is not part of the mask */
+#define RB_PAGE_MOVED 4UL
+
+/*
+ * rb_list_head - remove any bit
+ */
+static struct list_head *rb_list_head(struct list_head *list)
+{
+ unsigned long val = (unsigned long)list;
+
+ return (struct list_head *)(val & ~RB_FLAG_MASK);
+}
+
+/*
+ * rb_is_head_page - test if the given page is the head page
+ *
+ * Because the reader may move the head_page pointer, we can
+ * not trust what the head page is (it may be pointing to
+ * the reader page). But if the next page is a header page,
+ * its flags will be non zero.
+ */
+static int inline
+rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *page, struct list_head *list)
+{
+ unsigned long val;
+
+ val = (unsigned long)list->next;
+
+ if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
+ return RB_PAGE_MOVED;
+
+ return val & RB_FLAG_MASK;
+}
+
+/*
+ * rb_is_reader_page
+ *
+ * The unique thing about the reader page, is that, if the
+ * writer is ever on it, the previous pointer never points
+ * back to the reader page.
+ */
+static int rb_is_reader_page(struct buffer_page *page)
+{
+ struct list_head *list = page->list.prev;
+
+ return rb_list_head(list->next) != &page->list;
+}
+
+/*
+ * rb_set_list_to_head - set a list_head to be pointing to head.
+ */
+static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ unsigned long *ptr;
+
+ ptr = (unsigned long *)&list->next;
+ *ptr |= RB_PAGE_HEAD;
+ *ptr &= ~RB_PAGE_UPDATE;
+}
+
+/*
+ * rb_head_page_activate - sets up head page
+ */
+static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *head;
+
+ head = cpu_buffer->head_page;
+ if (!head)
+ return;
+
+ /*
+ * Set the previous list pointer to have the HEAD flag.
+ */
+ rb_set_list_to_head(cpu_buffer, head->list.prev);
+}
+
+static void rb_list_head_clear(struct list_head *list)
+{
+ unsigned long *ptr = (unsigned long *)&list->next;
+
+ *ptr &= ~RB_FLAG_MASK;
+}
+
+/*
+ * rb_head_page_dactivate - clears head page ptr (for free list)
+ */
+static void
+rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *hd;
+
+ /* Go through the whole list and clear any pointers found. */
+ rb_list_head_clear(cpu_buffer->pages);
+
+ list_for_each(hd, cpu_buffer->pages)
+ rb_list_head_clear(hd);
+}
+
+static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag, int new_flag)
+{
+ struct list_head *list;
+ unsigned long val = (unsigned long)&head->list;
+ unsigned long ret;
+
+ list = &prev->list;
+
+ val &= ~RB_FLAG_MASK;
+
+ ret = cmpxchg((unsigned long *)&list->next,
+ val | old_flag, val | new_flag);
+
+ /* check if the reader took the page */
+ if ((ret & ~RB_FLAG_MASK) != val)
+ return RB_PAGE_MOVED;
+
+ return ret & RB_FLAG_MASK;
+}
+
+static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_UPDATE);
+}
+
+static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_HEAD);
+}
+
+static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_NORMAL);
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **bpage)
+{
+ struct list_head *p = rb_list_head((*bpage)->list.next);
+
+ *bpage = list_entry(p, struct buffer_page, list);
+}
+
+static struct buffer_page *
+rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *head;
+ struct buffer_page *page;
+ struct list_head *list;
+ int i;
+
+ if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
+ return NULL;
+
+ /* sanity check */
+ list = cpu_buffer->pages;
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
+ return NULL;
+
+ page = head = cpu_buffer->head_page;
+ /*
+ * It is possible that the writer moves the header behind
+ * where we started, and we miss in one loop.
+ * A second loop should grab the header, but we'll do
+ * three loops just because I'm paranoid.
+ */
+ for (i = 0; i < 3; i++) {
+ do {
+ if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+ cpu_buffer->head_page = page;
+ return page;
+ }
+ rb_inc_page(cpu_buffer, &page);
+ } while (page != head);
+ }
+
+ RB_WARN_ON(cpu_buffer, 1);
+
+ return NULL;
+}
+
+static int rb_head_page_replace(struct buffer_page *old,
+ struct buffer_page *new)
+{
+ unsigned long *ptr = (unsigned long *)&old->list.prev->next;
+ unsigned long val;
+ unsigned long ret;
+
+ val = *ptr & ~RB_FLAG_MASK;
+ val |= RB_PAGE_HEAD;
+
+ ret = cmpxchg(ptr, val, (unsigned long)&new->list);
+
+ return ret == val;
+}
+
+/*
+ * rb_tail_page_update - move the tail page forward
+ *
+ * Returns 1 if moved tail page, 0 if someone else did.
+ */
+static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ struct buffer_page *next_page)
+{
+ struct buffer_page *old_tail;
+ unsigned long old_entries;
+ unsigned long old_write;
+ int ret = 0;
+
+ /*
+ * The tail page now needs to be moved forward.
+ *
+ * We need to reset the tail page, but without messing
+ * with possible erasing of data brought in by interrupts
+ * that have moved the tail page and are currently on it.
+ *
+ * We add a counter to the write field to denote this.
+ */
+ old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
+ old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
+
+ /*
+ * Just make sure we have seen our old_write and synchronize
+ * with any interrupts that come in.
+ */
+ barrier();
+
+ /*
+ * If the tail page is still the same as what we think
+ * it is, then it is up to us to update the tail
+ * pointer.
+ */
+ if (tail_page == cpu_buffer->tail_page) {
+ /* Zero the write counter */
+ unsigned long val = old_write & ~RB_WRITE_MASK;
+ unsigned long eval = old_entries & ~RB_WRITE_MASK;
+
+ /*
+ * This will only succeed if an interrupt did
+ * not come in and change it. In which case, we
+ * do not want to modify it.
+ *
+ * We add (void) to let the compiler know that we do not care
+ * about the return value of these functions. We use the
+ * cmpxchg to only update if an interrupt did not already
+ * do it for us. If the cmpxchg fails, we don't care.
+ */
+ (void)local_cmpxchg(&next_page->write, old_write, val);
+ (void)local_cmpxchg(&next_page->entries, old_entries, eval);
+
+ /*
+ * No need to worry about races with clearing out the commit.
+ * it only can increment when a commit takes place. But that
+ * only happens in the outer most nested commit.
+ */
+ local_set(&next_page->page->commit, 0);
+
+ old_tail = cmpxchg(&cpu_buffer->tail_page,
+ tail_page, next_page);
+
+ if (old_tail == tail_page)
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ unsigned long val = (unsigned long)bpage;
+
+ if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * rb_check_list - make sure a pointer to a list has the last bits zero
+ */
+static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
+ return 1;
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
+ return 1;
+ return 0;
+}
+
/**
* check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +965,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
*/
static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = &cpu_buffer->pages;
+ struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ rb_head_page_deactivate(cpu_buffer);
+
if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
return -1;
if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
return -1;
+ if (rb_check_list(cpu_buffer, head))
+ return -1;
+
list_for_each_entry_safe(bpage, tmp, head, list) {
if (RB_WARN_ON(cpu_buffer,
bpage->list.next->prev != &bpage->list))
@@ -513,25 +985,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
bpage->list.prev->next != &bpage->list))
return -1;
+ if (rb_check_list(cpu_buffer, &bpage->list))
+ return -1;
}
+ rb_head_page_activate(cpu_buffer);
+
return 0;
}
static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
unsigned nr_pages)
{
- struct list_head *head = &cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
unsigned long addr;
LIST_HEAD(pages);
unsigned i;
+ WARN_ON(!nr_pages);
+
for (i = 0; i < nr_pages; i++) {
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
+
+ rb_check_bpage(cpu_buffer, bpage);
+
list_add(&bpage->list, &pages);
addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +1021,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_init_page(bpage->page);
}
- list_splice(&pages, head);
+ /*
+ * The ring buffer page list is a circular list that does not
+ * start and end with a list head. All page list items point to
+ * other pages.
+ */
+ cpu_buffer->pages = pages.next;
+ list_del(&pages);
rb_check_pages(cpu_buffer);
@@ -572,14 +1058,15 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
cpu_buffer->buffer = buffer;
spin_lock_init(&cpu_buffer->reader_lock);
lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
- cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
- INIT_LIST_HEAD(&cpu_buffer->pages);
+ cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
if (!bpage)
goto fail_free_buffer;
+ rb_check_bpage(cpu_buffer, bpage);
+
cpu_buffer->reader_page = bpage;
addr = __get_free_page(GFP_KERNEL);
if (!addr)
@@ -594,9 +1081,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
goto fail_free_reader;
cpu_buffer->head_page
- = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+ rb_head_page_activate(cpu_buffer);
+
return cpu_buffer;
fail_free_reader:
@@ -609,15 +1098,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = &cpu_buffer->pages;
+ struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
free_buffer_page(cpu_buffer->reader_page);
- list_for_each_entry_safe(bpage, tmp, head, list) {
- list_del_init(&bpage->list);
+ rb_head_page_deactivate(cpu_buffer);
+
+ if (head) {
+ list_for_each_entry_safe(bpage, tmp, head, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ bpage = list_entry(head, struct buffer_page, list);
free_buffer_page(bpage);
}
+
kfree(cpu_buffer);
}
@@ -735,6 +1231,7 @@ ring_buffer_free(struct ring_buffer *buffer)
put_online_cpus();
+ kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
kfree(buffer);
@@ -756,26 +1253,25 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
struct list_head *p;
unsigned i;
- atomic_inc(&cpu_buffer->record_disabled);
- synchronize_sched();
+ spin_lock_irq(&cpu_buffer->reader_lock);
+ rb_head_page_deactivate(cpu_buffer);
for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
- return;
- p = cpu_buffer->pages.next;
+ if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
+ goto out;
+ p = cpu_buffer->pages->next;
bpage = list_entry(p, struct buffer_page, list);
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
- if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
- return;
+ if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
+ goto out;
rb_reset_cpu(cpu_buffer);
-
rb_check_pages(cpu_buffer);
- atomic_dec(&cpu_buffer->record_disabled);
-
+out:
+ spin_unlock_irq(&cpu_buffer->reader_lock);
}
static void
@@ -786,22 +1282,22 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct list_head *p;
unsigned i;
- atomic_inc(&cpu_buffer->record_disabled);
- synchronize_sched();
+ spin_lock_irq(&cpu_buffer->reader_lock);
+ rb_head_page_deactivate(cpu_buffer);
for (i = 0; i < nr_pages; i++) {
if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
- return;
+ goto out;
p = pages->next;
bpage = list_entry(p, struct buffer_page, list);
list_del_init(&bpage->list);
- list_add_tail(&bpage->list, &cpu_buffer->pages);
+ list_add_tail(&bpage->list, cpu_buffer->pages);
}
rb_reset_cpu(cpu_buffer);
-
rb_check_pages(cpu_buffer);
- atomic_dec(&cpu_buffer->record_disabled);
+out:
+ spin_unlock_irq(&cpu_buffer->reader_lock);
}
/**
@@ -809,11 +1305,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
* @buffer: the buffer to resize.
* @size: the new size.
*
- * The tracer is responsible for making sure that the buffer is
- * not being used while changing the size.
- * Note: We may be able to change the above requirement by using
- * RCU synchronizations.
- *
* Minimum size is 2 * BUF_PAGE_SIZE.
*
* Returns -1 on failure.
@@ -845,6 +1336,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
if (size == buffer_size)
return size;
+ atomic_inc(&buffer->record_disabled);
+
+ /* Make sure all writers are done with this buffer. */
+ synchronize_sched();
+
mutex_lock(&buffer->mutex);
get_online_cpus();
@@ -907,6 +1403,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
put_online_cpus();
mutex_unlock(&buffer->mutex);
+ atomic_dec(&buffer->record_disabled);
+
return size;
free_pages:
@@ -916,6 +1414,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
}
put_online_cpus();
mutex_unlock(&buffer->mutex);
+ atomic_dec(&buffer->record_disabled);
return -ENOMEM;
/*
@@ -925,6 +1424,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
out_fail:
put_online_cpus();
mutex_unlock(&buffer->mutex);
+ atomic_dec(&buffer->record_disabled);
return -1;
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -948,21 +1448,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
}
static inline struct ring_buffer_event *
-rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return __rb_page_index(cpu_buffer->head_page,
- cpu_buffer->head_page->read);
-}
-
-static inline struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
return __rb_page_index(iter->head_page, iter->head);
}
-static inline unsigned rb_page_write(struct buffer_page *bpage)
+static inline unsigned long rb_page_write(struct buffer_page *bpage)
{
- return local_read(&bpage->write);
+ return local_read(&bpage->write) & RB_WRITE_MASK;
}
static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -970,6 +1463,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
return local_read(&bpage->page->commit);
}
+static inline unsigned long rb_page_entries(struct buffer_page *bpage)
+{
+ return local_read(&bpage->entries) & RB_WRITE_MASK;
+}
+
/* Size is determined by what has been commited */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
@@ -982,22 +1480,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
return rb_page_commit(cpu_buffer->commit_page);
}
-static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return rb_page_commit(cpu_buffer->head_page);
-}
-
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page **bpage)
-{
- struct list_head *p = (*bpage)->list.next;
-
- if (p == &cpu_buffer->pages)
- p = p->next;
-
- *bpage = list_entry(p, struct buffer_page, list);
-}
-
static inline unsigned
rb_event_index(struct ring_buffer_event *event)
{
@@ -1023,6 +1505,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
static void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
+ unsigned long max_count;
+
/*
* We only race with interrupts and NMIs on this CPU.
* If we own the commit event, then we can commit
@@ -1032,9 +1516,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* assign the commit to the tail.
*/
again:
+ max_count = cpu_buffer->buffer->pages * 100;
+
while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
- cpu_buffer->commit_page->page->commit =
- cpu_buffer->commit_page->write;
+ if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+ return;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_is_reader_page(cpu_buffer->tail_page)))
+ return;
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
@@ -1043,8 +1534,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
}
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
- cpu_buffer->commit_page->page->commit =
- cpu_buffer->commit_page->write;
+
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ RB_WARN_ON(cpu_buffer,
+ local_read(&cpu_buffer->commit_page->page->commit) &
+ ~RB_WRITE_MASK);
barrier();
}
@@ -1077,7 +1572,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
* to the head page instead of next.
*/
if (iter->head_page == cpu_buffer->reader_page)
- iter->head_page = cpu_buffer->head_page;
+ iter->head_page = rb_set_head_page(cpu_buffer);
else
rb_inc_page(cpu_buffer, &iter->head_page);
@@ -1085,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->head = 0;
}
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+{
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+
+ /* Not the first event on the page? */
+ if (rb_event_index(event)) {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ } else {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
+
+ return skip_time_extend(event);
+}
+
/**
* ring_buffer_update_event - update event type and data
* @event: the even to update
@@ -1097,28 +1611,188 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
* data field.
*/
static void
-rb_update_event(struct ring_buffer_event *event,
- unsigned type, unsigned length)
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, unsigned length,
+ int add_timestamp, u64 delta)
{
- event->type_len = type;
+ /* Only a commit updates the timestamp */
+ if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+ delta = 0;
+
+ /*
+ * If we need to add a timestamp, then we
+ * add it to the start of the resevered space.
+ */
+ if (unlikely(add_timestamp)) {
+ event = rb_add_time_stamp(event, delta);
+ length -= RB_LEN_TIME_EXTEND;
+ delta = 0;
+ }
+
+ event->time_delta = delta;
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+ event->type_len = 0;
+ event->array[0] = length;
+ } else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+}
+
+/*
+ * rb_handle_head_page - writer hit the head page
+ *
+ * Returns: +1 to retry page
+ * 0 to continue
+ * -1 on error
+ */
+static int
+rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ struct buffer_page *next_page)
+{
+ struct buffer_page *new_head;
+ int entries;
+ int type;
+ int ret;
+
+ entries = rb_page_entries(next_page);
+
+ /*
+ * The hard part is here. We need to move the head
+ * forward, and protect against both readers on
+ * other CPUs and writers coming in via interrupts.
+ */
+ type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
+ RB_PAGE_HEAD);
+
+ /*
+ * type can be one of four:
+ * NORMAL - an interrupt already moved it for us
+ * HEAD - we are the first to get here.
+ * UPDATE - we are the interrupt interrupting
+ * a current move.
+ * MOVED - a reader on another CPU moved the next
+ * pointer to its reader page. Give up
+ * and try again.
+ */
switch (type) {
+ case RB_PAGE_HEAD:
+ /*
+ * We changed the head to UPDATE, thus
+ * it is our responsibility to update
+ * the counters.
+ */
+ local_add(entries, &cpu_buffer->overrun);
- case RINGBUF_TYPE_PADDING:
- case RINGBUF_TYPE_TIME_EXTEND:
- case RINGBUF_TYPE_TIME_STAMP:
+ /*
+ * The entries will be zeroed out when we move the
+ * tail page.
+ */
+
+ /* still more to do */
break;
- case 0:
- length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA)
- event->array[0] = length;
- else
- event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+ case RB_PAGE_UPDATE:
+ /*
+ * This is an interrupt that interrupt the
+ * previous update. Still more to do.
+ */
break;
+ case RB_PAGE_NORMAL:
+ /*
+ * An interrupt came in before the update
+ * and processed this for us.
+ * Nothing left to do.
+ */
+ return 1;
+ case RB_PAGE_MOVED:
+ /*
+ * The reader is on another CPU and just did
+ * a swap with our next_page.
+ * Try again.
+ */
+ return 1;
default:
- BUG();
+ RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
+ return -1;
+ }
+
+ /*
+ * Now that we are here, the old head pointer is
+ * set to UPDATE. This will keep the reader from
+ * swapping the head page with the reader page.
+ * The reader (on another CPU) will spin till
+ * we are finished.
+ *
+ * We just need to protect against interrupts
+ * doing the job. We will set the next pointer
+ * to HEAD. After that, we set the old pointer
+ * to NORMAL, but only if it was HEAD before.
+ * otherwise we are an interrupt, and only
+ * want the outer most commit to reset it.
+ */
+ new_head = next_page;
+ rb_inc_page(cpu_buffer, &new_head);
+
+ ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
+ RB_PAGE_NORMAL);
+
+ /*
+ * Valid returns are:
+ * HEAD - an interrupt came in and already set it.
+ * NORMAL - One of two things:
+ * 1) We really set it.
+ * 2) A bunch of interrupts came in and moved
+ * the page forward again.
+ */
+ switch (ret) {
+ case RB_PAGE_HEAD:
+ case RB_PAGE_NORMAL:
+ /* OK */
+ break;
+ default:
+ RB_WARN_ON(cpu_buffer, 1);
+ return -1;
}
+
+ /*
+ * It is possible that an interrupt came in,
+ * set the head up, then more interrupts came in
+ * and moved it again. When we get back here,
+ * the page would have been set to NORMAL but we
+ * just set it back to HEAD.
+ *
+ * How do you detect this? Well, if that happened
+ * the tail page would have moved.
+ */
+ if (ret == RB_PAGE_NORMAL) {
+ /*
+ * If the tail had moved passed next, then we need
+ * to reset the pointer.
+ */
+ if (cpu_buffer->tail_page != tail_page &&
+ cpu_buffer->tail_page != next_page)
+ rb_head_page_set_normal(cpu_buffer, new_head,
+ next_page,
+ RB_PAGE_HEAD);
+ }
+
+ /*
+ * If this was the outer most commit (the one that
+ * changed the original pointer from HEAD to UPDATE),
+ * then it is up to us to reset it to NORMAL.
+ */
+ if (type == RB_PAGE_HEAD) {
+ ret = rb_head_page_set_normal(cpu_buffer, next_page,
+ tail_page,
+ RB_PAGE_UPDATE);
+ if (RB_WARN_ON(cpu_buffer,
+ ret != RB_PAGE_UPDATE))
+ return -1;
+ }
+
+ return 0;
}
static unsigned rb_calculate_event_length(unsigned length)
@@ -1129,11 +1803,11 @@ static unsigned rb_calculate_event_length(unsigned length)
if (!length)
length = 1;
- if (length > RB_MAX_SMALL_DATA)
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ALIGNMENT);
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
return length;
}
@@ -1150,6 +1824,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* must fill the old tail_page with padding.
*/
if (tail >= BUF_PAGE_SIZE) {
+ /*
+ * If the page was filled, then we still need
+ * to update the real_end. Reset it to zero
+ * and the reader will ignore it.
+ */
+ if (tail == BUF_PAGE_SIZE)
+ tail_page->real_end = 0;
+
local_sub(length, &tail_page->write);
return;
}
@@ -1158,6 +1840,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
kmemcheck_annotate_bitfield(event, bitfield);
/*
+ * Save the original length to the meta data.
+ * This will be used by the reader to add lost event
+ * counter.
+ */
+ tail_page->real_end = tail;
+
+ /*
* If this event is bigger than the minimum size, then
* we need to be careful that we don't subtract the
* write counter enough to allow another writer to slip
@@ -1184,111 +1873,108 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
- /* Account for this as an entry */
- local_inc(&tail_page->entries);
- local_inc(&cpu_buffer->entries);
/* Set write to end of buffer */
length = (tail + length) - BUF_PAGE_SIZE;
local_sub(length, &tail_page->write);
}
-static struct ring_buffer_event *
+/*
+ * This is the slow path, force gcc not to inline it.
+ */
+static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length, unsigned long tail,
- struct buffer_page *commit_page,
- struct buffer_page *tail_page, u64 *ts)
+ struct buffer_page *tail_page, u64 ts)
{
- struct buffer_page *next_page, *head_page, *reader_page;
+ struct buffer_page *commit_page = cpu_buffer->commit_page;
struct ring_buffer *buffer = cpu_buffer->buffer;
- bool lock_taken = false;
- unsigned long flags;
+ struct buffer_page *next_page;
+ int ret;
next_page = tail_page;
- local_irq_save(flags);
- /*
- * Since the write to the buffer is still not
- * fully lockless, we must be careful with NMIs.
- * The locks in the writers are taken when a write
- * crosses to a new page. The locks protect against
- * races with the readers (this will soon be fixed
- * with a lockless solution).
- *
- * Because we can not protect against NMIs, and we
- * want to keep traces reentrant, we need to manage
- * what happens when we are in an NMI.
- *
- * NMIs can happen after we take the lock.
- * If we are in an NMI, only take the lock
- * if it is not already taken. Otherwise
- * simply fail.
- */
- if (unlikely(in_nmi())) {
- if (!__raw_spin_trylock(&cpu_buffer->lock)) {
- cpu_buffer->nmi_dropped++;
- goto out_reset;
- }
- } else
- __raw_spin_lock(&cpu_buffer->lock);
-
- lock_taken = true;
-
rb_inc_page(cpu_buffer, &next_page);
- head_page = cpu_buffer->head_page;
- reader_page = cpu_buffer->reader_page;
-
- /* we grabbed the lock before incrementing */
- if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
- goto out_reset;
-
/*
* If for some reason, we had an interrupt storm that made
* it all the way around the buffer, bail, and warn
* about it.
*/
if (unlikely(next_page == commit_page)) {
- cpu_buffer->commit_overrun++;
+ local_inc(&cpu_buffer->commit_overrun);
goto out_reset;
}
- if (next_page == head_page) {
- if (!(buffer->flags & RB_FL_OVERWRITE))
- goto out_reset;
-
- /* tail_page has not moved yet? */
- if (tail_page == cpu_buffer->tail_page) {
- /* count overflows */
- cpu_buffer->overrun +=
- local_read(&head_page->entries);
+ /*
+ * This is where the fun begins!
+ *
+ * We are fighting against races between a reader that
+ * could be on another CPU trying to swap its reader
+ * page with the buffer head.
+ *
+ * We are also fighting against interrupts coming in and
+ * moving the head or tail on us as well.
+ *
+ * If the next page is the head page then we have filled
+ * the buffer, unless the commit page is still on the
+ * reader page.
+ */
+ if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
- rb_inc_page(cpu_buffer, &head_page);
- cpu_buffer->head_page = head_page;
- cpu_buffer->head_page->read = 0;
+ /*
+ * If the commit is not on the reader page, then
+ * move the header page.
+ */
+ if (!rb_is_reader_page(cpu_buffer->commit_page)) {
+ /*
+ * If we are not in overwrite mode,
+ * this is easy, just stop here.
+ */
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ goto out_reset;
+
+ ret = rb_handle_head_page(cpu_buffer,
+ tail_page,
+ next_page);
+ if (ret < 0)
+ goto out_reset;
+ if (ret)
+ goto out_again;
+ } else {
+ /*
+ * We need to be careful here too. The
+ * commit page could still be on the reader
+ * page. We could have a small buffer, and
+ * have filled up the buffer with events
+ * from interrupts and such, and wrapped.
+ *
+ * Note, if the tail page is also the on the
+ * reader_page, we let it move out.
+ */
+ if (unlikely((cpu_buffer->commit_page !=
+ cpu_buffer->tail_page) &&
+ (cpu_buffer->commit_page ==
+ cpu_buffer->reader_page))) {
+ local_inc(&cpu_buffer->commit_overrun);
+ goto out_reset;
+ }
}
}
- /*
- * If the tail page is still the same as what we think
- * it is, then it is up to us to update the tail
- * pointer.
- */
- if (tail_page == cpu_buffer->tail_page) {
- local_set(&next_page->write, 0);
- local_set(&next_page->entries, 0);
- local_set(&next_page->page->commit, 0);
- cpu_buffer->tail_page = next_page;
-
- /* reread the time stamp */
- *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
- cpu_buffer->tail_page->page->time_stamp = *ts;
+ ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
+ if (ret) {
+ /*
+ * Nested commits always have zero deltas, so
+ * just reread the time stamp
+ */
+ ts = rb_time_stamp(buffer);
+ next_page->page->time_stamp = ts;
}
- rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ out_again:
- __raw_spin_unlock(&cpu_buffer->lock);
- local_irq_restore(flags);
+ rb_reset_tail(cpu_buffer, tail_page, tail, length);
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
@@ -1297,48 +1983,52 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* reset write */
rb_reset_tail(cpu_buffer, tail_page, tail, length);
- if (likely(lock_taken))
- __raw_spin_unlock(&cpu_buffer->lock);
- local_irq_restore(flags);
return NULL;
}
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned type, unsigned long length, u64 *ts)
+ unsigned long length, u64 ts,
+ u64 delta, int add_timestamp)
{
- struct buffer_page *tail_page, *commit_page;
+ struct buffer_page *tail_page;
struct ring_buffer_event *event;
unsigned long tail, write;
- commit_page = cpu_buffer->commit_page;
- /* we just need to protect against interrupts */
- barrier();
+ /*
+ * If the time delta since the last event is too big to
+ * hold in the time field of the event, then we append a
+ * TIME EXTEND event ahead of the data event.
+ */
+ if (unlikely(add_timestamp))
+ length += RB_LEN_TIME_EXTEND;
+
tail_page = cpu_buffer->tail_page;
write = local_add_return(length, &tail_page->write);
+
+ /* set write to only the index of the write */
+ write &= RB_WRITE_MASK;
tail = write - length;
/* See if we shot pass the end of this buffer page */
- if (write > BUF_PAGE_SIZE)
+ if (unlikely(write > BUF_PAGE_SIZE))
return rb_move_tail(cpu_buffer, length, tail,
- commit_page, tail_page, ts);
+ tail_page, ts);
/* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
kmemcheck_annotate_bitfield(event, bitfield);
- rb_update_event(event, type, length);
+ rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
- /* The passed in type is zero for DATA */
- if (likely(!type))
- local_inc(&tail_page->entries);
+ local_inc(&tail_page->entries);
/*
* If this is the first commit on the page, then update
* its timestamp.
*/
if (!tail)
- tail_page->page->time_stamp = *ts;
+ tail_page->page->time_stamp = ts;
return event;
}
@@ -1353,19 +2043,23 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long addr;
new_index = rb_event_index(event);
- old_index = new_index + rb_event_length(event);
+ old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
addr &= PAGE_MASK;
bpage = cpu_buffer->tail_page;
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+ unsigned long write_mask =
+ local_read(&bpage->write) & ~RB_WRITE_MASK;
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
* and write to the next page. That is fine
* because we just shorten what is on this page.
*/
+ old_index += write_mask;
+ new_index += write_mask;
index = local_cmpxchg(&bpage->write, old_index, new_index);
if (index == old_index)
return 1;
@@ -1375,80 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
}
-static int
-rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- u64 *ts, u64 *delta)
-{
- struct ring_buffer_event *event;
- static int once;
- int ret;
-
- if (unlikely(*delta > (1ULL << 59) && !once++)) {
- printk(KERN_WARNING "Delta way too big! %llu"
- " ts=%llu write stamp = %llu\n",
- (unsigned long long)*delta,
- (unsigned long long)*ts,
- (unsigned long long)cpu_buffer->write_stamp);
- WARN_ON(1);
- }
-
- /*
- * The delta is too big, we to add a
- * new timestamp.
- */
- event = __rb_reserve_next(cpu_buffer,
- RINGBUF_TYPE_TIME_EXTEND,
- RB_LEN_TIME_EXTEND,
- ts);
- if (!event)
- return -EBUSY;
-
- if (PTR_ERR(event) == -EAGAIN)
- return -EAGAIN;
-
- /* Only a commited time event can update the write stamp */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * If this is the first on the page, then it was
- * updated with the page itself. Try to discard it
- * and if we can't just make it zero.
- */
- if (rb_event_index(event)) {
- event->time_delta = *delta & TS_MASK;
- event->array[0] = *delta >> TS_SHIFT;
- } else {
- /* try to discard, since we do not need this */
- if (!rb_try_to_discard(cpu_buffer, event)) {
- /* nope, just zero it */
- event->time_delta = 0;
- event->array[0] = 0;
- }
- }
- cpu_buffer->write_stamp = *ts;
- /* let the caller know this was the commit */
- ret = 1;
- } else {
- /* Try to discard the event */
- if (!rb_try_to_discard(cpu_buffer, event)) {
- /* Darn, this is just wasted space */
- event->time_delta = 0;
- event->array[0] = 0;
- }
- ret = 0;
- }
-
- *delta = 0;
-
- return ret;
-}
-
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->committing);
local_inc(&cpu_buffer->commits);
}
-static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@@ -1481,18 +2108,38 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
}
static struct ring_buffer_event *
-rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+rb_reserve_next_event(struct ring_buffer *buffer,
+ struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length)
{
struct ring_buffer_event *event;
- u64 ts, delta = 0;
- int commit = 0;
+ u64 ts, delta;
int nr_loops = 0;
+ int add_timestamp;
+ u64 diff;
rb_start_commit(cpu_buffer);
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ /*
+ * Due to the ability to swap a cpu buffer from a buffer
+ * it is possible it was swapped before we committed.
+ * (committing stops a swap). We check for it here and
+ * if it happened, we have to fail the write.
+ */
+ barrier();
+ if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ local_dec(&cpu_buffer->committing);
+ local_dec(&cpu_buffer->commits);
+ return NULL;
+ }
+#endif
+
length = rb_calculate_event_length(length);
again:
+ add_timestamp = 0;
+ delta = 0;
+
/*
* We allow for interrupts to reenter here and do a trace.
* If one does, it will cause this original code to loop
@@ -1505,57 +2152,33 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
goto out_fail;
- ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+ ts = rb_time_stamp(cpu_buffer->buffer);
+ diff = ts - cpu_buffer->write_stamp;
- /*
- * Only the first commit can update the timestamp.
- * Yes there is a race here. If an interrupt comes in
- * just after the conditional and it traces too, then it
- * will also check the deltas. More than one timestamp may
- * also be made. But only the entry that did the actual
- * commit will be something other than zero.
- */
- if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
- rb_page_write(cpu_buffer->tail_page) ==
- rb_commit_index(cpu_buffer))) {
- u64 diff;
-
- diff = ts - cpu_buffer->write_stamp;
-
- /* make sure this diff is calculated here */
- barrier();
-
- /* Did the write stamp get updated already? */
- if (unlikely(ts < cpu_buffer->write_stamp))
- goto get_event;
+ /* make sure this diff is calculated here */
+ barrier();
+ /* Did the write stamp get updated already? */
+ if (likely(ts >= cpu_buffer->write_stamp)) {
delta = diff;
if (unlikely(test_time_stamp(delta))) {
-
- commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
- if (commit == -EBUSY)
- goto out_fail;
-
- if (commit == -EAGAIN)
- goto again;
-
- RB_WARN_ON(cpu_buffer, commit < 0);
+ WARN_ONCE(delta > (1ULL << 59),
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+ (unsigned long long)delta,
+ (unsigned long long)ts,
+ (unsigned long long)cpu_buffer->write_stamp);
+ add_timestamp = 1;
}
}
- get_event:
- event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+ event = __rb_reserve_next(cpu_buffer, length, ts,
+ delta, add_timestamp);
if (unlikely(PTR_ERR(event) == -EAGAIN))
goto again;
if (!event)
goto out_fail;
- if (!rb_event_is_commit(cpu_buffer, event))
- delta = 0;
-
- event->time_delta = delta;
-
return event;
out_fail:
@@ -1563,15 +2186,13 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
return NULL;
}
+#ifdef CONFIG_TRACING
+
#define TRACE_RECURSIVE_DEPTH 16
-static int trace_recursive_lock(void)
+/* Keep this code out of the fast path cache */
+static noinline void trace_recursive_fail(void)
{
- current->trace_recursion++;
-
- if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
- return 0;
-
/* Disable all tracing before we do anything else */
tracing_off_permanent();
@@ -1583,17 +2204,33 @@ static int trace_recursive_lock(void)
in_nmi());
WARN_ON_ONCE(1);
+}
+
+static inline int trace_recursive_lock(void)
+{
+ current->trace_recursion++;
+
+ if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+ return 0;
+
+ trace_recursive_fail();
+
return -1;
}
-static void trace_recursive_unlock(void)
+static inline void trace_recursive_unlock(void)
{
WARN_ON_ONCE(!current->trace_recursion);
current->trace_recursion--;
}
-static DEFINE_PER_CPU(int, rb_need_resched);
+#else
+
+#define trace_recursive_lock() (0)
+#define trace_recursive_unlock() do { } while (0)
+
+#endif
/**
* ring_buffer_lock_reserve - reserve a part of the buffer
@@ -1615,16 +2252,16 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
- int cpu, resched;
+ int cpu;
if (ring_buffer_flags != RB_BUFFERS_ON)
return NULL;
- if (atomic_read(&buffer->record_disabled))
- return NULL;
-
/* If we are tracing schedule, we don't want to recurse */
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
+
+ if (atomic_read(&buffer->record_disabled))
+ goto out_nocheck;
if (trace_recursive_lock())
goto out_nocheck;
@@ -1642,41 +2279,54 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
if (length > BUF_MAX_DATA_SIZE)
goto out;
- event = rb_reserve_next_event(cpu_buffer, length);
+ event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
goto out;
- /*
- * Need to store resched state on this cpu.
- * Only the first needs to.
- */
-
- if (preempt_count() == 1)
- per_cpu(rb_need_resched, cpu) = resched;
-
return event;
out:
trace_recursive_unlock();
out_nocheck:
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
return NULL;
}
EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
- local_inc(&cpu_buffer->entries);
+ u64 delta;
/*
* The event first in the commit queue updates the
* time stamp.
*/
- if (rb_event_is_commit(cpu_buffer, event))
- cpu_buffer->write_stamp += event->time_delta;
+ if (rb_event_is_commit(cpu_buffer, event)) {
+ /*
+ * A commit event that is first on a page
+ * updates the write timestamp with the page stamp
+ */
+ if (!rb_event_index(event))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->write_stamp += delta;
+ } else
+ cpu_buffer->write_stamp += event->time_delta;
+ }
+}
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ local_inc(&cpu_buffer->entries);
+ rb_update_write_stamp(cpu_buffer, event);
rb_end_commit(cpu_buffer);
}
@@ -1701,13 +2351,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
trace_recursive_unlock();
- /*
- * Only the last preempt count needs to restore preemption.
- */
- if (preempt_count() == 1)
- ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
- else
- preempt_enable_no_resched_notrace();
+ preempt_enable_notrace();
return 0;
}
@@ -1715,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static inline void rb_event_discard(struct ring_buffer_event *event)
{
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+
/* array[0] holds the actual length for the discarded event */
event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
@@ -1723,32 +2370,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-/**
- * ring_buffer_event_discard - discard any event in the ring buffer
- * @event: the event to discard
- *
- * Sometimes a event that is in the ring buffer needs to be ignored.
- * This function lets the user discard an event in the ring buffer
- * and then that event will not be read later.
- *
- * Note, it is up to the user to be careful with this, and protect
- * against races. If the user discards an event that has been consumed
- * it is possible that it could corrupt the ring buffer.
+/*
+ * Decrement the entries to the page that an event is on.
+ * The event does not even need to exist, only the pointer
+ * to the page it is on. This may only be called before the commit
+ * takes place.
*/
-void ring_buffer_event_discard(struct ring_buffer_event *event)
+static inline void
+rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
{
- rb_event_discard(event);
+ unsigned long addr = (unsigned long)event;
+ struct buffer_page *bpage = cpu_buffer->commit_page;
+ struct buffer_page *start;
+
+ addr &= PAGE_MASK;
+
+ /* Do the likely case first */
+ if (likely(bpage->page == (void *)addr)) {
+ local_dec(&bpage->entries);
+ return;
+ }
+
+ /*
+ * Because the commit page may be on the reader page we
+ * start with the next page and check the end loop there.
+ */
+ rb_inc_page(cpu_buffer, &bpage);
+ start = bpage;
+ do {
+ if (bpage->page == (void *)addr) {
+ local_dec(&bpage->entries);
+ return;
+ }
+ rb_inc_page(cpu_buffer, &bpage);
+ } while (bpage != start);
+
+ /* commit not part of this buffer?? */
+ RB_WARN_ON(cpu_buffer, 1);
}
-EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
/**
* ring_buffer_commit_discard - discard an event that has not been committed
* @buffer: the ring buffer
* @event: non committed event to discard
*
- * This is similar to ring_buffer_event_discard but must only be
- * performed on an event that has not been committed yet. The difference
- * is that this will also try to free the event from the ring buffer
+ * Sometimes an event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * This function only works if it is called before the the item has been
+ * committed. It will try to free the event from the ring buffer
* if another event has not been added behind it.
*
* If another event has been added behind it, it will set the event
@@ -1776,26 +2448,21 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
*/
RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
- if (!rb_try_to_discard(cpu_buffer, event))
+ rb_decrement_entry(cpu_buffer, event);
+ if (rb_try_to_discard(cpu_buffer, event))
goto out;
/*
* The commit is still visible by the reader, so we
- * must increment entries.
+ * must still update the timestamp.
*/
- local_inc(&cpu_buffer->entries);
+ rb_update_write_stamp(cpu_buffer, event);
out:
rb_end_commit(cpu_buffer);
trace_recursive_unlock();
- /*
- * Only the last preempt count needs to restore preemption.
- */
- if (preempt_count() == 1)
- ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
- else
- preempt_enable_no_resched_notrace();
+ preempt_enable_notrace();
}
EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -1821,15 +2488,15 @@ int ring_buffer_write(struct ring_buffer *buffer,
struct ring_buffer_event *event;
void *body;
int ret = -EBUSY;
- int cpu, resched;
+ int cpu;
if (ring_buffer_flags != RB_BUFFERS_ON)
return -EBUSY;
- if (atomic_read(&buffer->record_disabled))
- return -EBUSY;
+ preempt_disable_notrace();
- resched = ftrace_preempt_disable();
+ if (atomic_read(&buffer->record_disabled))
+ goto out;
cpu = raw_smp_processor_id();
@@ -1844,7 +2511,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
if (length > BUF_MAX_DATA_SIZE)
goto out;
- event = rb_reserve_next_event(cpu_buffer, length);
+ event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
goto out;
@@ -1856,7 +2523,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
ret = 0;
out:
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
return ret;
}
@@ -1865,9 +2532,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = cpu_buffer->reader_page;
- struct buffer_page *head = cpu_buffer->head_page;
+ struct buffer_page *head = rb_set_head_page(cpu_buffer);
struct buffer_page *commit = cpu_buffer->commit_page;
+ /* In case of error, head will be NULL */
+ if (unlikely(!head))
+ return 1;
+
return reader->read == rb_page_commit(reader) &&
(commit == reader ||
(commit == head &&
@@ -1894,7 +2565,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
* @buffer: The ring buffer to enable writes
*
* Note, multiple disables will need the same number of enables
- * to truely enable the writing (much like preempt_disable).
+ * to truly enable the writing (much like preempt_disable).
*/
void ring_buffer_record_enable(struct ring_buffer *buffer)
{
@@ -1930,7 +2601,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
* @cpu: The CPU to enable.
*
* Note, multiple disables will need the same number of enables
- * to truely enable the writing (much like preempt_disable).
+ * to truly enable the writing (much like preempt_disable).
*/
void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
{
@@ -1944,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return local_read(&cpu_buffer->entries) -
+ (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
+
/**
* ring_buffer_entries_cpu - get the number of entries in a cpu buffer
* @buffer: The ring buffer
@@ -1952,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
cpu_buffer = buffer->buffers[cpu];
- ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
- - cpu_buffer->read;
- return ret;
+ return rb_num_of_entries(cpu_buffer);
}
EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
@@ -1979,33 +2660,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
return 0;
cpu_buffer = buffer->buffers[cpu];
- ret = cpu_buffer->overrun;
+ ret = local_read(&cpu_buffer->overrun);
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
/**
- * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
- * @buffer: The ring buffer
- * @cpu: The per CPU buffer to get the number of overruns from
- */
-unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long ret;
-
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return 0;
-
- cpu_buffer = buffer->buffers[cpu];
- ret = cpu_buffer->nmi_dropped;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
-
-/**
* ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
@@ -2020,7 +2681,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
return 0;
cpu_buffer = buffer->buffers[cpu];
- ret = cpu_buffer->commit_overrun;
+ ret = local_read(&cpu_buffer->commit_overrun);
return ret;
}
@@ -2042,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
/* if you care about this being correct, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- entries += (local_read(&cpu_buffer->entries) -
- cpu_buffer->overrun) - cpu_buffer->read;
+ entries += rb_num_of_entries(cpu_buffer);
}
return entries;
@@ -2051,7 +2711,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_entries);
/**
- * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * ring_buffer_overruns - get the number of overruns in buffer
* @buffer: The ring buffer
*
* Returns the total number of overruns in the ring buffer
@@ -2066,7 +2726,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
/* if you care about this being correct, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- overruns += cpu_buffer->overrun;
+ overruns += local_read(&cpu_buffer->overrun);
}
return overruns;
@@ -2079,8 +2739,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
/* Iterator usage is expected to have record disabled */
if (list_empty(&cpu_buffer->reader_page->list)) {
- iter->head_page = cpu_buffer->head_page;
- iter->head = cpu_buffer->head_page->read;
+ iter->head_page = rb_set_head_page(cpu_buffer);
+ if (unlikely(!iter->head_page))
+ return;
+ iter->head = iter->head_page->read;
} else {
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
@@ -2089,6 +2751,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->read_stamp = cpu_buffer->read_stamp;
else
iter->read_stamp = iter->head_page->page->time_stamp;
+ iter->cache_reader_page = cpu_buffer->reader_page;
+ iter->cache_read = cpu_buffer->read;
}
/**
@@ -2195,11 +2859,13 @@ static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
+ int ret;
local_irq_save(flags);
- __raw_spin_lock(&cpu_buffer->lock);
+ arch_spin_lock(&cpu_buffer->lock);
again:
/*
@@ -2230,39 +2896,83 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
goto out;
/*
- * Splice the empty reader page into the list around the head.
* Reset the reader page to size zero.
*/
-
- reader = cpu_buffer->head_page;
- cpu_buffer->reader_page->list.next = reader->list.next;
- cpu_buffer->reader_page->list.prev = reader->list.prev;
-
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
+ cpu_buffer->reader_page->real_end = 0;
+
+ spin:
+ /*
+ * Splice the empty reader page into the list around the head.
+ */
+ reader = rb_set_head_page(cpu_buffer);
+ cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
+ cpu_buffer->reader_page->list.prev = reader->list.prev;
+
+ /*
+ * cpu_buffer->pages just needs to point to the buffer, it
+ * has no specific buffer page to point to. Lets move it out
+ * of our way so we don't accidently swap it.
+ */
+ cpu_buffer->pages = reader->list.prev;
- /* Make the reader page now replace the head */
- reader->list.prev->next = &cpu_buffer->reader_page->list;
- reader->list.next->prev = &cpu_buffer->reader_page->list;
+ /* The reader page will be pointing to the new head */
+ rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
/*
- * If the tail is on the reader, then we must set the head
- * to the inserted page, otherwise we set it one before.
+ * We want to make sure we read the overruns after we set up our
+ * pointers to the next object. The writer side does a
+ * cmpxchg to cross pages which acts as the mb on the writer
+ * side. Note, the reader will constantly fail the swap
+ * while the writer is updating the pointers, so this
+ * guarantees that the overwrite recorded here is the one we
+ * want to compare with the last_overrun.
*/
- cpu_buffer->head_page = cpu_buffer->reader_page;
+ smp_mb();
+ overwrite = local_read(&(cpu_buffer->overrun));
- if (cpu_buffer->commit_page != reader)
- rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ /*
+ * Here's the tricky part.
+ *
+ * We need to move the pointer past the header page.
+ * But we can only do that if a writer is not currently
+ * moving it. The page before the header page has the
+ * flag bit '1' set if it is pointing to the page we want.
+ * but if the writer is in the process of moving it
+ * than it will be '2' or already moved '0'.
+ */
+
+ ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
+
+ /*
+ * If we did not convert it, then we must try again.
+ */
+ if (!ret)
+ goto spin;
+
+ /*
+ * Yeah! We succeeded in replacing the page.
+ *
+ * Now make the new head point back to the reader page.
+ */
+ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
/* Finally update the reader page to the new head */
cpu_buffer->reader_page = reader;
rb_reset_reader_page(cpu_buffer);
+ if (overwrite != cpu_buffer->last_overrun) {
+ cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
+ cpu_buffer->last_overrun = overwrite;
+ }
+
goto again;
out:
- __raw_spin_unlock(&cpu_buffer->lock);
+ arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
return reader;
@@ -2282,8 +2992,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
event = rb_reader_event(cpu_buffer);
- if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
- || rb_discarded_event(event))
+ if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
cpu_buffer->read++;
rb_update_read_stamp(cpu_buffer, event);
@@ -2294,13 +3003,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
static void rb_advance_iter(struct ring_buffer_iter *iter)
{
- struct ring_buffer *buffer;
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
unsigned length;
cpu_buffer = iter->cpu_buffer;
- buffer = cpu_buffer->buffer;
/*
* Check if we are at the end of the buffer.
@@ -2336,24 +3043,27 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
rb_advance_iter(iter);
}
+static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->lost_events;
+}
+
static struct ring_buffer_event *
-rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
+ unsigned long *lost_events)
{
- struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
struct buffer_page *reader;
int nr_loops = 0;
- cpu_buffer = buffer->buffers[cpu];
-
again:
/*
- * We repeat when a timestamp is encountered. It is possible
- * to get multiple timestamps from an interrupt entering just
- * as one timestamp is about to be written, or from discarded
- * commits. The most that we can have is the number on a single page.
+ * We repeat when a time extend is encountered.
+ * Since the time extend is always attached to a data event,
+ * we should never loop more than once.
+ * (We never hit the following condition more than twice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
reader = rb_get_reader_page(cpu_buffer);
@@ -2374,7 +3084,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
* the box. Return the padding, and we will release
* the current locks, and try again.
*/
- rb_advance_reader(cpu_buffer);
return event;
case RINGBUF_TYPE_TIME_EXTEND:
@@ -2390,9 +3099,11 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
case RINGBUF_TYPE_DATA:
if (ts) {
*ts = cpu_buffer->read_stamp + event->time_delta;
- ring_buffer_normalize_time_stamp(buffer,
+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
+ if (lost_events)
+ *lost_events = rb_lost_events(cpu_buffer);
return event;
default:
@@ -2411,27 +3122,39 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_event *event;
int nr_loops = 0;
- if (ring_buffer_iter_empty(iter))
- return NULL;
-
cpu_buffer = iter->cpu_buffer;
buffer = cpu_buffer->buffer;
+ /*
+ * Check if someone performed a consuming read to
+ * the buffer. A consuming read invalidates the iterator
+ * and we need to reset the iterator in this case.
+ */
+ if (unlikely(iter->cache_read != cpu_buffer->read ||
+ iter->cache_reader_page != cpu_buffer->reader_page))
+ rb_iter_reset(iter);
+
again:
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
/*
- * We repeat when a timestamp is encountered.
- * We can get multiple timestamps by nested interrupts or also
- * if filtering is on (discarding commits). Since discarding
- * commits can be frequent we can get a lot of timestamps.
- * But we limit them by not adding timestamps if they begin
- * at the start of a page.
+ * We repeat when a time extend is encountered.
+ * Since the time extend is always attached to a data event,
+ * we should never loop more than once.
+ * (We never hit the following condition more than twice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
+ if (iter->head >= local_read(&iter->head_page->page->commit)) {
+ rb_inc_iter(iter);
+ goto again;
+ }
+
event = rb_iter_head_event(iter);
switch (event->type_len) {
@@ -2477,7 +3200,7 @@ static inline int rb_ok_to_lock(void)
* buffer too. A one time deal is all you get from reading
* the ring buffer from an NMI.
*/
- if (likely(!in_nmi() && !oops_in_progress))
+ if (likely(!in_nmi()))
return 1;
tracing_off_permanent();
@@ -2489,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
* @buffer: The ring buffer to read
* @cpu: The cpu to peak at
* @ts: The timestamp counter of this event.
+ * @lost_events: a variable to store if events were lost (may be NULL)
*
* This will return the event that will be read next, but does
* not consume the data.
*/
struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+ unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -2509,15 +3234,15 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
local_irq_save(flags);
if (dolock)
spin_lock(&cpu_buffer->reader_lock);
- event = rb_buffer_peek(buffer, cpu, ts);
+ event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ rb_advance_reader(cpu_buffer);
if (dolock)
spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
- if (event && event->type_len == RINGBUF_TYPE_PADDING) {
- cpu_relax();
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
goto again;
- }
return event;
}
@@ -2542,10 +3267,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
event = rb_iter_peek(iter, ts);
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (event && event->type_len == RINGBUF_TYPE_PADDING) {
- cpu_relax();
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
goto again;
- }
return event;
}
@@ -2553,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
/**
* ring_buffer_consume - return an event and consume it
* @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: a variable to store the timestamp (may be NULL)
+ * @lost_events: a variable to store if events were lost (may be NULL)
*
* Returns the next event in the ring buffer, and that event is consumed.
* Meaning, that sequential reads will keep returning a different event,
* and eventually empty the ring buffer if the producer is slower.
*/
struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+ unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event = NULL;
@@ -2580,13 +3307,12 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
if (dolock)
spin_lock(&cpu_buffer->reader_lock);
- event = rb_buffer_peek(buffer, cpu, ts);
- if (!event)
- goto out_unlock;
-
- rb_advance_reader(cpu_buffer);
+ event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+ if (event) {
+ cpu_buffer->lost_events = 0;
+ rb_advance_reader(cpu_buffer);
+ }
- out_unlock:
if (dolock)
spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
@@ -2594,33 +3320,38 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
out:
preempt_enable();
- if (event && event->type_len == RINGBUF_TYPE_PADDING) {
- cpu_relax();
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
goto again;
- }
return event;
}
EXPORT_SYMBOL_GPL(ring_buffer_consume);
/**
- * ring_buffer_read_start - start a non consuming read of the buffer
+ * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
*
- * This starts up an iteration through the buffer. It also disables
- * the recording to the buffer until the reading is finished.
- * This prevents the reading from being corrupted. This is not
- * a consuming read, so a producer is not expected.
+ * This performs the initial preparations necessary to iterate
+ * through the buffer. Memory is allocated, buffer recording
+ * is disabled, and the iterator pointer is returned to the caller.
*
- * Must be paired with ring_buffer_finish.
+ * Disabling buffer recordng prevents the reading from being
+ * corrupted. This is not a consuming read, so a producer is not
+ * expected.
+ *
+ * After a sequence of ring_buffer_read_prepare calls, the user is
+ * expected to make at least one call to ring_buffer_prepare_sync.
+ * Afterwards, ring_buffer_read_start is invoked to get things going
+ * for real.
+ *
+ * This overall must be paired with ring_buffer_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
- unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
@@ -2634,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
iter->cpu_buffer = cpu_buffer;
atomic_inc(&cpu_buffer->record_disabled);
+
+ return iter;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+
+/**
+ * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
+ *
+ * All previously invoked ring_buffer_read_prepare calls to prepare
+ * iterators will be synchronized. Afterwards, read_buffer_read_start
+ * calls on those iterators are allowed.
+ */
+void
+ring_buffer_read_prepare_sync(void)
+{
synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @iter: The iterator returned by ring_buffer_read_prepare
+ *
+ * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and
+ * an intervening ring_buffer_read_prepare_sync must have been
+ * performed.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+void
+ring_buffer_read_start(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ if (!iter)
+ return;
+
+ cpu_buffer = iter->cpu_buffer;
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- __raw_spin_lock(&cpu_buffer->lock);
+ arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
- __raw_spin_unlock(&cpu_buffer->lock);
+ arch_spin_unlock(&cpu_buffer->lock);
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- return iter;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -2677,21 +3445,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
unsigned long flags;
- again:
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ again:
event = rb_iter_peek(iter, ts);
if (!event)
goto out;
+ if (event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
rb_advance_iter(iter);
out:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (event && event->type_len == RINGBUF_TYPE_PADDING) {
- cpu_relax();
- goto again;
- }
-
return event;
}
EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2709,8 +3475,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
+ rb_head_page_deactivate(cpu_buffer);
+
cpu_buffer->head_page
- = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
local_set(&cpu_buffer->head_page->write, 0);
local_set(&cpu_buffer->head_page->entries, 0);
local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2726,16 +3494,20 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->read = 0;
- cpu_buffer->nmi_dropped = 0;
- cpu_buffer->commit_overrun = 0;
- cpu_buffer->overrun = 0;
- cpu_buffer->read = 0;
+ local_set(&cpu_buffer->commit_overrun, 0);
+ local_set(&cpu_buffer->overrun, 0);
local_set(&cpu_buffer->entries, 0);
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
+ cpu_buffer->read = 0;
cpu_buffer->write_stamp = 0;
cpu_buffer->read_stamp = 0;
+
+ cpu_buffer->lost_events = 0;
+ cpu_buffer->last_overrun = 0;
+
+ rb_head_page_activate(cpu_buffer);
}
/**
@@ -2755,12 +3527,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- __raw_spin_lock(&cpu_buffer->lock);
+ if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+ goto out;
+
+ arch_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
- __raw_spin_unlock(&cpu_buffer->lock);
+ arch_spin_unlock(&cpu_buffer->lock);
+ out:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->record_disabled);
@@ -2843,6 +3619,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
* @buffer_a: One buffer to swap with
@@ -2897,20 +3674,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
atomic_inc(&cpu_buffer_a->record_disabled);
atomic_inc(&cpu_buffer_b->record_disabled);
+ ret = -EBUSY;
+ if (local_read(&cpu_buffer_a->committing))
+ goto out_dec;
+ if (local_read(&cpu_buffer_b->committing))
+ goto out_dec;
+
buffer_a->buffers[cpu] = cpu_buffer_b;
buffer_b->buffers[cpu] = cpu_buffer_a;
cpu_buffer_b->buffer = buffer_a;
cpu_buffer_a->buffer = buffer_b;
+ ret = 0;
+
+out_dec:
atomic_dec(&cpu_buffer_a->record_disabled);
atomic_dec(&cpu_buffer_b->record_disabled);
-
- ret = 0;
out:
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
+#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
/**
* ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -2997,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
struct ring_buffer_event *event;
struct buffer_data_page *bpage;
struct buffer_page *reader;
+ unsigned long missed_events;
unsigned long flags;
unsigned int commit;
unsigned int read;
@@ -3033,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
read = reader->read;
commit = rb_page_commit(reader);
+ /* Check if any events were dropped */
+ missed_events = cpu_buffer->lost_events;
+
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
@@ -3053,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
if (len > (commit - read))
len = (commit - read);
- size = rb_event_length(event);
+ /* Always keep the time extend and data together */
+ size = rb_event_ts_length(event);
if (len < size)
goto out_unlock;
@@ -3063,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
/* Need to copy one event at a time */
do {
+ /* We need the size of one event, because
+ * rb_advance_reader only advances by one event,
+ * whereas rb_event_ts_length may include the size of
+ * one or two events.
+ * We have already ensured there's enough space if this
+ * is a time extend. */
+ size = rb_event_length(event);
memcpy(bpage->data + pos, rpage->data + rpos, size);
len -= size;
@@ -3071,9 +3868,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
rpos = reader->read;
pos += size;
+ if (rpos >= commit)
+ break;
+
event = rb_reader_event(cpu_buffer);
- size = rb_event_length(event);
- } while (len > size);
+ /* Always keep the time extend and data together */
+ size = rb_event_ts_length(event);
+ } while (len >= size);
/* update bpage */
local_set(&bpage->commit, pos);
@@ -3083,7 +3884,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
read = 0;
} else {
/* update the entry counter */
- cpu_buffer->read += local_read(&reader->entries);
+ cpu_buffer->read += rb_page_entries(reader);
/* swap the pages */
rb_init_page(bpage);
@@ -3093,9 +3894,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
local_set(&reader->entries, 0);
reader->read = 0;
*data_page = bpage;
+
+ /*
+ * Use the real_end for the data size,
+ * This gives us a chance to store the lost events
+ * on the page.
+ */
+ if (reader->real_end)
+ local_set(&bpage->commit, reader->real_end);
}
ret = read;
+ cpu_buffer->lost_events = 0;
+
+ commit = local_read(&bpage->commit);
+ /*
+ * Set a flag in the commit field if we lost events
+ */
+ if (missed_events) {
+ /* If there is room at the end of the page to save the
+ * missed events, then record it there.
+ */
+ if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ memcpy(&bpage->data[commit], &missed_events,
+ sizeof(missed_events));
+ local_add(RB_MISSED_STORED, &bpage->commit);
+ commit += sizeof(missed_events);
+ }
+ local_add(RB_MISSED_EVENTS, &bpage->commit);
+ }
+
+ /*
+ * This page may be off to user land. Zero it out here.
+ */
+ if (commit < BUF_PAGE_SIZE)
+ memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+
out_unlock:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -3104,6 +3938,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+#ifdef CONFIG_TRACING
static ssize_t
rb_simple_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -3155,6 +3990,7 @@ static const struct file_operations rb_simple_fops = {
.open = tracing_open_generic,
.read = rb_simple_read,
.write = rb_simple_write,
+ .llseek = default_llseek,
};
@@ -3171,6 +4007,7 @@ static __init int rb_init_debugfs(void)
}
fs_initcall(rb_init_debugfs);
+#endif
#ifdef CONFIG_HOTPLUG_CPU
static int rb_cpu_notify(struct notifier_block *self,
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c..302f8a61463 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/time.h>
+#include <asm/local.h>
struct rb_page {
u64 ts;
@@ -35,6 +36,28 @@ static int disable_reader;
module_param(disable_reader, uint, 0644);
MODULE_PARM_DESC(disable_reader, "only run producer");
+static int write_iteration = 50;
+module_param(write_iteration, uint, 0644);
+MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
+
+static int producer_nice = 19;
+static int consumer_nice = 19;
+
+static int producer_fifo = -1;
+static int consumer_fifo = -1;
+
+module_param(producer_nice, uint, 0644);
+MODULE_PARM_DESC(producer_nice, "nice prio for producer");
+
+module_param(consumer_nice, uint, 0644);
+MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
+
+module_param(producer_fifo, uint, 0644);
+MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+
+module_param(consumer_fifo, uint, 0644);
+MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
+
static int read_events;
static int kill_test;
@@ -58,7 +81,7 @@ static enum event_status read_event(int cpu)
int *entry;
u64 ts;
- event = ring_buffer_consume(buffer, cpu, &ts);
+ event = ring_buffer_consume(buffer, cpu, &ts, NULL);
if (!event)
return EVENT_DROPPED;
@@ -90,7 +113,8 @@ static enum event_status read_page(int cpu)
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
if (ret >= 0) {
rpage = bpage;
- commit = local_read(&rpage->commit);
+ /* The commit may have missed event flags set, clear them */
+ commit = local_read(&rpage->commit) & 0xfffff;
for (i = 0; i < commit && !kill_test; i += inc) {
if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
@@ -208,15 +232,18 @@ static void ring_buffer_producer(void)
do {
struct ring_buffer_event *event;
int *entry;
-
- event = ring_buffer_lock_reserve(buffer, 10);
- if (!event) {
- missed++;
- } else {
- hit++;
- entry = ring_buffer_event_data(event);
- *entry = smp_processor_id();
- ring_buffer_unlock_commit(buffer, event);
+ int i;
+
+ for (i = 0; i < write_iteration; i++) {
+ event = ring_buffer_lock_reserve(buffer, 10);
+ if (!event) {
+ missed++;
+ } else {
+ hit++;
+ entry = ring_buffer_event_data(event);
+ *entry = smp_processor_id();
+ ring_buffer_unlock_commit(buffer, event);
+ }
}
do_gettimeofday(&end_tv);
@@ -263,6 +290,27 @@ static void ring_buffer_producer(void)
if (kill_test)
trace_printk("ERROR!\n");
+
+ if (!disable_reader) {
+ if (consumer_fifo < 0)
+ trace_printk("Running Consumer at nice: %d\n",
+ consumer_nice);
+ else
+ trace_printk("Running Consumer at SCHED_FIFO %d\n",
+ consumer_fifo);
+ }
+ if (producer_fifo < 0)
+ trace_printk("Running Producer at nice: %d\n",
+ producer_nice);
+ else
+ trace_printk("Running Producer at SCHED_FIFO %d\n",
+ producer_fifo);
+
+ /* Let the user know that the test is running at low priority */
+ if (producer_fifo < 0 && consumer_fifo < 0 &&
+ producer_nice == 19 && consumer_nice == 19)
+ trace_printk("WARNING!!! This test is running at lowest priority.\n");
+
trace_printk("Time: %lld (usecs)\n", time);
trace_printk("Overruns: %lld\n", overruns);
if (disable_reader)
@@ -392,6 +440,27 @@ static int __init ring_buffer_benchmark_init(void)
if (IS_ERR(producer))
goto out_kill;
+ /*
+ * Run them as low-prio background tasks by default:
+ */
+ if (!disable_reader) {
+ if (consumer_fifo >= 0) {
+ struct sched_param param = {
+ .sched_priority = consumer_fifo
+ };
+ sched_setscheduler(consumer, SCHED_FIFO, &param);
+ } else
+ set_user_nice(consumer, consumer_nice);
+ }
+
+ if (producer_fifo >= 0) {
+ struct sched_param param = {
+ .sched_priority = consumer_fifo
+ };
+ sched_setscheduler(producer, SCHED_FIFO, &param);
+ } else
+ set_user_nice(producer, producer_nice);
+
return 0;
out_kill:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 076fa6f0ee4..dc53ecb8058 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -12,7 +12,7 @@
* Copyright (C) 2004 William Lee Irwin III
*/
#include <linux/ring_buffer.h>
-#include <linux/utsrelease.h>
+#include <generated/utsrelease.h>
#include <linux/stacktrace.h>
#include <linux/writeback.h>
#include <linux/kallsyms.h>
@@ -31,10 +31,11 @@
#include <linux/splice.h>
#include <linux/kdebug.h>
#include <linux/string.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/poll.h>
-#include <linux/gfp.h>
#include <linux/fs.h>
#include "trace.h"
@@ -42,14 +43,11 @@
#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
-unsigned long __read_mostly tracing_max_latency;
-unsigned long __read_mostly tracing_thresh;
-
/*
* On boot up, the ring buffer is set to the minimum size, so that
* we do not waste memory on systems that are not using tracing.
*/
-static int ring_buffer_expanded;
+int ring_buffer_expanded;
/*
* We need to change this state when a selftest is running.
@@ -63,7 +61,7 @@ static bool __read_mostly tracing_selftest_running;
/*
* If a tracer is running, we do not want to run SELFTEST.
*/
-static bool __read_mostly tracing_selftest_disabled;
+bool __read_mostly tracing_selftest_disabled;
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
@@ -88,27 +86,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
*/
static int tracing_disabled = 1;
-static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+DEFINE_PER_CPU(int, ftrace_cpu_disabled);
static inline void ftrace_disable_cpu(void)
{
preempt_disable();
- local_inc(&__get_cpu_var(ftrace_cpu_disabled));
+ __this_cpu_inc(ftrace_cpu_disabled);
}
static inline void ftrace_enable_cpu(void)
{
- local_dec(&__get_cpu_var(ftrace_cpu_disabled));
+ __this_cpu_dec(ftrace_cpu_disabled);
preempt_enable();
}
-static cpumask_var_t __read_mostly tracing_buffer_mask;
-
-/* Define which cpu buffers are currently read in trace_pipe */
-static cpumask_var_t tracing_reader_cpumask;
-
-#define for_each_tracing_cpu(cpu) \
- for_each_cpu(cpu, tracing_buffer_mask)
+cpumask_var_t __read_mostly tracing_buffer_mask;
/*
* ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -121,30 +113,42 @@ static cpumask_var_t tracing_reader_cpumask;
*
* It is default off, but you can enable it with either specifying
* "ftrace_dump_on_oops" in the kernel command line, or setting
- * /proc/sys/kernel/ftrace_dump_on_oops to true.
+ * /proc/sys/kernel/ftrace_dump_on_oops
+ * Set 1 if you want to dump buffers of all CPUs
+ * Set 2 if you want to dump the buffer of the CPU that triggered oops
*/
-int ftrace_dump_on_oops;
+
+enum ftrace_dump_mode ftrace_dump_on_oops;
static int tracing_set_tracer(const char *buf);
-#define BOOTUP_TRACER_SIZE 100
-static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
+#define MAX_TRACER_SIZE 100
+static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
-static int __init set_ftrace(char *str)
+static int __init set_cmdline_ftrace(char *str)
{
- strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
+ strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
ring_buffer_expanded = 1;
return 1;
}
-__setup("ftrace=", set_ftrace);
+__setup("ftrace=", set_cmdline_ftrace);
static int __init set_ftrace_dump_on_oops(char *str)
{
- ftrace_dump_on_oops = 1;
- return 1;
+ if (*str++ != '=' || !*str) {
+ ftrace_dump_on_oops = DUMP_ALL;
+ return 1;
+ }
+
+ if (!strcmp("orig_cpu", str)) {
+ ftrace_dump_on_oops = DUMP_ORIG;
+ return 1;
+ }
+
+ return 0;
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
@@ -171,10 +175,11 @@ static struct trace_array global_trace;
static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
-int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
+int filter_current_check_discard(struct ring_buffer *buffer,
+ struct ftrace_event_call *call, void *rec,
struct ring_buffer_event *event)
{
- return filter_check_discard(call, rec, global_trace.buffer, event);
+ return filter_check_discard(call, rec, buffer, event);
}
EXPORT_SYMBOL_GPL(filter_current_check_discard);
@@ -204,7 +209,7 @@ cycle_t ftrace_now(int cpu)
*/
static struct trace_array max_tr;
-static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
/* tracer_enabled is used to toggle activation of a tracer */
static int tracer_enabled = 1;
@@ -243,19 +248,91 @@ static struct tracer *trace_types __read_mostly;
static struct tracer *current_trace __read_mostly;
/*
- * max_tracer_type_len is used to simplify the allocating of
- * buffers to read userspace tracer names. We keep track of
- * the longest tracer name registered.
+ * trace_types_lock is used to protect the trace_types list.
*/
-static int max_tracer_type_len;
+static DEFINE_MUTEX(trace_types_lock);
/*
- * trace_types_lock is used to protect the trace_types list.
- * This lock is also used to keep user access serialized.
- * Accesses from userspace will grab this lock while userspace
- * activities happen inside the kernel.
+ * serialize the access of the ring buffer
+ *
+ * ring buffer serializes readers, but it is low level protection.
+ * The validity of the events (which returns by ring_buffer_peek() ..etc)
+ * are not protected by ring buffer.
+ *
+ * The content of events may become garbage if we allow other process consumes
+ * these events concurrently:
+ * A) the page of the consumed events may become a normal page
+ * (not reader page) in ring buffer, and this page will be rewrited
+ * by events producer.
+ * B) The page of the consumed events may become a page for splice_read,
+ * and this page will be returned to system.
+ *
+ * These primitives allow multi process access to different cpu ring buffer
+ * concurrently.
+ *
+ * These primitives don't distinguish read-only and read-consume access.
+ * Multi read-only access are also serialized.
*/
-static DEFINE_MUTEX(trace_types_lock);
+
+#ifdef CONFIG_SMP
+static DECLARE_RWSEM(all_cpu_access_lock);
+static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
+
+static inline void trace_access_lock(int cpu)
+{
+ if (cpu == TRACE_PIPE_ALL_CPU) {
+ /* gain it for accessing the whole ring buffer. */
+ down_write(&all_cpu_access_lock);
+ } else {
+ /* gain it for accessing a cpu ring buffer. */
+
+ /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
+ down_read(&all_cpu_access_lock);
+
+ /* Secondly block other access to this @cpu ring buffer. */
+ mutex_lock(&per_cpu(cpu_access_lock, cpu));
+ }
+}
+
+static inline void trace_access_unlock(int cpu)
+{
+ if (cpu == TRACE_PIPE_ALL_CPU) {
+ up_write(&all_cpu_access_lock);
+ } else {
+ mutex_unlock(&per_cpu(cpu_access_lock, cpu));
+ up_read(&all_cpu_access_lock);
+ }
+}
+
+static inline void trace_access_lock_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu(cpu_access_lock, cpu));
+}
+
+#else
+
+static DEFINE_MUTEX(access_lock);
+
+static inline void trace_access_lock(int cpu)
+{
+ (void)cpu;
+ mutex_lock(&access_lock);
+}
+
+static inline void trace_access_unlock(int cpu)
+{
+ (void)cpu;
+ mutex_unlock(&access_lock);
+}
+
+static inline void trace_access_lock_init(void)
+{
+}
+
+#endif
/* trace_wait is a waitqueue for tasks blocked on trace_poll */
static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
@@ -263,7 +340,10 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
- TRACE_ITER_GRAPH_TIME;
+ TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
+
+static int trace_stop_count;
+static DEFINE_SPINLOCK(tracing_start_lock);
/**
* trace_wake_up - wake up tasks waiting for trace input
@@ -273,30 +353,50 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
*/
void trace_wake_up(void)
{
+ int cpu;
+
+ if (trace_flags & TRACE_ITER_BLOCK)
+ return;
/*
* The runqueue_is_locked() can fail, but this is the best we
* have for now:
*/
- if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
+ cpu = get_cpu();
+ if (!runqueue_is_locked(cpu))
wake_up(&trace_wait);
+ put_cpu();
}
static int __init set_buf_size(char *str)
{
unsigned long buf_size;
- int ret;
if (!str)
return 0;
- ret = strict_strtoul(str, 0, &buf_size);
+ buf_size = memparse(str, &str);
/* nr_entries can not be zero */
- if (ret < 0 || buf_size == 0)
+ if (buf_size == 0)
return 0;
trace_buf_size = buf_size;
return 1;
}
__setup("trace_buf_size=", set_buf_size);
+static int __init set_tracing_thresh(char *str)
+{
+ unsigned long threshhold;
+ int ret;
+
+ if (!str)
+ return 0;
+ ret = strict_strtoul(str, 0, &threshhold);
+ if (ret < 0)
+ return 0;
+ tracing_thresh = threshhold * 1000;
+ return 1;
+}
+__setup("tracing_thresh=", set_tracing_thresh);
+
unsigned long nsecs_to_usecs(unsigned long nsecs)
{
return nsecs / 1000;
@@ -313,7 +413,6 @@ static const char *trace_options[] = {
"bin",
"block",
"stacktrace",
- "sched-tree",
"trace_printk",
"ftrace_preempt",
"branch",
@@ -323,49 +422,126 @@ static const char *trace_options[] = {
"printk-msg-only",
"context-info",
"latency-format",
- "global-clock",
"sleep-time",
"graph-time",
+ "record-cmd",
NULL
};
+static struct {
+ u64 (*func)(void);
+ const char *name;
+} trace_clocks[] = {
+ { trace_clock_local, "local" },
+ { trace_clock_global, "global" },
+};
+
+int trace_clock_id;
+
/*
- * ftrace_max_lock is used to protect the swapping of buffers
- * when taking a max snapshot. The buffers themselves are
- * protected by per_cpu spinlocks. But the action of the swap
- * needs its own lock.
- *
- * This is defined as a raw_spinlock_t in order to help
- * with performance when lockdep debugging is enabled.
+ * trace_parser_get_init - gets the buffer for trace parser
*/
-static raw_spinlock_t ftrace_max_lock =
- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+int trace_parser_get_init(struct trace_parser *parser, int size)
+{
+ memset(parser, 0, sizeof(*parser));
+
+ parser->buffer = kmalloc(size, GFP_KERNEL);
+ if (!parser->buffer)
+ return 1;
+
+ parser->size = size;
+ return 0;
+}
/*
- * Copy the new maximum trace into the separate maximum-trace
- * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ * trace_parser_put - frees the buffer for trace parser
*/
-static void
-__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+void trace_parser_put(struct trace_parser *parser)
{
- struct trace_array_cpu *data = tr->data[cpu];
+ kfree(parser->buffer);
+}
- max_tr.cpu = cpu;
- max_tr.time_start = data->preempt_timestamp;
+/*
+ * trace_get_user - reads the user input string separated by space
+ * (matched by isspace(ch))
+ *
+ * For each string found the 'struct trace_parser' is updated,
+ * and the function returns.
+ *
+ * Returns number of bytes read.
+ *
+ * See kernel/trace/trace.h for 'struct trace_parser' details.
+ */
+int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char ch;
+ size_t read = 0;
+ ssize_t ret;
+
+ if (!*ppos)
+ trace_parser_clear(parser);
- data = max_tr.data[cpu];
- data->saved_latency = tracing_max_latency;
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
- memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
- data->pid = tsk->pid;
- data->uid = task_uid(tsk);
- data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
- data->policy = tsk->policy;
- data->rt_priority = tsk->rt_priority;
+ read++;
+ cnt--;
- /* record this tasks comm */
- tracing_record_cmdline(tsk);
+ /*
+ * The parser is not finished with the last write,
+ * continue reading the user input without skipping spaces.
+ */
+ if (!parser->cont) {
+ /* skip white space */
+ while (cnt && isspace(ch)) {
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
+ read++;
+ cnt--;
+ }
+
+ /* only spaces were written */
+ if (isspace(ch)) {
+ *ppos += read;
+ ret = read;
+ goto out;
+ }
+
+ parser->idx = 0;
+ }
+
+ /* read the non-space input */
+ while (cnt && !isspace(ch)) {
+ if (parser->idx < parser->size - 1)
+ parser->buffer[parser->idx++] = ch;
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
+ read++;
+ cnt--;
+ }
+
+ /* We either got finished input or we have to wait for another call. */
+ if (isspace(ch)) {
+ parser->buffer[parser->idx] = 0;
+ parser->cont = false;
+ } else {
+ parser->cont = true;
+ parser->buffer[parser->idx++] = ch;
+ }
+
+ *ppos += read;
+ ret = read;
+
+out:
+ return ret;
}
ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
@@ -411,6 +587,57 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
return cnt;
}
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a arch_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ *
+ * It is also used in other places outside the update_max_tr
+ * so it needs to be defined outside of the
+ * CONFIG_TRACER_MAX_TRACE.
+ */
+static arch_spinlock_t ftrace_max_lock =
+ (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+unsigned long __read_mostly tracing_thresh;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+unsigned long __read_mostly tracing_max_latency;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ struct trace_array_cpu *data = tr->data[cpu];
+ struct trace_array_cpu *max_data;
+
+ max_tr.cpu = cpu;
+ max_tr.time_start = data->preempt_timestamp;
+
+ max_data = max_tr.data[cpu];
+ max_data->saved_latency = tracing_max_latency;
+ max_data->critical_start = data->critical_start;
+ max_data->critical_end = data->critical_end;
+
+ memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
+ max_data->pid = tsk->pid;
+ max_data->uid = task_uid(tsk);
+ max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+ max_data->policy = tsk->policy;
+ max_data->rt_priority = tsk->rt_priority;
+
+ /* record this tasks comm */
+ tracing_record_cmdline(tsk);
+}
+
/**
* update_max_tr - snapshot all trace buffers from global_trace to max_tr
* @tr: tracer
@@ -425,18 +652,21 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
struct ring_buffer *buf = tr->buffer;
+ if (trace_stop_count)
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
- __raw_spin_lock(&ftrace_max_lock);
+ if (!current_trace->use_max_tr) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ arch_spin_lock(&ftrace_max_lock);
tr->buffer = max_tr.buffer;
max_tr.buffer = buf;
- ftrace_disable_cpu();
- ring_buffer_reset(tr->buffer);
- ftrace_enable_cpu();
-
__update_max_tr(tr, tsk, cpu);
- __raw_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&ftrace_max_lock);
}
/**
@@ -452,21 +682,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
int ret;
+ if (trace_stop_count)
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
- __raw_spin_lock(&ftrace_max_lock);
+ if (!current_trace->use_max_tr) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ arch_spin_lock(&ftrace_max_lock);
ftrace_disable_cpu();
- ring_buffer_reset(max_tr.buffer);
ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
+ if (ret == -EBUSY) {
+ /*
+ * We failed to swap the buffer due to a commit taking
+ * place on this CPU. We fail to record, but we reset
+ * the max trace buffer (no one writes directly to it)
+ * and flag that it failed.
+ */
+ trace_array_printk(&max_tr, _THIS_IP_,
+ "Failed to swap buffers due to commit in progress\n");
+ }
+
ftrace_enable_cpu();
- WARN_ON_ONCE(ret && ret != -EAGAIN);
+ WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
__update_max_tr(tr, tsk, cpu);
- __raw_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&ftrace_max_lock);
}
+#endif /* CONFIG_TRACER_MAX_TRACE */
/**
* register_tracer - register a tracer with the ftrace system.
@@ -479,7 +728,6 @@ __releases(kernel_lock)
__acquires(kernel_lock)
{
struct tracer *t;
- int len;
int ret = 0;
if (!type->name) {
@@ -487,13 +735,11 @@ __acquires(kernel_lock)
return -1;
}
- /*
- * When this gets called we hold the BKL which means that
- * preemption is disabled. Various trace selftests however
- * need to disable and enable preemption for successful tests.
- * So we drop the BKL here and grab it after the tests again.
- */
- unlock_kernel();
+ if (strlen(type->name) >= MAX_TRACER_SIZE) {
+ pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
+ return -1;
+ }
+
mutex_lock(&trace_types_lock);
tracing_selftest_running = true;
@@ -501,7 +747,7 @@ __acquires(kernel_lock)
for (t = trace_types; t; t = t->next) {
if (strcmp(type->name, t->name) == 0) {
/* already found */
- pr_info("Trace %s already registered\n",
+ pr_info("Tracer %s already registered\n",
type->name);
ret = -1;
goto out;
@@ -523,7 +769,6 @@ __acquires(kernel_lock)
if (type->selftest && !tracing_selftest_disabled) {
struct tracer *saved_tracer = current_trace;
struct trace_array *tr = &global_trace;
- int i;
/*
* Run a selftest on this tracer.
@@ -532,8 +777,7 @@ __acquires(kernel_lock)
* internal tracing to verify that everything is in order.
* If we fail, we do not register this tracer.
*/
- for_each_tracing_cpu(i)
- tracing_reset(tr, i);
+ tracing_reset_online_cpus(tr);
current_trace = type;
/* the test is responsible for initializing and enabling */
@@ -546,8 +790,7 @@ __acquires(kernel_lock)
goto out;
}
/* Only reset on passing, to avoid touching corrupted buffers */
- for_each_tracing_cpu(i)
- tracing_reset(tr, i);
+ tracing_reset_online_cpus(tr);
printk(KERN_CONT "PASSED\n");
}
@@ -555,9 +798,6 @@ __acquires(kernel_lock)
type->next = trace_types;
trace_types = type;
- len = strlen(type->name);
- if (len > max_tracer_type_len)
- max_tracer_type_len = len;
out:
tracing_selftest_running = false;
@@ -566,7 +806,7 @@ __acquires(kernel_lock)
if (ret || !default_bootup_tracer)
goto out_unlock;
- if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
+ if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
goto out_unlock;
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -581,21 +821,19 @@ __acquires(kernel_lock)
#endif
out_unlock:
- lock_kernel();
return ret;
}
void unregister_tracer(struct tracer *type)
{
struct tracer **t;
- int len;
mutex_lock(&trace_types_lock);
for (t = &trace_types; *t; t = &(*t)->next) {
if (*t == type)
goto found;
}
- pr_info("Trace %s not registered\n", type->name);
+ pr_info("Tracer %s not registered\n", type->name);
goto out;
found:
@@ -608,35 +846,46 @@ void unregister_tracer(struct tracer *type)
current_trace->stop(&global_trace);
current_trace = &nop_trace;
}
-
- if (strlen(type->name) != max_tracer_type_len)
- goto out;
-
- max_tracer_type_len = 0;
- for (t = &trace_types; *t; t = &(*t)->next) {
- len = strlen((*t)->name);
- if (len > max_tracer_type_len)
- max_tracer_type_len = len;
- }
- out:
+out:
mutex_unlock(&trace_types_lock);
}
-void tracing_reset(struct trace_array *tr, int cpu)
+static void __tracing_reset(struct ring_buffer *buffer, int cpu)
{
ftrace_disable_cpu();
- ring_buffer_reset_cpu(tr->buffer, cpu);
+ ring_buffer_reset_cpu(buffer, cpu);
ftrace_enable_cpu();
}
+void tracing_reset(struct trace_array *tr, int cpu)
+{
+ struct ring_buffer *buffer = tr->buffer;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_sched();
+ __tracing_reset(buffer, cpu);
+
+ ring_buffer_record_enable(buffer);
+}
+
void tracing_reset_online_cpus(struct trace_array *tr)
{
+ struct ring_buffer *buffer = tr->buffer;
int cpu;
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_sched();
+
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
- tracing_reset(tr, cpu);
+ __tracing_reset(buffer, cpu);
+
+ ring_buffer_record_enable(buffer);
}
void tracing_reset_current(int cpu)
@@ -655,7 +904,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
static int cmdline_idx;
-static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
/* temporary disable recording */
static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -667,8 +916,10 @@ static void trace_init_cmdlines(void)
cmdline_idx = 0;
}
-static int trace_stop_count;
-static DEFINE_SPINLOCK(tracing_start_lock);
+int is_tracing_stopped(void)
+{
+ return trace_stop_count;
+}
/**
* ftrace_off_permanent - disable all ftrace code permanently
@@ -709,6 +960,8 @@ void tracing_start(void)
goto out;
}
+ /* Prevent the buffers from switching */
+ arch_spin_lock(&ftrace_max_lock);
buffer = global_trace.buffer;
if (buffer)
@@ -718,6 +971,8 @@ void tracing_start(void)
if (buffer)
ring_buffer_record_enable(buffer);
+ arch_spin_unlock(&ftrace_max_lock);
+
ftrace_start();
out:
spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -739,6 +994,9 @@ void tracing_stop(void)
if (trace_stop_count++)
goto out;
+ /* Prevent the buffers from switching */
+ arch_spin_lock(&ftrace_max_lock);
+
buffer = global_trace.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
@@ -747,6 +1005,8 @@ void tracing_stop(void)
if (buffer)
ring_buffer_record_disable(buffer);
+ arch_spin_unlock(&ftrace_max_lock);
+
out:
spin_unlock_irqrestore(&tracing_start_lock, flags);
}
@@ -766,7 +1026,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
*/
- if (!__raw_spin_trylock(&trace_cmdline_lock))
+ if (!arch_spin_trylock(&trace_cmdline_lock))
return;
idx = map_pid_to_cmdline[tsk->pid];
@@ -791,7 +1051,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
- __raw_spin_unlock(&trace_cmdline_lock);
+ arch_spin_unlock(&trace_cmdline_lock);
}
void trace_find_cmdline(int pid, char comm[])
@@ -803,20 +1063,25 @@ void trace_find_cmdline(int pid, char comm[])
return;
}
+ if (WARN_ON_ONCE(pid < 0)) {
+ strcpy(comm, "<XXX>");
+ return;
+ }
+
if (pid > PID_MAX_DEFAULT) {
strcpy(comm, "<...>");
return;
}
preempt_disable();
- __raw_spin_lock(&trace_cmdline_lock);
+ arch_spin_lock(&trace_cmdline_lock);
map = map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
strcpy(comm, saved_cmdlines[map]);
else
strcpy(comm, "<...>");
- __raw_spin_unlock(&trace_cmdline_lock);
+ arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
}
@@ -837,7 +1102,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->tgid = (tsk) ? tsk->tgid : 0;
+ entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -848,15 +1113,17 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
}
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
-struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
- int type,
- unsigned long len,
- unsigned long flags, int pc)
+struct ring_buffer_event *
+trace_buffer_lock_reserve(struct ring_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned long flags, int pc)
{
struct ring_buffer_event *event;
- event = ring_buffer_lock_reserve(tr->buffer, len);
+ event = ring_buffer_lock_reserve(buffer, len);
if (event != NULL) {
struct trace_entry *ent = ring_buffer_event_data(event);
@@ -866,58 +1133,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
return event;
}
-static void ftrace_trace_stack(struct trace_array *tr,
- unsigned long flags, int skip, int pc);
-static void ftrace_trace_userstack(struct trace_array *tr,
- unsigned long flags, int pc);
-static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
- struct ring_buffer_event *event,
- unsigned long flags, int pc,
- int wake)
+static inline void
+__trace_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc,
+ int wake)
{
- ring_buffer_unlock_commit(tr->buffer, event);
+ ring_buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, flags, 6, pc);
- ftrace_trace_userstack(tr, flags, pc);
+ ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_userstack(buffer, flags, pc);
if (wake)
trace_wake_up();
}
-void trace_buffer_unlock_commit(struct trace_array *tr,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
+void trace_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
{
- __trace_buffer_unlock_commit(tr, event, flags, pc, 1);
+ __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
}
struct ring_buffer_event *
-trace_current_buffer_lock_reserve(int type, unsigned long len,
+trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
+ int type, unsigned long len,
unsigned long flags, int pc)
{
- return trace_buffer_lock_reserve(&global_trace,
+ *current_rb = global_trace.buffer;
+ return trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
unsigned long flags, int pc)
{
- __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+ __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
}
EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
- unsigned long flags, int pc)
+void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
{
- __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+ __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
}
EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
-void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
+void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
{
- ring_buffer_discard_commit(global_trace.buffer, event);
+ ring_buffer_discard_commit(buffer, event);
}
EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
@@ -927,14 +1196,15 @@ trace_function(struct trace_array *tr,
int pc)
{
struct ftrace_event_call *call = &event_function;
+ struct ring_buffer *buffer = tr->buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
/* If we are reading the ring buffer, don't trace */
- if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
return;
- event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry),
+ event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
flags, pc);
if (!event)
return;
@@ -942,57 +1212,9 @@ trace_function(struct trace_array *tr,
entry->ip = ip;
entry->parent_ip = parent_ip;
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-}
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int __trace_graph_entry(struct trace_array *tr,
- struct ftrace_graph_ent *trace,
- unsigned long flags,
- int pc)
-{
- struct ftrace_event_call *call = &event_funcgraph_entry;
- struct ring_buffer_event *event;
- struct ftrace_graph_ent_entry *entry;
-
- if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
- return 0;
-
- event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
- sizeof(*entry), flags, pc);
- if (!event)
- return 0;
- entry = ring_buffer_event_data(event);
- entry->graph_ent = *trace;
- if (!filter_current_check_discard(call, entry, event))
- ring_buffer_unlock_commit(global_trace.buffer, event);
-
- return 1;
-}
-
-static void __trace_graph_return(struct trace_array *tr,
- struct ftrace_graph_ret *trace,
- unsigned long flags,
- int pc)
-{
- struct ftrace_event_call *call = &event_funcgraph_exit;
- struct ring_buffer_event *event;
- struct ftrace_graph_ret_entry *entry;
-
- if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
- return;
-
- event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->ret = *trace;
- if (!filter_current_check_discard(call, entry, event))
- ring_buffer_unlock_commit(global_trace.buffer, event);
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
}
-#endif
void
ftrace(struct trace_array *tr, struct trace_array_cpu *data,
@@ -1003,17 +1225,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
trace_function(tr, ip, parent_ip, flags, pc);
}
-static void __ftrace_trace_stack(struct trace_array *tr,
+#ifdef CONFIG_STACKTRACE
+static void __ftrace_trace_stack(struct ring_buffer *buffer,
unsigned long flags,
int skip, int pc)
{
-#ifdef CONFIG_STACKTRACE
struct ftrace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
struct stack_entry *entry;
struct stack_trace trace;
- event = trace_buffer_lock_reserve(tr, TRACE_STACK,
+ event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
sizeof(*entry), flags, pc);
if (!event)
return;
@@ -1026,32 +1248,46 @@ static void __ftrace_trace_stack(struct trace_array *tr,
trace.entries = entry->caller;
save_stack_trace(&trace);
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-#endif
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
}
-static void ftrace_trace_stack(struct trace_array *tr,
- unsigned long flags,
- int skip, int pc)
+void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc)
{
if (!(trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(tr, flags, skip, pc);
+ __ftrace_trace_stack(buffer, flags, skip, pc);
}
-void __trace_stack(struct trace_array *tr,
- unsigned long flags,
- int skip, int pc)
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc)
{
- __ftrace_trace_stack(tr, flags, skip, pc);
+ __ftrace_trace_stack(tr->buffer, flags, skip, pc);
}
-static void ftrace_trace_userstack(struct trace_array *tr,
- unsigned long flags, int pc)
+/**
+ * trace_dump_stack - record a stack back trace in the trace buffer
+ */
+void trace_dump_stack(void)
+{
+ unsigned long flags;
+
+ if (tracing_disabled || tracing_selftest_running)
+ return;
+
+ local_save_flags(flags);
+
+ /* skipping 3 traces, seems to get us at the caller of this function */
+ __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
+}
+
+static DEFINE_PER_CPU(int, user_stack_count);
+
+void
+ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
{
-#ifdef CONFIG_STACKTRACE
struct ftrace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
@@ -1060,12 +1296,30 @@ static void ftrace_trace_userstack(struct trace_array *tr,
if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
- event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK,
+ /*
+ * NMIs can not handle page faults, even with fix ups.
+ * The save user stack can (and often does) fault.
+ */
+ if (unlikely(in_nmi()))
+ return;
+
+ /*
+ * prevent recursion, since the user stack tracing may
+ * trigger other kernel events.
+ */
+ preempt_disable();
+ if (__this_cpu_read(user_stack_count))
+ goto out;
+
+ __this_cpu_inc(user_stack_count);
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
sizeof(*entry), flags, pc);
if (!event)
- return;
+ goto out_drop_count;
entry = ring_buffer_event_data(event);
+ entry->tgid = current->tgid;
memset(&entry->caller, 0, sizeof(entry->caller));
trace.nr_entries = 0;
@@ -1074,9 +1328,13 @@ static void ftrace_trace_userstack(struct trace_array *tr,
trace.entries = entry->caller;
save_stack_trace_user(&trace);
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
-#endif
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
+
+ out_drop_count:
+ __this_cpu_dec(user_stack_count);
+ out:
+ preempt_enable();
}
#ifdef UNUSED
@@ -1086,174 +1344,7 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
}
#endif /* UNUSED */
-static void
-ftrace_trace_special(void *__tr,
- unsigned long arg1, unsigned long arg2, unsigned long arg3,
- int pc)
-{
- struct ring_buffer_event *event;
- struct trace_array *tr = __tr;
- struct special_entry *entry;
-
- event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL,
- sizeof(*entry), 0, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->arg1 = arg1;
- entry->arg2 = arg2;
- entry->arg3 = arg3;
- trace_buffer_unlock_commit(tr, event, 0, pc);
-}
-
-void
-__trace_special(void *__tr, void *__data,
- unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
- ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
-}
-
-void
-tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_context_switch;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
-
- event = trace_buffer_lock_reserve(tr, TRACE_CTX,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = prev->pid;
- entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
- entry->next_pid = next->pid;
- entry->next_prio = next->prio;
- entry->next_state = next->state;
- entry->next_cpu = task_cpu(next);
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- trace_buffer_unlock_commit(tr, event, flags, pc);
-}
-
-void
-tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *curr,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_wakeup;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
-
- event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = curr->pid;
- entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
- entry->next_pid = wakee->pid;
- entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
- entry->next_cpu = task_cpu(wakee);
-
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
- ftrace_trace_stack(tr, flags, 6, pc);
- ftrace_trace_userstack(tr, flags, pc);
-}
-
-void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu;
- int pc;
-
- if (tracing_disabled)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
-
- if (likely(atomic_inc_return(&data->disabled) == 1))
- ftrace_trace_special(tr, arg1, arg2, arg3, pc);
-
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-}
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-int trace_graph_entry(struct ftrace_graph_ent *trace)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int ret;
- int cpu;
- int pc;
-
- if (!ftrace_trace_task(current))
- return 0;
-
- if (!ftrace_graph_addr(trace->func))
- return 0;
-
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- pc = preempt_count();
- ret = __trace_graph_entry(tr, trace, flags, pc);
- } else {
- ret = 0;
- }
- /* Only do the atomic if it is not already set */
- if (!test_tsk_trace_graph(current))
- set_tsk_trace_graph(current);
-
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-
- return ret;
-}
-
-void trace_graph_return(struct ftrace_graph_ret *trace)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
-
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- pc = preempt_count();
- __trace_graph_return(tr, trace, flags, pc);
- }
- if (!trace->depth)
- clear_tsk_trace_graph(current);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-}
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
+#endif /* CONFIG_STACKTRACE */
/**
* trace_vbprintk - write binary msg to tracing buffer
@@ -1261,18 +1352,18 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
- static raw_spinlock_t trace_buf_lock =
- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ static arch_spinlock_t trace_buf_lock =
+ (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
static u32 trace_buf[TRACE_BUF_SIZE];
struct ftrace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
struct bprint_entry *entry;
unsigned long flags;
int disable;
- int resched;
int cpu, len = 0, size, pc;
if (unlikely(tracing_selftest_running || tracing_disabled))
@@ -1282,7 +1373,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
pause_graph_tracing();
pc = preempt_count();
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
data = tr->data[cpu];
@@ -1292,14 +1383,16 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
/* Lockdep uses trace_printk for lock tracing */
local_irq_save(flags);
- __raw_spin_lock(&trace_buf_lock);
+ arch_spin_lock(&trace_buf_lock);
len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
if (len > TRACE_BUF_SIZE || len < 0)
goto out_unlock;
size = sizeof(*entry) + sizeof(u32) * len;
- event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc);
+ buffer = tr->buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ flags, pc);
if (!event)
goto out_unlock;
entry = ring_buffer_event_data(event);
@@ -1307,30 +1400,48 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
entry->fmt = fmt;
memcpy(entry->buf, trace_buf, sizeof(u32) * len);
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, buffer, event)) {
+ ring_buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, flags, 6, pc);
+ }
out_unlock:
- __raw_spin_unlock(&trace_buf_lock);
+ arch_spin_unlock(&trace_buf_lock);
local_irq_restore(flags);
out:
atomic_dec_return(&data->disabled);
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
unpause_graph_tracing();
return len;
}
EXPORT_SYMBOL_GPL(trace_vbprintk);
-int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...)
{
- static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_array_vprintk(tr, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static char trace_buf[TRACE_BUF_SIZE];
struct ftrace_event_call *call = &event_print;
struct ring_buffer_event *event;
- struct trace_array *tr = &global_trace;
+ struct ring_buffer *buffer;
struct trace_array_cpu *data;
int cpu, len = 0, size, pc;
struct print_entry *entry;
@@ -1351,26 +1462,27 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
pause_graph_tracing();
raw_local_irq_save(irq_flags);
- __raw_spin_lock(&trace_buf_lock);
+ arch_spin_lock(&trace_buf_lock);
len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
- len = min(len, TRACE_BUF_SIZE-1);
- trace_buf[len] = 0;
-
size = sizeof(*entry) + len + 1;
- event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
+ buffer = tr->buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, pc);
if (!event)
goto out_unlock;
entry = ring_buffer_event_data(event);
- entry->ip = ip;
+ entry->ip = ip;
memcpy(&entry->buf, trace_buf, len);
- entry->buf[len] = 0;
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
+ entry->buf[len] = '\0';
+ if (!filter_check_discard(call, entry, buffer, event)) {
+ ring_buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 6, pc);
+ }
out_unlock:
- __raw_spin_unlock(&trace_buf_lock);
+ arch_spin_unlock(&trace_buf_lock);
raw_local_irq_restore(irq_flags);
unpause_graph_tracing();
out:
@@ -1379,12 +1491,12 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
return len;
}
-EXPORT_SYMBOL_GPL(trace_vprintk);
-enum trace_file_type {
- TRACE_FILE_LAT_FMT = 1,
- TRACE_FILE_ANNOTATE = 2,
-};
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+{
+ return trace_array_vprintk(&global_trace, ip, fmt, args);
+}
+EXPORT_SYMBOL_GPL(trace_vprintk);
static void trace_iterator_increment(struct trace_iterator *iter)
{
@@ -1399,7 +1511,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
}
static struct trace_entry *
-peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
+peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
+ unsigned long *lost_events)
{
struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1410,7 +1523,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
if (buf_iter)
event = ring_buffer_iter_peek(buf_iter, ts);
else
- event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+ event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
+ lost_events);
ftrace_enable_cpu();
@@ -1418,10 +1532,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
}
static struct trace_entry *
-__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
+ unsigned long *missing_events, u64 *ent_ts)
{
struct ring_buffer *buffer = iter->tr->buffer;
struct trace_entry *ent, *next = NULL;
+ unsigned long lost_events = 0, next_lost = 0;
int cpu_file = iter->cpu_file;
u64 next_ts = 0, ts;
int next_cpu = -1;
@@ -1434,7 +1550,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
if (cpu_file > TRACE_PIPE_ALL_CPU) {
if (ring_buffer_empty_cpu(buffer, cpu_file))
return NULL;
- ent = peek_next_entry(iter, cpu_file, ent_ts);
+ ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
if (ent_cpu)
*ent_cpu = cpu_file;
@@ -1446,7 +1562,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
if (ring_buffer_empty_cpu(buffer, cpu))
continue;
- ent = peek_next_entry(iter, cpu, &ts);
+ ent = peek_next_entry(iter, cpu, &ts, &lost_events);
/*
* Pick the entry with the smallest timestamp:
@@ -1455,6 +1571,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
next = ent;
next_cpu = cpu;
next_ts = ts;
+ next_lost = lost_events;
}
}
@@ -1464,6 +1581,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
if (ent_ts)
*ent_ts = next_ts;
+ if (missing_events)
+ *missing_events = next_lost;
+
return next;
}
@@ -1471,13 +1591,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts)
{
- return __find_next_entry(iter, ent_cpu, ent_ts);
+ return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
}
/* Find the next real entry, and increment the iterator to the next entry */
-static void *find_next_entry_inc(struct trace_iterator *iter)
+void *trace_find_next_entry_inc(struct trace_iterator *iter)
{
- iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
+ iter->ent = __find_next_entry(iter, &iter->cpu,
+ &iter->lost_events, &iter->ts);
if (iter->ent)
trace_iterator_increment(iter);
@@ -1489,7 +1610,8 @@ static void trace_consume(struct trace_iterator *iter)
{
/* Don't allow ftrace to trace into the ring buffers */
ftrace_disable_cpu();
- ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
+ &iter->lost_events);
ftrace_enable_cpu();
}
@@ -1499,6 +1621,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
int i = (int)*pos;
void *ent;
+ WARN_ON_ONCE(iter->leftover);
+
(*pos)++;
/* can't go backwards */
@@ -1506,25 +1630,50 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
return NULL;
if (iter->idx < 0)
- ent = find_next_entry_inc(iter);
+ ent = trace_find_next_entry_inc(iter);
else
ent = iter;
while (ent && iter->idx < i)
- ent = find_next_entry_inc(iter);
+ ent = trace_find_next_entry_inc(iter);
iter->pos = *pos;
return ent;
}
+void tracing_iter_reset(struct trace_iterator *iter, int cpu)
+{
+ struct trace_array *tr = iter->tr;
+ struct ring_buffer_event *event;
+ struct ring_buffer_iter *buf_iter;
+ unsigned long entries = 0;
+ u64 ts;
+
+ tr->data[cpu]->skipped_entries = 0;
+
+ if (!iter->buffer_iter[cpu])
+ return;
+
+ buf_iter = iter->buffer_iter[cpu];
+ ring_buffer_iter_reset(buf_iter);
+
+ /*
+ * We could have the case with the max latency tracers
+ * that a reset never took place on a cpu. This is evident
+ * by the timestamp being before the start of the buffer.
+ */
+ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
+ if (ts >= iter->tr->time_start)
+ break;
+ entries++;
+ ring_buffer_read(buf_iter, NULL);
+ }
+
+ tr->data[cpu]->skipped_entries = entries;
+}
+
/*
- * No necessary locking here. The worst thing which can
- * happen is loosing events consumed at the same time
- * by a trace_pipe reader.
- * Other than that, we don't risk to crash the ring buffer
- * because it serializes the readers.
- *
* The current tracer is copied to avoid a global locking
* all around.
*/
@@ -1556,28 +1705,40 @@ static void *s_start(struct seq_file *m, loff_t *pos)
if (cpu_file == TRACE_PIPE_ALL_CPU) {
for_each_tracing_cpu(cpu)
- ring_buffer_iter_reset(iter->buffer_iter[cpu]);
+ tracing_iter_reset(iter, cpu);
} else
- ring_buffer_iter_reset(iter->buffer_iter[cpu_file]);
-
+ tracing_iter_reset(iter, cpu_file);
ftrace_enable_cpu();
+ iter->leftover = 0;
for (p = iter; p && l < *pos; p = s_next(m, p, &l))
;
} else {
- l = *pos - 1;
- p = s_next(m, p, &l);
+ /*
+ * If we overflowed the seq_file before, then we want
+ * to just reuse the trace_seq buffer again.
+ */
+ if (iter->leftover)
+ p = iter;
+ else {
+ l = *pos - 1;
+ p = s_next(m, p, &l);
+ }
}
trace_event_read_lock();
+ trace_access_lock(cpu_file);
return p;
}
static void s_stop(struct seq_file *m, void *p)
{
+ struct trace_iterator *iter = m->private;
+
atomic_dec(&trace_record_cmdline_disabled);
+ trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
@@ -1588,10 +1749,10 @@ static void print_lat_help_header(struct seq_file *m)
seq_puts(m, "# | / _----=> need-resched \n");
seq_puts(m, "# || / _---=> hardirq/softirq \n");
seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| / \n");
- seq_puts(m, "# ||||| delay \n");
- seq_puts(m, "# cmd pid ||||| time | caller \n");
- seq_puts(m, "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# |||| /_--=> lock-depth \n");
+ seq_puts(m, "# |||||/ delay \n");
+ seq_puts(m, "# cmd pid |||||| time | caller \n");
+ seq_puts(m, "# \\ / |||||| \\ | / \n");
}
static void print_func_help_header(struct seq_file *m)
@@ -1601,23 +1762,39 @@ static void print_func_help_header(struct seq_file *m)
}
-static void
+void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
struct trace_array *tr = iter->tr;
struct trace_array_cpu *data = tr->data[tr->cpu];
struct tracer *type = current_trace;
- unsigned long total;
- unsigned long entries;
+ unsigned long entries = 0;
+ unsigned long total = 0;
+ unsigned long count;
const char *name = "preemption";
+ int cpu;
if (type)
name = type->name;
- entries = ring_buffer_entries(iter->tr->buffer);
- total = entries +
- ring_buffer_overruns(iter->tr->buffer);
+
+ for_each_tracing_cpu(cpu) {
+ count = ring_buffer_entries_cpu(tr->buffer, cpu);
+ /*
+ * If this buffer has skipped entries, then we hold all
+ * entries for the trace and we need to ignore the
+ * ones before the time stamp.
+ */
+ if (tr->data[cpu]->skipped_entries) {
+ count -= tr->data[cpu]->skipped_entries;
+ /* total is the same as the entries */
+ total += count;
+ } else
+ total += count +
+ ring_buffer_overrun_cpu(tr->buffer, cpu);
+ entries += count;
+ }
seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
name, UTS_RELEASE);
@@ -1659,7 +1836,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
seq_puts(m, "\n# => ended at: ");
seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
trace_print_seq(m, &iter->seq);
- seq_puts(m, "#\n");
+ seq_puts(m, "\n#\n");
}
seq_puts(m, "#\n");
@@ -1678,6 +1855,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
if (cpumask_test_cpu(iter->cpu, iter->started))
return;
+ if (iter->tr->data[iter->cpu]->skipped_entries)
+ return;
+
cpumask_set_cpu(iter->cpu, iter->started);
/* Don't print started cpu buffer for the first entry of the trace */
@@ -1710,7 +1890,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
}
if (event)
- return event->trace(iter, sym_flags);
+ return event->funcs->trace(iter, sym_flags, event);
if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
goto partial;
@@ -1736,7 +1916,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (event)
- return event->raw(iter, 0);
+ return event->funcs->raw(iter, 0, event);
if (!trace_seq_printf(s, "%d ?\n", entry->type))
goto partial;
@@ -1763,7 +1943,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (event) {
- enum print_line_t ret = event->hex(iter, 0);
+ enum print_line_t ret = event->funcs->hex(iter, 0, event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
@@ -1788,10 +1968,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
}
event = ftrace_find_event(entry->type);
- return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
+ return event ? event->funcs->binary(iter, 0, event) :
+ TRACE_TYPE_HANDLED;
}
-static int trace_empty(struct trace_iterator *iter)
+int trace_empty(struct trace_iterator *iter)
{
int cpu;
@@ -1822,10 +2003,14 @@ static int trace_empty(struct trace_iterator *iter)
}
/* Called with trace_event_read_lock() held. */
-static enum print_line_t print_trace_line(struct trace_iterator *iter)
+enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
+ if (iter->lost_events)
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events);
+
if (iter->trace && iter->trace->print_line) {
ret = iter->trace->print_line(iter);
if (ret != TRACE_TYPE_UNHANDLED)
@@ -1854,9 +2039,27 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
return print_trace_fmt(iter);
}
+void trace_default_header(struct seq_file *m)
+{
+ struct trace_iterator *iter = m->private;
+
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+ print_trace_header(m, iter);
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_lat_help_header(m);
+ } else {
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_func_help_header(m);
+ }
+}
+
static int s_show(struct seq_file *m, void *v)
{
struct trace_iterator *iter = v;
+ int ret;
if (iter->ent == NULL) {
if (iter->tr) {
@@ -1865,26 +2068,36 @@ static int s_show(struct seq_file *m, void *v)
}
if (iter->trace && iter->trace->print_header)
iter->trace->print_header(m);
- else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
- /* print nothing if the buffers are empty */
- if (trace_empty(iter))
- return 0;
- print_trace_header(m, iter);
- if (!(trace_flags & TRACE_ITER_VERBOSE))
- print_lat_help_header(m);
- } else {
- if (!(trace_flags & TRACE_ITER_VERBOSE))
- print_func_help_header(m);
- }
+ else
+ trace_default_header(m);
+
+ } else if (iter->leftover) {
+ /*
+ * If we filled the seq_file buffer earlier, we
+ * want to just show it now.
+ */
+ ret = trace_print_seq(m, &iter->seq);
+
+ /* ret should this time be zero, but you never know */
+ iter->leftover = ret;
+
} else {
print_trace_line(iter);
- trace_print_seq(m, &iter->seq);
+ ret = trace_print_seq(m, &iter->seq);
+ /*
+ * If we overflow the seq_file buffer, then it will
+ * ask us for this data again at start up.
+ * Use that instead.
+ * ret is 0 if seq_file write succeeded.
+ * -1 otherwise.
+ */
+ iter->leftover = ret;
}
return 0;
}
-static struct seq_operations tracer_seq_ops = {
+static const struct seq_operations tracer_seq_ops = {
.start = s_start,
.next = s_next,
.stop = s_stop,
@@ -1919,11 +2132,9 @@ __tracing_open(struct inode *inode, struct file *file)
if (current_trace)
*iter->trace = *current_trace;
- if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
- cpumask_clear(iter->started);
-
if (current_trace && current_trace->print_max)
iter->tr = &max_tr;
else
@@ -1940,19 +2151,28 @@ __tracing_open(struct inode *inode, struct file *file)
if (ring_buffer_overruns(iter->tr->buffer))
iter->iter_flags |= TRACE_FILE_ANNOTATE;
+ /* stop the trace while dumping */
+ tracing_stop();
+
if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
for_each_tracing_cpu(cpu) {
-
iter->buffer_iter[cpu] =
- ring_buffer_read_start(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare(iter->tr->buffer, cpu);
+ }
+ ring_buffer_read_prepare_sync();
+ for_each_tracing_cpu(cpu) {
+ ring_buffer_read_start(iter->buffer_iter[cpu]);
+ tracing_iter_reset(iter, cpu);
}
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_start(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare_sync();
+ ring_buffer_read_start(iter->buffer_iter[cpu]);
+ tracing_iter_reset(iter, cpu);
}
- /* TODO stop tracer */
ret = seq_open(file, &tracer_seq_ops);
if (ret < 0) {
fail_ret = ERR_PTR(ret);
@@ -1962,9 +2182,6 @@ __tracing_open(struct inode *inode, struct file *file)
m = file->private_data;
m->private = iter;
- /* stop the trace while dumping */
- tracing_stop();
-
mutex_unlock(&trace_types_lock);
return iter;
@@ -1975,6 +2192,7 @@ __tracing_open(struct inode *inode, struct file *file)
ring_buffer_read_finish(iter->buffer_iter[cpu]);
}
free_cpumask_var(iter->started);
+ tracing_start();
fail:
mutex_unlock(&trace_types_lock);
kfree(iter->trace);
@@ -1994,7 +2212,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
static int tracing_release(struct inode *inode, struct file *file)
{
- struct seq_file *m = (struct seq_file *)file->private_data;
+ struct seq_file *m = file->private_data;
struct trace_iterator *iter;
int cpu;
@@ -2031,7 +2249,7 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND)) {
+ (file->f_flags & O_TRUNC)) {
long cpu = (long) inode->i_private;
if (cpu == TRACE_PIPE_ALL_CPU)
@@ -2053,25 +2271,23 @@ static int tracing_open(struct inode *inode, struct file *file)
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct tracer *t = m->private;
+ struct tracer *t = v;
(*pos)++;
if (t)
t = t->next;
- m->private = t;
-
return t;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
- struct tracer *t = m->private;
+ struct tracer *t;
loff_t l = 0;
mutex_lock(&trace_types_lock);
- for (; t && l < *pos; t = t_next(m, t, &l))
+ for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
;
return t;
@@ -2098,7 +2314,7 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
-static struct seq_operations show_traces_seq_ops = {
+static const struct seq_operations show_traces_seq_ops = {
.start = t_start,
.next = t_next,
.stop = t_stop,
@@ -2107,18 +2323,10 @@ static struct seq_operations show_traces_seq_ops = {
static int show_traces_open(struct inode *inode, struct file *file)
{
- int ret;
-
if (tracing_disabled)
return -ENODEV;
- ret = seq_open(file, &show_traces_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = trace_types;
- }
-
- return ret;
+ return seq_open(file, &show_traces_seq_ops);
}
static ssize_t
@@ -2128,11 +2336,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
return count;
}
+static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
+{
+ if (file->f_mode & FMODE_READ)
+ return seq_lseek(file, offset, origin);
+ else
+ return 0;
+}
+
static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
.write = tracing_write_stub,
- .llseek = seq_lseek,
+ .llseek = tracing_seek,
.release = tracing_release,
};
@@ -2140,6 +2356,7 @@ static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
.release = seq_release,
+ .llseek = seq_lseek,
};
/*
@@ -2198,7 +2415,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
mutex_lock(&tracing_cpumask_update_lock);
local_irq_disable();
- __raw_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&ftrace_max_lock);
for_each_tracing_cpu(cpu) {
/*
* Increase/decrease the disabled counter if we are
@@ -2213,7 +2430,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
atomic_dec(&global_trace.data[cpu]->disabled);
}
}
- __raw_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&ftrace_max_lock);
local_irq_enable();
cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2233,103 +2450,70 @@ static const struct file_operations tracing_cpumask_fops = {
.open = tracing_open_generic,
.read = tracing_cpumask_read,
.write = tracing_cpumask_write,
+ .llseek = generic_file_llseek,
};
-static ssize_t
-tracing_trace_options_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int tracing_trace_options_show(struct seq_file *m, void *v)
{
struct tracer_opt *trace_opts;
u32 tracer_flags;
- int len = 0;
- char *buf;
- int r = 0;
int i;
-
- /* calculate max size */
- for (i = 0; trace_options[i]; i++) {
- len += strlen(trace_options[i]);
- len += 3; /* "no" and newline */
- }
-
mutex_lock(&trace_types_lock);
tracer_flags = current_trace->flags->val;
trace_opts = current_trace->flags->opts;
- /*
- * Increase the size with names of options specific
- * of the current tracer.
- */
- for (i = 0; trace_opts[i].name; i++) {
- len += strlen(trace_opts[i].name);
- len += 3; /* "no" and newline */
- }
-
- /* +2 for \n and \0 */
- buf = kmalloc(len + 2, GFP_KERNEL);
- if (!buf) {
- mutex_unlock(&trace_types_lock);
- return -ENOMEM;
- }
-
for (i = 0; trace_options[i]; i++) {
if (trace_flags & (1 << i))
- r += sprintf(buf + r, "%s\n", trace_options[i]);
+ seq_printf(m, "%s\n", trace_options[i]);
else
- r += sprintf(buf + r, "no%s\n", trace_options[i]);
+ seq_printf(m, "no%s\n", trace_options[i]);
}
for (i = 0; trace_opts[i].name; i++) {
if (tracer_flags & trace_opts[i].bit)
- r += sprintf(buf + r, "%s\n",
- trace_opts[i].name);
+ seq_printf(m, "%s\n", trace_opts[i].name);
else
- r += sprintf(buf + r, "no%s\n",
- trace_opts[i].name);
+ seq_printf(m, "no%s\n", trace_opts[i].name);
}
mutex_unlock(&trace_types_lock);
- WARN_ON(r >= len + 2);
+ return 0;
+}
- r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+static int __set_tracer_option(struct tracer *trace,
+ struct tracer_flags *tracer_flags,
+ struct tracer_opt *opts, int neg)
+{
+ int ret;
- kfree(buf);
- return r;
+ ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
+ if (ret)
+ return ret;
+
+ if (neg)
+ tracer_flags->val &= ~opts->bit;
+ else
+ tracer_flags->val |= opts->bit;
+ return 0;
}
/* Try to assign a tracer specific option */
static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
{
- struct tracer_flags *trace_flags = trace->flags;
+ struct tracer_flags *tracer_flags = trace->flags;
struct tracer_opt *opts = NULL;
- int ret = 0, i = 0;
- int len;
+ int i;
- for (i = 0; trace_flags->opts[i].name; i++) {
- opts = &trace_flags->opts[i];
- len = strlen(opts->name);
+ for (i = 0; tracer_flags->opts[i].name; i++) {
+ opts = &tracer_flags->opts[i];
- if (strncmp(cmp, opts->name, len) == 0) {
- ret = trace->set_flag(trace_flags->val,
- opts->bit, !neg);
- break;
- }
+ if (strcmp(cmp, opts->name) == 0)
+ return __set_tracer_option(trace, trace->flags,
+ opts, neg);
}
- /* Not found */
- if (!trace_flags->opts[i].name)
- return -EINVAL;
-
- /* Refused to handle */
- if (ret)
- return ret;
- if (neg)
- trace_flags->val &= ~opts->bit;
- else
- trace_flags->val |= opts->bit;
-
- return 0;
+ return -EINVAL;
}
static void set_tracer_flags(unsigned int mask, int enabled)
@@ -2343,21 +2527,8 @@ static void set_tracer_flags(unsigned int mask, int enabled)
else
trace_flags &= ~mask;
- if (mask == TRACE_ITER_GLOBAL_CLK) {
- u64 (*func)(void);
-
- if (enabled)
- func = trace_clock_global;
- else
- func = trace_clock_local;
-
- mutex_lock(&trace_types_lock);
- ring_buffer_set_clock(global_trace.buffer, func);
-
- if (max_tr.buffer)
- ring_buffer_set_clock(max_tr.buffer, func);
- mutex_unlock(&trace_types_lock);
- }
+ if (mask == TRACE_ITER_RECORD_CMD)
+ trace_event_enable_cmd_record(enabled);
}
static ssize_t
@@ -2365,7 +2536,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[64];
- char *cmp = buf;
+ char *cmp;
int neg = 0;
int ret;
int i;
@@ -2377,16 +2548,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
return -EFAULT;
buf[cnt] = 0;
+ cmp = strstrip(buf);
- if (strncmp(buf, "no", 2) == 0) {
+ if (strncmp(cmp, "no", 2) == 0) {
neg = 1;
cmp += 2;
}
for (i = 0; trace_options[i]; i++) {
- int len = strlen(trace_options[i]);
-
- if (strncmp(cmp, trace_options[i], len) == 0) {
+ if (strcmp(cmp, trace_options[i]) == 0) {
set_tracer_flags(1 << i, !neg);
break;
}
@@ -2401,14 +2571,23 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
return ret;
}
- filp->f_pos += cnt;
+ *ppos += cnt;
return cnt;
}
+static int tracing_trace_options_open(struct inode *inode, struct file *file)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+ return single_open(file, tracing_trace_options_show, NULL);
+}
+
static const struct file_operations tracing_iter_fops = {
- .open = tracing_open_generic,
- .read = tracing_trace_options_read,
+ .open = tracing_trace_options_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
.write = tracing_trace_options_write,
};
@@ -2441,6 +2620,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
static const struct file_operations tracing_readme_fops = {
.open = tracing_open_generic,
.read = tracing_readme_read,
+ .llseek = generic_file_llseek,
};
static ssize_t
@@ -2491,6 +2671,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
static const struct file_operations tracing_saved_cmdlines_fops = {
.open = tracing_open_generic,
.read = tracing_saved_cmdlines_read,
+ .llseek = generic_file_llseek,
};
static ssize_t
@@ -2543,7 +2724,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
}
mutex_unlock(&trace_types_lock);
- filp->f_pos += cnt;
+ *ppos += cnt;
return cnt;
}
@@ -2552,7 +2733,7 @@ static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[max_tracer_type_len+2];
+ char buf[MAX_TRACER_SIZE+2];
int r;
mutex_lock(&trace_types_lock);
@@ -2586,6 +2767,9 @@ static int tracing_resize_ring_buffer(unsigned long size)
if (ret < 0)
return ret;
+ if (!current_trace->use_max_tr)
+ goto out;
+
ret = ring_buffer_resize(max_tr.buffer, size);
if (ret < 0) {
int r;
@@ -2613,11 +2797,14 @@ static int tracing_resize_ring_buffer(unsigned long size)
return ret;
}
+ max_tr.entries = size;
+ out:
global_trace.entries = size;
return ret;
}
+
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
*
@@ -2678,12 +2865,26 @@ static int tracing_set_tracer(const char *buf)
trace_branch_disable();
if (current_trace && current_trace->reset)
current_trace->reset(tr);
-
+ if (current_trace && current_trace->use_max_tr) {
+ /*
+ * We don't free the ring buffer. instead, resize it because
+ * The max_tr ring buffer has some state (e.g. ring->clock) and
+ * we want preserve it.
+ */
+ ring_buffer_resize(max_tr.buffer, 1);
+ max_tr.entries = 1;
+ }
destroy_trace_option_files(topts);
current_trace = t;
topts = create_trace_option_files(current_trace);
+ if (current_trace->use_max_tr) {
+ ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
+ if (ret < 0)
+ goto out;
+ max_tr.entries = global_trace.entries;
+ }
if (t->init) {
ret = tracer_init(t, tr);
@@ -2702,15 +2903,15 @@ static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[max_tracer_type_len+1];
+ char buf[MAX_TRACER_SIZE+1];
int i;
size_t ret;
int err;
ret = cnt;
- if (cnt > max_tracer_type_len)
- cnt = max_tracer_type_len;
+ if (cnt > MAX_TRACER_SIZE)
+ cnt = MAX_TRACER_SIZE;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
@@ -2725,7 +2926,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
if (err)
return err;
- filp->f_pos += ret;
+ *ppos += ret;
return ret;
}
@@ -2782,22 +2983,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
mutex_lock(&trace_types_lock);
- /* We only allow one reader per cpu */
- if (cpu_file == TRACE_PIPE_ALL_CPU) {
- if (!cpumask_empty(tracing_reader_cpumask)) {
- ret = -EBUSY;
- goto out;
- }
- cpumask_setall(tracing_reader_cpumask);
- } else {
- if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
- cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
- else {
- ret = -EBUSY;
- goto out;
- }
- }
-
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
@@ -2836,6 +3021,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
if (iter->trace->pipe_open)
iter->trace->pipe_open(iter);
+ nonseekable_open(inode, filp);
out:
mutex_unlock(&trace_types_lock);
return ret;
@@ -2853,10 +3039,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
- cpumask_clear(tracing_reader_cpumask);
- else
- cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
+ if (iter->trace->pipe_close)
+ iter->trace->pipe_close(iter);
mutex_unlock(&trace_types_lock);
@@ -3016,7 +3200,8 @@ waitagain:
iter->pos = -1;
trace_event_read_lock();
- while (find_next_entry_inc(iter) != NULL) {
+ trace_access_lock(iter->cpu_file);
+ while (trace_find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
int len = iter->seq.len;
@@ -3032,6 +3217,7 @@ waitagain:
if (iter->seq.len >= cnt)
break;
}
+ trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
/* Now copy what we have to the user */
@@ -3064,7 +3250,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
__free_page(spd->pages[idx]);
}
-static struct pipe_buf_operations tracing_pipe_buf_ops = {
+static const struct pipe_buf_operations tracing_pipe_buf_ops = {
.can_merge = 0,
.map = generic_pipe_buf_map,
.unmap = generic_pipe_buf_unmap,
@@ -3095,9 +3281,10 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
break;
}
- trace_consume(iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(iter);
rem -= count;
- if (!find_next_entry_inc(iter)) {
+ if (!trace_find_next_entry_inc(iter)) {
rem = 0;
iter->ent = NULL;
break;
@@ -3113,12 +3300,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
size_t len,
unsigned int flags)
{
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages_def[PIPE_DEF_BUFFERS];
+ struct partial_page partial_def[PIPE_DEF_BUFFERS];
struct trace_iterator *iter = filp->private_data;
struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
+ .pages = pages_def,
+ .partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.flags = flags,
.ops = &tracing_pipe_buf_ops,
@@ -3129,6 +3316,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
size_t rem;
unsigned int i;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3150,46 +3340,50 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
if (ret <= 0)
goto out_err;
- if (!iter->ent && !find_next_entry_inc(iter)) {
+ if (!iter->ent && !trace_find_next_entry_inc(iter)) {
ret = -EFAULT;
goto out_err;
}
trace_event_read_lock();
+ trace_access_lock(iter->cpu_file);
/* Fill as many pages as possible. */
- for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
- pages[i] = alloc_page(GFP_KERNEL);
- if (!pages[i])
+ for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+ spd.pages[i] = alloc_page(GFP_KERNEL);
+ if (!spd.pages[i])
break;
rem = tracing_fill_pipe_page(rem, iter);
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
- page_address(pages[i]),
+ page_address(spd.pages[i]),
iter->seq.len);
if (ret < 0) {
- __free_page(pages[i]);
+ __free_page(spd.pages[i]);
break;
}
- partial[i].offset = 0;
- partial[i].len = iter->seq.len;
+ spd.partial[i].offset = 0;
+ spd.partial[i].len = iter->seq.len;
trace_seq_init(&iter->seq);
}
+ trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
mutex_unlock(&iter->mutex);
spd.nr_pages = i;
- return splice_to_pipe(pipe, &spd);
+ ret = splice_to_pipe(pipe, &spd);
+out:
+ splice_shrink_spd(pipe, &spd);
+ return ret;
out_err:
mutex_unlock(&iter->mutex);
-
- return ret;
+ goto out;
}
static ssize_t
@@ -3259,7 +3453,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
}
}
- filp->f_pos += cnt;
+ *ppos += cnt;
/* If check pages failed, return ENOMEM */
if (tracing_disabled)
@@ -3273,7 +3467,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
}
tracing_start();
- max_tr.entries = global_trace.entries;
mutex_unlock(&trace_types_lock);
return cnt;
@@ -3294,7 +3487,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
char *buf;
- char *end;
+ size_t written;
if (tracing_disabled)
return -EINVAL;
@@ -3302,7 +3495,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (cnt > TRACE_BUF_SIZE)
cnt = TRACE_BUF_SIZE;
- buf = kmalloc(cnt + 1, GFP_KERNEL);
+ buf = kmalloc(cnt + 2, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
@@ -3310,36 +3503,102 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
kfree(buf);
return -EFAULT;
}
+ if (buf[cnt-1] != '\n') {
+ buf[cnt] = '\n';
+ buf[cnt+1] = '\0';
+ } else
+ buf[cnt] = '\0';
- /* Cut from the first nil or newline. */
- buf[cnt] = '\0';
- end = strchr(buf, '\n');
- if (end)
- *end = '\0';
-
- cnt = mark_printk("%s\n", buf);
+ written = mark_printk("%s", buf);
kfree(buf);
+ *fpos += written;
+
+ /* don't tell userspace we wrote more - it might confuse them */
+ if (written > cnt)
+ written = cnt;
+
+ return written;
+}
+
+static int tracing_clock_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
+ seq_printf(m,
+ "%s%s%s%s", i ? " " : "",
+ i == trace_clock_id ? "[" : "", trace_clocks[i].name,
+ i == trace_clock_id ? "]" : "");
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ char buf[64];
+ const char *clockstr;
+ int i;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ clockstr = strstrip(buf);
+
+ for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
+ if (strcmp(trace_clocks[i].name, clockstr) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(trace_clocks))
+ return -EINVAL;
+
+ trace_clock_id = i;
+
+ mutex_lock(&trace_types_lock);
+
+ ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
+ if (max_tr.buffer)
+ ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
+
+ mutex_unlock(&trace_types_lock);
+
*fpos += cnt;
return cnt;
}
+static int tracing_clock_open(struct inode *inode, struct file *file)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+ return single_open(file, tracing_clock_show, NULL);
+}
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
.write = tracing_max_lat_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations tracing_ctrl_fops = {
.open = tracing_open_generic,
.read = tracing_ctrl_read,
.write = tracing_ctrl_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic,
.read = tracing_set_trace_read,
.write = tracing_set_trace_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations tracing_pipe_fops = {
@@ -3348,17 +3607,28 @@ static const struct file_operations tracing_pipe_fops = {
.read = tracing_read_pipe,
.splice_read = tracing_splice_read_pipe,
.release = tracing_release_pipe,
+ .llseek = no_llseek,
};
static const struct file_operations tracing_entries_fops = {
.open = tracing_open_generic,
.read = tracing_entries_read,
.write = tracing_entries_write,
+ .llseek = generic_file_llseek,
};
static const struct file_operations tracing_mark_fops = {
.open = tracing_open_generic,
.write = tracing_mark_write,
+ .llseek = generic_file_llseek,
+};
+
+static const struct file_operations trace_clock_fops = {
+ .open = tracing_clock_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = tracing_clock_write,
};
struct ftrace_buffer_info {
@@ -3396,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct ftrace_buffer_info *info = filp->private_data;
- unsigned int pos;
ssize_t ret;
size_t size;
@@ -3414,18 +3683,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->read = 0;
+ trace_access_lock(info->cpu);
ret = ring_buffer_read_page(info->tr->buffer,
&info->spare,
count,
info->cpu, 0);
+ trace_access_unlock(info->cpu);
if (ret < 0)
return 0;
- pos = ring_buffer_page_len(info->spare);
-
- if (pos < PAGE_SIZE)
- memset(info->spare + pos, 0, PAGE_SIZE - pos);
-
read:
size = PAGE_SIZE - info->read;
if (size > count)
@@ -3487,7 +3753,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
}
/* Pipe buffer operations for a buffer. */
-static struct pipe_buf_operations buffer_pipe_buf_ops = {
+static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.can_merge = 0,
.map = generic_pipe_buf_map,
.unmap = generic_pipe_buf_unmap,
@@ -3520,11 +3786,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
unsigned int flags)
{
struct ftrace_buffer_info *info = file->private_data;
- struct partial_page partial[PIPE_BUFFERS];
- struct page *pages[PIPE_BUFFERS];
+ struct partial_page partial_def[PIPE_DEF_BUFFERS];
+ struct page *pages_def[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
+ .pages = pages_def,
+ .partial = partial_def,
.flags = flags,
.ops = &buffer_pipe_buf_ops,
.spd_release = buffer_spd_release,
@@ -3533,21 +3799,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
int entries, size, i;
size_t ret;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
if (*ppos & (PAGE_SIZE - 1)) {
WARN_ONCE(1, "Ftrace: previous read must page-align\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
if (len & (PAGE_SIZE - 1)) {
WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
- if (len < PAGE_SIZE)
- return -EINVAL;
+ if (len < PAGE_SIZE) {
+ ret = -EINVAL;
+ goto out;
+ }
len &= PAGE_MASK;
}
+ trace_access_lock(info->cpu);
entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
- for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
int r;
@@ -3592,6 +3865,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
}
+ trace_access_unlock(info->cpu);
spd.nr_pages = i;
/* did we read anything? */
@@ -3601,11 +3875,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
else
ret = 0;
/* TODO: block */
- return ret;
+ goto out;
}
ret = splice_to_pipe(pipe, &spd);
-
+ splice_shrink_spd(pipe, &spd);
+out:
return ret;
}
@@ -3628,7 +3903,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
- return ENOMEM;
+ return -ENOMEM;
trace_seq_init(s);
@@ -3641,9 +3916,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
trace_seq_printf(s, "commit overrun: %ld\n", cnt);
- cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
- trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
-
count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
kfree(s);
@@ -3654,6 +3926,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
static const struct file_operations tracing_stats_fops = {
.open = tracing_open_generic,
.read = tracing_stats_read,
+ .llseek = generic_file_llseek,
};
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -3690,6 +3963,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
static const struct file_operations tracing_dyn_info_fops = {
.open = tracing_open_generic,
.read = tracing_read_dyn_info,
+ .llseek = generic_file_llseek,
};
#endif
@@ -3746,13 +4020,9 @@ static void tracing_init_debugfs_percpu(long cpu)
{
struct dentry *d_percpu = tracing_dentry_percpu();
struct dentry *d_cpu;
- /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
- char cpu_dir[7];
-
- if (cpu > 999 || cpu < 0)
- return;
+ char cpu_dir[30]; /* 30 characters should be more than enough */
- sprintf(cpu_dir, "cpu%ld", cpu);
+ snprintf(cpu_dir, 30, "cpu%ld", cpu);
d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
@@ -3821,39 +4091,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret < 0)
return ret;
- ret = 0;
- switch (val) {
- case 0:
- /* do nothing if already cleared */
- if (!(topt->flags->val & topt->opt->bit))
- break;
-
- mutex_lock(&trace_types_lock);
- if (current_trace->set_flag)
- ret = current_trace->set_flag(topt->flags->val,
- topt->opt->bit, 0);
- mutex_unlock(&trace_types_lock);
- if (ret)
- return ret;
- topt->flags->val &= ~topt->opt->bit;
- break;
- case 1:
- /* do nothing if already set */
- if (topt->flags->val & topt->opt->bit)
- break;
+ if (val != 0 && val != 1)
+ return -EINVAL;
+ if (!!(topt->flags->val & topt->opt->bit) != val) {
mutex_lock(&trace_types_lock);
- if (current_trace->set_flag)
- ret = current_trace->set_flag(topt->flags->val,
- topt->opt->bit, 1);
+ ret = __set_tracer_option(current_trace, topt->flags,
+ topt->opt, !val);
mutex_unlock(&trace_types_lock);
if (ret)
return ret;
- topt->flags->val |= topt->opt->bit;
- break;
-
- default:
- return -EINVAL;
}
*ppos += cnt;
@@ -3866,6 +4113,7 @@ static const struct file_operations trace_options_fops = {
.open = tracing_open_generic,
.read = trace_options_read,
.write = trace_options_write,
+ .llseek = generic_file_llseek,
};
static ssize_t
@@ -3904,17 +4152,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret < 0)
return ret;
- switch (val) {
- case 0:
- trace_flags &= ~(1 << index);
- break;
- case 1:
- trace_flags |= 1 << index;
- break;
-
- default:
+ if (val != 0 && val != 1)
return -EINVAL;
- }
+ set_tracer_flags(1 << index, val);
*ppos += cnt;
@@ -3925,6 +4165,7 @@ static const struct file_operations trace_options_core_fops = {
.open = tracing_open_generic,
.read = trace_options_core_read,
.write = trace_options_core_write,
+ .llseek = generic_file_llseek,
};
struct dentry *trace_create_file(const char *name,
@@ -4062,6 +4303,8 @@ static __init int tracer_init_debugfs(void)
struct dentry *d_tracer;
int cpu;
+ trace_access_lock_init();
+
d_tracer = tracing_init_dentry();
trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4082,8 +4325,10 @@ static __init int tracer_init_debugfs(void)
trace_create_file("current_tracer", 0644, d_tracer,
&global_trace, &set_tracer_fops);
+#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_file("tracing_max_latency", 0644, d_tracer,
&tracing_max_latency, &tracing_max_lat_fops);
+#endif
trace_create_file("tracing_thresh", 0644, d_tracer,
&tracing_thresh, &tracing_max_lat_fops);
@@ -4103,13 +4348,13 @@ static __init int tracer_init_debugfs(void)
trace_create_file("saved_cmdlines", 0444, d_tracer,
NULL, &tracing_saved_cmdlines_fops);
+ trace_create_file("trace_clock", 0644, d_tracer, NULL,
+ &trace_clock_fops);
+
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
#endif
-#ifdef CONFIG_SYSPROF_TRACER
- init_tracer_sysprof_debugfs(d_tracer);
-#endif
create_trace_options_dir();
@@ -4123,7 +4368,7 @@ static int trace_panic_handler(struct notifier_block *this,
unsigned long event, void *unused)
{
if (ftrace_dump_on_oops)
- ftrace_dump();
+ ftrace_dump(ftrace_dump_on_oops);
return NOTIFY_OK;
}
@@ -4140,7 +4385,7 @@ static int trace_die_handler(struct notifier_block *self,
switch (val) {
case DIE_OOPS:
if (ftrace_dump_on_oops)
- ftrace_dump();
+ ftrace_dump(ftrace_dump_on_oops);
break;
default:
break;
@@ -4166,7 +4411,7 @@ static struct notifier_block trace_die_notifier = {
*/
#define KERN_TRACE KERN_EMERG
-static void
+void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
@@ -4181,10 +4426,18 @@ trace_printk_seq(struct trace_seq *s)
trace_seq_init(s);
}
-static void __ftrace_dump(bool disable_tracing)
+void trace_init_global_iter(struct trace_iterator *iter)
+{
+ iter->tr = &global_trace;
+ iter->trace = current_trace;
+ iter->cpu_file = TRACE_PIPE_ALL_CPU;
+}
+
+static void
+__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
{
- static raw_spinlock_t ftrace_dump_lock =
- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ static arch_spinlock_t ftrace_dump_lock =
+ (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
unsigned int old_userobj;
@@ -4194,7 +4447,7 @@ static void __ftrace_dump(bool disable_tracing)
/* only one dump */
local_irq_save(flags);
- __raw_spin_lock(&ftrace_dump_lock);
+ arch_spin_lock(&ftrace_dump_lock);
if (dump_ran)
goto out;
@@ -4205,8 +4458,10 @@ static void __ftrace_dump(bool disable_tracing)
if (disable_tracing)
ftrace_kill();
+ trace_init_global_iter(&iter);
+
for_each_tracing_cpu(cpu) {
- atomic_inc(&global_trace.data[cpu]->disabled);
+ atomic_inc(&iter.tr->data[cpu]->disabled);
}
old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4214,12 +4469,25 @@ static void __ftrace_dump(bool disable_tracing)
/* don't look at user memory in panic mode */
trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- printk(KERN_TRACE "Dumping ftrace buffer:\n");
-
/* Simulate the iterator */
iter.tr = &global_trace;
iter.trace = current_trace;
- iter.cpu_file = TRACE_PIPE_ALL_CPU;
+
+ switch (oops_dump_mode) {
+ case DUMP_ALL:
+ iter.cpu_file = TRACE_PIPE_ALL_CPU;
+ break;
+ case DUMP_ORIG:
+ iter.cpu_file = raw_smp_processor_id();
+ break;
+ case DUMP_NONE:
+ goto out_enable;
+ default:
+ printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ iter.cpu_file = TRACE_PIPE_ALL_CPU;
+ }
+
+ printk(KERN_TRACE "Dumping ftrace buffer:\n");
/*
* We need to stop all tracing on all CPUS to read the
@@ -4242,9 +4510,12 @@ static void __ftrace_dump(bool disable_tracing)
iter.iter_flags |= TRACE_FILE_LAT_FMT;
iter.pos = -1;
- if (find_next_entry_inc(&iter) != NULL) {
- print_trace_line(&iter);
- trace_consume(&iter);
+ if (trace_find_next_entry_inc(&iter) != NULL) {
+ int ret;
+
+ ret = print_trace_line(&iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(&iter);
}
trace_printk_seq(&iter.seq);
@@ -4255,30 +4526,30 @@ static void __ftrace_dump(bool disable_tracing)
else
printk(KERN_TRACE "---------------------------------\n");
+ out_enable:
/* Re-enable tracing if requested */
if (!disable_tracing) {
trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
- atomic_dec(&global_trace.data[cpu]->disabled);
+ atomic_dec(&iter.tr->data[cpu]->disabled);
}
tracing_on();
}
out:
- __raw_spin_unlock(&ftrace_dump_lock);
+ arch_spin_unlock(&ftrace_dump_lock);
local_irq_restore(flags);
}
/* By default: disable tracing after the dump */
-void ftrace_dump(void)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
- __ftrace_dump(true);
+ __ftrace_dump(true, oops_dump_mode);
}
__init static int tracer_alloc_buffers(void)
{
- struct trace_array_cpu *data;
int ring_buf_size;
int i;
int ret = -ENOMEM;
@@ -4289,9 +4560,6 @@ __init static int tracer_alloc_buffers(void)
if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
goto out_free_buffer_mask;
- if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
- goto out_free_tracing_cpumask;
-
/* To save memory, keep the ring buffer size to its minimum */
if (ring_buffer_expanded)
ring_buf_size = trace_buf_size;
@@ -4300,7 +4568,6 @@ __init static int tracer_alloc_buffers(void)
cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
cpumask_copy(tracing_cpumask, cpu_all_mask);
- cpumask_clear(tracing_reader_cpumask);
/* TODO: make the number of buffers hot pluggable with CPUS */
global_trace.buffer = ring_buffer_alloc(ring_buf_size,
@@ -4314,31 +4581,26 @@ __init static int tracer_alloc_buffers(void)
#ifdef CONFIG_TRACER_MAX_TRACE
- max_tr.buffer = ring_buffer_alloc(ring_buf_size,
- TRACE_BUFFER_FLAGS);
+ max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
if (!max_tr.buffer) {
printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
WARN_ON(1);
ring_buffer_free(global_trace.buffer);
goto out_free_cpumask;
}
- max_tr.entries = ring_buffer_size(max_tr.buffer);
- WARN_ON(max_tr.entries != global_trace.entries);
+ max_tr.entries = 1;
#endif
/* Allocate the first page for all buffers */
for_each_tracing_cpu(i) {
- data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
- max_tr.data[i] = &per_cpu(max_data, i);
+ global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+ max_tr.data[i] = &per_cpu(max_tr_data, i);
}
trace_init_cmdlines();
register_tracer(&nop_trace);
current_trace = &nop_trace;
-#ifdef CONFIG_BOOT_TRACER
- register_tracer(&boot_tracer);
-#endif
/* All seems OK, enable tracing */
tracing_disabled = 0;
@@ -4350,8 +4612,6 @@ __init static int tracer_alloc_buffers(void)
return 0;
out_free_cpumask:
- free_cpumask_var(tracing_reader_cpumask);
-out_free_tracing_cpumask:
free_cpumask_var(tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6e735d4771f..9021f8c0c0c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,11 +7,9 @@
#include <linux/clocksource.h>
#include <linux/ring_buffer.h>
#include <linux/mmiotrace.h>
+#include <linux/tracepoint.h>
#include <linux/ftrace.h>
-#include <trace/boot.h>
-#include <linux/kmemtrace.h>
-#include <trace/power.h>
-
+#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
@@ -24,177 +22,58 @@ enum trace_type {
TRACE_STACK,
TRACE_PRINT,
TRACE_BPRINT,
- TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
TRACE_BRANCH,
- TRACE_BOOT_CALL,
- TRACE_BOOT_RET,
TRACE_GRAPH_RET,
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
- TRACE_HW_BRANCHES,
- TRACE_SYSCALL_ENTER,
- TRACE_SYSCALL_EXIT,
- TRACE_KMEM_ALLOC,
- TRACE_KMEM_FREE,
- TRACE_POWER,
TRACE_BLK,
__TRACE_LAST_TYPE,
};
-/*
- * Function trace entry - function address and parent function addres:
- */
-struct ftrace_entry {
- struct trace_entry ent;
- unsigned long ip;
- unsigned long parent_ip;
-};
-
-/* Function call entry */
-struct ftrace_graph_ent_entry {
- struct trace_entry ent;
- struct ftrace_graph_ent graph_ent;
-};
-
-/* Function return entry */
-struct ftrace_graph_ret_entry {
- struct trace_entry ent;
- struct ftrace_graph_ret ret;
-};
-extern struct tracer boot_tracer;
-
-/*
- * Context switch trace entry - which task (and prio) we switched from/to:
- */
-struct ctx_switch_entry {
- struct trace_entry ent;
- unsigned int prev_pid;
- unsigned char prev_prio;
- unsigned char prev_state;
- unsigned int next_pid;
- unsigned char next_prio;
- unsigned char next_state;
- unsigned int next_cpu;
-};
-
-/*
- * Special (free-form) trace entry:
- */
-struct special_entry {
- struct trace_entry ent;
- unsigned long arg1;
- unsigned long arg2;
- unsigned long arg3;
-};
-
-/*
- * Stack-trace entry:
- */
-
-#define FTRACE_STACK_ENTRIES 8
-
-struct stack_entry {
- struct trace_entry ent;
- unsigned long caller[FTRACE_STACK_ENTRIES];
-};
-
-struct userstack_entry {
- struct trace_entry ent;
- unsigned long caller[FTRACE_STACK_ENTRIES];
-};
-
-/*
- * trace_printk entry:
- */
-struct bprint_entry {
- struct trace_entry ent;
- unsigned long ip;
- const char *fmt;
- u32 buf[];
-};
-
-struct print_entry {
- struct trace_entry ent;
- unsigned long ip;
- char buf[];
-};
-
-#define TRACE_OLD_SIZE 88
-struct trace_field_cont {
- unsigned char type;
- /* Temporary till we get rid of this completely */
- char buf[TRACE_OLD_SIZE - 1];
-};
+#undef __field
+#define __field(type, item) type item;
-struct trace_mmiotrace_rw {
- struct trace_entry ent;
- struct mmiotrace_rw rw;
-};
+#undef __field_struct
+#define __field_struct(type, item) __field(type, item)
-struct trace_mmiotrace_map {
- struct trace_entry ent;
- struct mmiotrace_map map;
-};
+#undef __field_desc
+#define __field_desc(type, container, item)
-struct trace_boot_call {
- struct trace_entry ent;
- struct boot_trace_call boot_call;
-};
+#undef __array
+#define __array(type, item, size) type item[size];
-struct trace_boot_ret {
- struct trace_entry ent;
- struct boot_trace_ret boot_ret;
-};
+#undef __array_desc
+#define __array_desc(type, container, item, size)
-#define TRACE_FUNC_SIZE 30
-#define TRACE_FILE_SIZE 20
-struct trace_branch {
- struct trace_entry ent;
- unsigned line;
- char func[TRACE_FUNC_SIZE+1];
- char file[TRACE_FILE_SIZE+1];
- char correct;
-};
+#undef __dynamic_array
+#define __dynamic_array(type, item) type item[];
-struct hw_branch_entry {
- struct trace_entry ent;
- u64 from;
- u64 to;
-};
+#undef F_STRUCT
+#define F_STRUCT(args...) args
-struct trace_power {
- struct trace_entry ent;
- struct power_trace state_data;
-};
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
+ struct struct_name { \
+ struct trace_entry ent; \
+ tstruct \
+ }
-enum kmemtrace_type_id {
- KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
- KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
- KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
-};
+#undef TP_ARGS
+#define TP_ARGS(args...) args
-struct kmemtrace_alloc_entry {
- struct trace_entry ent;
- enum kmemtrace_type_id type_id;
- unsigned long call_site;
- const void *ptr;
- size_t bytes_req;
- size_t bytes_alloc;
- gfp_t gfp_flags;
- int node;
-};
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
-struct kmemtrace_free_entry {
- struct trace_entry ent;
- enum kmemtrace_type_id type_id;
- unsigned long call_site;
- const void *ptr;
-};
+#include "trace_entries.h"
+/*
+ * syscalls are special, and need special handling, this is why
+ * they are not included in trace_entries.h
+ */
struct syscall_trace_enter {
struct trace_entry ent;
int nr;
@@ -204,16 +83,26 @@ struct syscall_trace_enter {
struct syscall_trace_exit {
struct trace_entry ent;
int nr;
- unsigned long ret;
+ long ret;
};
+struct kprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long ip;
+};
+
+struct kretprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long func;
+ unsigned long ret_ip;
+};
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
* IRQS_OFF - interrupts were disabled
* IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
- * NEED_RESCED - reschedule is requested
+ * NEED_RESCHED - reschedule is requested
* HARDIRQ - inside an interrupt handler
* SOFTIRQ - inside a softirq handler
*/
@@ -236,9 +125,6 @@ struct trace_array_cpu {
atomic_t disabled;
void *buffer_page; /* ring buffer spare */
- /* these fields get copied into max-trace: */
- unsigned long trace_idx;
- unsigned long overrun;
unsigned long saved_latency;
unsigned long critical_start;
unsigned long critical_end;
@@ -246,6 +132,7 @@ struct trace_array_cpu {
unsigned long nice;
unsigned long policy;
unsigned long rt_priority;
+ unsigned long skipped_entries;
cycle_t preempt_timestamp;
pid_t pid;
uid_t uid;
@@ -301,28 +188,15 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
- IF_ASSIGN(var, ent, struct special_entry, 0); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
TRACE_MMIO_MAP); \
- IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
- IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
TRACE_GRAPH_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
- IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
- IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
- IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
- TRACE_KMEM_ALLOC); \
- IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
- TRACE_KMEM_FREE); \
- IF_ASSIGN(var, ent, struct syscall_trace_enter, \
- TRACE_SYSCALL_ENTER); \
- IF_ASSIGN(var, ent, struct syscall_trace_exit, \
- TRACE_SYSCALL_EXIT); \
__ftrace_bad_type(); \
} while (0)
@@ -360,6 +234,7 @@ struct tracer_flags {
* @pipe_open: called when the trace_pipe file is opened
* @wait_pipe: override how the user waits for traces on trace_pipe
* @close: called when the trace file is released
+ * @pipe_close: called when the trace_pipe file is released
* @read: override the default read callback on trace_pipe
* @splice_read: override the default splice_read callback on trace_pipe
* @selftest: selftest to run on boot (see trace_selftest.c)
@@ -378,6 +253,7 @@ struct tracer {
void (*pipe_open)(struct trace_iterator *iter);
void (*wait_pipe)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
+ void (*pipe_close)(struct trace_iterator *iter);
ssize_t (*read)(struct trace_iterator *iter,
struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos);
@@ -398,7 +274,7 @@ struct tracer {
struct tracer *next;
int print_max;
struct tracer_flags *flags;
- struct tracer_stat *stats;
+ int use_max_tr;
};
@@ -419,16 +295,16 @@ struct dentry *trace_create_file(const char *name,
const struct file_operations *fops);
struct dentry *tracing_init_dentry(void);
-void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
struct ring_buffer_event;
-struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
- int type,
- unsigned long len,
- unsigned long flags,
- int pc);
-void trace_buffer_unlock_commit(struct trace_array *tr,
+struct ring_buffer_event *
+trace_buffer_lock_reserve(struct ring_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned long flags,
+ int pc);
+void trace_buffer_unlock_commit(struct ring_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc);
@@ -438,9 +314,13 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
-void tracing_generic_entry_update(struct trace_entry *entry,
- unsigned long flags,
- int pc);
+int trace_empty(struct trace_iterator *iter);
+
+void *trace_find_next_entry_inc(struct trace_iterator *iter);
+
+void trace_init_global_iter(struct trace_iterator *iter);
+
+void tracing_iter_reset(struct trace_iterator *iter, int cpu);
void default_wait_pipe(struct trace_iterator *iter);
void poll_wait_pipe(struct trace_iterator *iter);
@@ -459,18 +339,21 @@ void tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *wakee,
struct task_struct *cur,
unsigned long flags, int pc);
-void trace_special(struct trace_array *tr,
- struct trace_array_cpu *data,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3, int pc);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
+void trace_graph_function(struct trace_array *tr,
+ unsigned long ip,
+ unsigned long parent_ip,
+ unsigned long flags, int pc);
+void trace_default_header(struct seq_file *m);
+void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
+int trace_empty(struct trace_iterator *iter);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
+void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -479,35 +362,56 @@ void tracing_stop_sched_switch_record(void);
void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
void unregister_tracer(struct tracer *type);
+int is_tracing_stopped(void);
+enum trace_file_type {
+ TRACE_FILE_LAT_FMT = 1,
+ TRACE_FILE_ANNOTATE = 2,
+};
+
+extern cpumask_var_t __read_mostly tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu) \
+ for_each_cpu(cpu, tracing_buffer_mask)
extern unsigned long nsecs_to_usecs(unsigned long nsecs);
-extern unsigned long tracing_max_latency;
extern unsigned long tracing_thresh;
+#ifdef CONFIG_TRACER_MAX_TRACE
+extern unsigned long tracing_max_latency;
+
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
+#endif /* CONFIG_TRACER_MAX_TRACE */
-void __trace_stack(struct trace_array *tr,
- unsigned long flags,
- int skip, int pc);
+#ifdef CONFIG_STACKTRACE
+void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc);
-extern cycle_t ftrace_now(int cpu);
+void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
+ int pc);
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-typedef void
-(*tracer_switch_func_t)(void *private,
- void *__rq,
- struct task_struct *prev,
- struct task_struct *next);
-
-struct tracer_switch_ops {
- tracer_switch_func_t func;
- void *private;
- struct tracer_switch_ops *next;
-};
-#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc);
+#else
+static inline void ftrace_trace_stack(struct ring_buffer *buffer,
+ unsigned long flags, int skip, int pc)
+{
+}
+
+static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
+ unsigned long flags, int pc)
+{
+}
+
+static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
+ int skip, int pc)
+{
+}
+#endif /* CONFIG_STACKTRACE */
+
+extern cycle_t ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
@@ -517,6 +421,10 @@ extern unsigned long ftrace_update_tot_cnt;
extern int DYN_FTRACE_TEST_NAME(void);
#endif
+extern int ring_buffer_expanded;
+extern bool tracing_selftest_disabled;
+DECLARE_PER_CPU(int, ftrace_cpu_disabled);
+
#ifdef CONFIG_FTRACE_STARTUP_TEST
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
@@ -534,12 +442,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_sched_switch(struct tracer *trace,
struct trace_array *tr);
-extern int trace_selftest_startup_sysprof(struct tracer *trace,
- struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
-extern int trace_selftest_startup_hw_branches(struct tracer *trace,
- struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
@@ -548,18 +452,48 @@ extern int
trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
extern int
trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+extern int
+trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args);
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...);
+void trace_printk_seq(struct trace_seq *s);
+enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
+extern int trace_clock_id;
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+
+/* Flag options */
+#define TRACE_GRAPH_PRINT_OVERRUN 0x1
+#define TRACE_GRAPH_PRINT_CPU 0x2
+#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
+#define TRACE_GRAPH_PRINT_PROC 0x8
+#define TRACE_GRAPH_PRINT_DURATION 0x10
+#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
+
+extern enum print_line_t
+print_graph_function_flags(struct trace_iterator *iter, u32 flags);
+extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
extern enum print_line_t
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
+extern void graph_trace_open(struct trace_iterator *iter);
+extern void graph_trace_close(struct trace_iterator *iter);
+extern int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned long flags, int pc);
+extern void __trace_graph_return(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
+ unsigned long flags, int pc);
+
#ifdef CONFIG_DYNAMIC_FTRACE
/* TODO: make this variable */
#define FTRACE_GRAPH_MAX_FUNCS 32
+extern int ftrace_graph_filter_enabled;
extern int ftrace_graph_count;
extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
@@ -567,7 +501,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
{
int i;
- if (!ftrace_graph_count || test_tsk_trace_graph(current))
+ if (!ftrace_graph_filter_enabled)
return 1;
for (i = 0; i < ftrace_graph_count; i++) {
@@ -578,10 +512,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
return 0;
}
#else
-static inline int ftrace_trace_addr(unsigned long addr)
-{
- return 1;
-}
static inline int ftrace_graph_addr(unsigned long addr)
{
return 1;
@@ -589,21 +519,63 @@ static inline int ftrace_graph_addr(unsigned long addr)
#endif /* CONFIG_DYNAMIC_FTRACE */
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
return TRACE_TYPE_UNHANDLED;
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-extern struct pid *ftrace_pid_trace;
+extern struct list_head ftrace_pids;
+#ifdef CONFIG_FUNCTION_TRACER
static inline int ftrace_trace_task(struct task_struct *task)
{
- if (!ftrace_pid_trace)
+ if (list_empty(&ftrace_pids))
return 1;
return test_tsk_trace_trace(task);
}
+#else
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+ return 1;
+}
+#endif
+
+/*
+ * struct trace_parser - servers for reading the user input separated by spaces
+ * @cont: set if the input is not complete - no final space char was found
+ * @buffer: holds the parsed user input
+ * @idx: user input length
+ * @size: buffer size
+ */
+struct trace_parser {
+ bool cont;
+ char *buffer;
+ unsigned idx;
+ unsigned size;
+};
+
+static inline bool trace_parser_loaded(struct trace_parser *parser)
+{
+ return (parser->idx != 0);
+}
+
+static inline bool trace_parser_cont(struct trace_parser *parser)
+{
+ return parser->cont;
+}
+
+static inline void trace_parser_clear(struct trace_parser *parser)
+{
+ parser->cont = false;
+ parser->idx = 0;
+}
+
+extern int trace_parser_get_init(struct trace_parser *parser, int size);
+extern void trace_parser_put(struct trace_parser *parser);
+extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
+ size_t cnt, loff_t *ppos);
/*
* trace_iterator_flags is an enumeration that defines bit
@@ -622,19 +594,18 @@ enum trace_iterator_flags {
TRACE_ITER_BIN = 0x40,
TRACE_ITER_BLOCK = 0x80,
TRACE_ITER_STACKTRACE = 0x100,
- TRACE_ITER_SCHED_TREE = 0x200,
- TRACE_ITER_PRINTK = 0x400,
- TRACE_ITER_PREEMPTONLY = 0x800,
- TRACE_ITER_BRANCH = 0x1000,
- TRACE_ITER_ANNOTATE = 0x2000,
- TRACE_ITER_USERSTACKTRACE = 0x4000,
- TRACE_ITER_SYM_USEROBJ = 0x8000,
- TRACE_ITER_PRINTK_MSGONLY = 0x10000,
- TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
- TRACE_ITER_LATENCY_FMT = 0x40000,
- TRACE_ITER_GLOBAL_CLK = 0x80000,
- TRACE_ITER_SLEEP_TIME = 0x100000,
- TRACE_ITER_GRAPH_TIME = 0x200000,
+ TRACE_ITER_PRINTK = 0x200,
+ TRACE_ITER_PREEMPTONLY = 0x400,
+ TRACE_ITER_BRANCH = 0x800,
+ TRACE_ITER_ANNOTATE = 0x1000,
+ TRACE_ITER_USERSTACKTRACE = 0x2000,
+ TRACE_ITER_SYM_USEROBJ = 0x4000,
+ TRACE_ITER_PRINTK_MSGONLY = 0x8000,
+ TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
+ TRACE_ITER_LATENCY_FMT = 0x20000,
+ TRACE_ITER_SLEEP_TIME = 0x40000,
+ TRACE_ITER_GRAPH_TIME = 0x80000,
+ TRACE_ITER_RECORD_CMD = 0x100000,
};
/*
@@ -646,54 +617,6 @@ enum trace_iterator_flags {
extern struct tracer nop_trace;
-/**
- * ftrace_preempt_disable - disable preemption scheduler safe
- *
- * When tracing can happen inside the scheduler, there exists
- * cases that the tracing might happen before the need_resched
- * flag is checked. If this happens and the tracer calls
- * preempt_enable (after a disable), a schedule might take place
- * causing an infinite recursion.
- *
- * To prevent this, we read the need_resched flag before
- * disabling preemption. When we want to enable preemption we
- * check the flag, if it is set, then we call preempt_enable_no_resched.
- * Otherwise, we call preempt_enable.
- *
- * The rational for doing the above is that if need_resched is set
- * and we have yet to reschedule, we are either in an atomic location
- * (where we do not need to check for scheduling) or we are inside
- * the scheduler and do not want to resched.
- */
-static inline int ftrace_preempt_disable(void)
-{
- int resched;
-
- resched = need_resched();
- preempt_disable_notrace();
-
- return resched;
-}
-
-/**
- * ftrace_preempt_enable - enable preemption scheduler safe
- * @resched: the return value from ftrace_preempt_disable
- *
- * This is a scheduler safe way to enable preemption and not miss
- * any preemption checks. The disabled saved the state of preemption.
- * If resched is set, then we are either inside an atomic or
- * are inside the scheduler (we would have already scheduled
- * otherwise). In this case, we do not want to call normal
- * preempt_enable, but preempt_enable_no_resched instead.
- */
-static inline void ftrace_preempt_enable(int resched)
-{
- if (resched)
- preempt_enable_no_resched_notrace();
- else
- preempt_enable_notrace();
-}
-
#ifdef CONFIG_BRANCH_TRACER
extern int enable_branch_tracing(struct trace_array *tr);
extern void disable_branch_tracing(void);
@@ -731,6 +654,7 @@ struct ftrace_event_field {
struct list_head link;
char *name;
char *type;
+ int filter_type;
int offset;
int size;
int is_signed;
@@ -746,26 +670,47 @@ struct event_subsystem {
struct list_head list;
const char *name;
struct dentry *entry;
- void *filter;
+ struct event_filter *filter;
+ int nr_events;
};
struct filter_pred;
+struct regex;
typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
int val1, int val2);
+typedef int (*regex_match_func)(char *str, struct regex *r, int len);
+
+enum regex_type {
+ MATCH_FULL = 0,
+ MATCH_FRONT_ONLY,
+ MATCH_MIDDLE_ONLY,
+ MATCH_END_ONLY,
+};
+
+struct regex {
+ char pattern[MAX_FILTER_STR_VAL];
+ int len;
+ int field_len;
+ regex_match_func match;
+};
+
struct filter_pred {
- filter_pred_fn_t fn;
- u64 val;
- char str_val[MAX_FILTER_STR_VAL];
- int str_len;
- char *field_name;
- int offset;
- int not;
- int op;
- int pop_n;
+ filter_pred_fn_t fn;
+ u64 val;
+ struct regex regex;
+ char *field_name;
+ int offset;
+ int not;
+ int op;
+ int pop_n;
};
+extern struct list_head ftrace_common_fields;
+
+extern enum regex_type
+filter_parse_regex(char *buff, int len, char **search, int *not);
extern void print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
extern int apply_event_filter(struct ftrace_event_call *call,
@@ -774,13 +719,18 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system,
char *filter_string);
extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
+extern int filter_assign_type(const char *type);
+
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call);
static inline int
filter_check_discard(struct ftrace_event_call *call, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
- if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+ if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
+ !filter_match_preds(call->filter, rec)) {
ring_buffer_discard_commit(buffer, event);
return 1;
}
@@ -788,46 +738,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
return 0;
}
-#define DEFINE_COMPARISON_PRED(type) \
-static int filter_pred_##type(struct filter_pred *pred, void *event, \
- int val1, int val2) \
-{ \
- type *addr = (type *)(event + pred->offset); \
- type val = (type)pred->val; \
- int match = 0; \
- \
- switch (pred->op) { \
- case OP_LT: \
- match = (*addr < val); \
- break; \
- case OP_LE: \
- match = (*addr <= val); \
- break; \
- case OP_GT: \
- match = (*addr > val); \
- break; \
- case OP_GE: \
- match = (*addr >= val); \
- break; \
- default: \
- break; \
- } \
- \
- return match; \
-}
-
-#define DEFINE_EQUALITY_PRED(size) \
-static int filter_pred_##size(struct filter_pred *pred, void *event, \
- int val1, int val2) \
-{ \
- u##size *addr = (u##size *)(event + pred->offset); \
- u##size val = (u##size)pred->val; \
- int match; \
- \
- match = (val == *addr) ^ pred->not; \
- \
- return match; \
-}
+extern void trace_event_enable_cmd_record(bool enable);
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
@@ -835,11 +746,13 @@ extern struct list_head ftrace_events;
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
- extern struct ftrace_event_call event_##call;
-#undef TRACE_EVENT_FORMAT_NOFILTER
-#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
-#include "trace_event_types.h"
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
+ extern struct ftrace_event_call \
+ __attribute__((__aligned__(4))) event_##call;
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
+#include "trace_entries.h"
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
deleted file mode 100644
index a29ef23ffb4..00000000000
--- a/kernel/trace/trace_boot.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * ring buffer based initcalls tracer
- *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/kallsyms.h>
-#include <linux/time.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-static struct trace_array *boot_trace;
-static bool pre_initcalls_finished;
-
-/* Tells the boot tracer that the pre_smp_initcalls are finished.
- * So we are ready .
- * It doesn't enable sched events tracing however.
- * You have to call enable_boot_trace to do so.
- */
-void start_boot_trace(void)
-{
- pre_initcalls_finished = true;
-}
-
-void enable_boot_trace(void)
-{
- if (boot_trace && pre_initcalls_finished)
- tracing_start_sched_switch_record();
-}
-
-void disable_boot_trace(void)
-{
- if (boot_trace && pre_initcalls_finished)
- tracing_stop_sched_switch_record();
-}
-
-static int boot_trace_init(struct trace_array *tr)
-{
- int cpu;
- boot_trace = tr;
-
- if (!tr)
- return 0;
-
- for_each_cpu(cpu, cpu_possible_mask)
- tracing_reset(tr, cpu);
-
- tracing_sched_switch_assign_trace(tr);
- return 0;
-}
-
-static enum print_line_t
-initcall_call_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
- struct trace_seq *s = &iter->seq;
- struct trace_boot_call *field;
- struct boot_trace_call *call;
- u64 ts;
- unsigned long nsec_rem;
- int ret;
-
- trace_assign_type(field, entry);
- call = &field->boot_call;
- ts = iter->ts;
- nsec_rem = do_div(ts, NSEC_PER_SEC);
-
- ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
- (unsigned long)ts, nsec_rem, call->func, call->caller);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- else
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-initcall_ret_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
- struct trace_seq *s = &iter->seq;
- struct trace_boot_ret *field;
- struct boot_trace_ret *init_ret;
- u64 ts;
- unsigned long nsec_rem;
- int ret;
-
- trace_assign_type(field, entry);
- init_ret = &field->boot_ret;
- ts = iter->ts;
- nsec_rem = do_div(ts, NSEC_PER_SEC);
-
- ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
- "returned %d after %llu msecs\n",
- (unsigned long) ts,
- nsec_rem,
- init_ret->func, init_ret->result, init_ret->duration);
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- else
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t initcall_print_line(struct trace_iterator *iter)
-{
- struct trace_entry *entry = iter->ent;
-
- switch (entry->type) {
- case TRACE_BOOT_CALL:
- return initcall_call_print_line(iter);
- case TRACE_BOOT_RET:
- return initcall_ret_print_line(iter);
- default:
- return TRACE_TYPE_UNHANDLED;
- }
-}
-
-struct tracer boot_tracer __read_mostly =
-{
- .name = "initcall",
- .init = boot_trace_init,
- .reset = tracing_reset_online_cpus,
- .print_line = initcall_print_line,
-};
-
-void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
-{
- struct ring_buffer_event *event;
- struct trace_boot_call *entry;
- struct trace_array *tr = boot_trace;
-
- if (!tr || !pre_initcalls_finished)
- return;
-
- /* Get its name now since this function could
- * disappear because it is in the .init section.
- */
- sprint_symbol(bt->func, (unsigned long)fn);
- preempt_disable();
-
- event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->boot_call = *bt;
- trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
- preempt_enable();
-}
-
-void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
-{
- struct ring_buffer_event *event;
- struct trace_boot_ret *entry;
- struct trace_array *tr = boot_trace;
-
- if (!tr || !pre_initcalls_finished)
- return;
-
- sprint_symbol(bt->func, (unsigned long)fn);
- preempt_disable();
-
- event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->boot_ret = *bt;
- trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
- preempt_enable();
-}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7a7a9fd249a..8d3538b4ea5 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
struct trace_array *tr = branch_tracer;
struct ring_buffer_event *event;
struct trace_branch *entry;
+ struct ring_buffer *buffer;
unsigned long flags;
int cpu, pc;
const char *p;
@@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
goto out;
pc = preempt_count();
- event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
+ buffer = tr->buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
sizeof(*entry), flags, pc);
if (!event)
goto out;
@@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->line = f->line;
entry->correct = val == expect;
- if (!filter_check_discard(call, entry, tr->buffer, event))
- ring_buffer_unlock_commit(tr->buffer, event);
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
out:
atomic_dec(&tr->data[cpu]->disabled);
@@ -141,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
}
static enum print_line_t trace_branch_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct trace_branch *field;
@@ -165,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
" |\n");
}
+static struct trace_event_functions trace_branch_funcs = {
+ .trace = trace_branch_print,
+};
+
static struct trace_event trace_branch_event = {
.type = TRACE_BRANCH,
- .trace = trace_branch_print,
+ .funcs = &trace_branch_funcs,
};
static struct tracer branch_trace __read_mostly =
@@ -305,8 +311,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
return -1;
if (percent_a > percent_b)
return 1;
- else
- return 0;
+
+ if (a->incorrect < b->incorrect)
+ return -1;
+ if (a->incorrect > b->incorrect)
+ return 1;
+
+ /*
+ * Since the above shows worse (incorrect) cases
+ * first, we continue that by showing best (correct)
+ * cases last.
+ */
+ if (a->correct > b->correct)
+ return -1;
+ if (a->correct < b->correct)
+ return 1;
+
+ return 0;
}
static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f..685a67d55db 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
* Tracer plugins will chose a default from these clocks.
*/
#include <linux/spinlock.h>
+#include <linux/irqflags.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
@@ -20,6 +21,8 @@
#include <linux/ktime.h>
#include <linux/trace_clock.h>
+#include "trace.h"
+
/*
* trace_clock_local(): the simplest and least coherent tracing clock.
*
@@ -28,7 +31,6 @@
*/
u64 notrace trace_clock_local(void)
{
- unsigned long flags;
u64 clock;
/*
@@ -36,9 +38,9 @@ u64 notrace trace_clock_local(void)
* lockless clock. It is not guaranteed to be coherent across
* CPUs, nor across CPU idle events.
*/
- raw_local_irq_save(flags);
+ preempt_disable_notrace();
clock = sched_clock();
- raw_local_irq_restore(flags);
+ preempt_enable_notrace();
return clock;
}
@@ -53,7 +55,7 @@ u64 notrace trace_clock_local(void)
*/
u64 notrace trace_clock(void)
{
- return cpu_clock(raw_smp_processor_id());
+ return local_clock();
}
@@ -66,10 +68,14 @@ u64 notrace trace_clock(void)
* Used by plugins that need globally coherent timestamps.
*/
-static u64 prev_trace_clock_time;
-
-static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+/* keep prev_time and lock in the same cacheline. */
+static struct {
+ u64 prev_time;
+ arch_spinlock_t lock;
+} trace_clock_struct ____cacheline_aligned_in_smp =
+ {
+ .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
+ };
u64 notrace trace_clock_global(void)
{
@@ -77,7 +83,7 @@ u64 notrace trace_clock_global(void)
int this_cpu;
u64 now;
- raw_local_irq_save(flags);
+ local_irq_save(flags);
this_cpu = raw_smp_processor_id();
now = cpu_clock(this_cpu);
@@ -88,22 +94,22 @@ u64 notrace trace_clock_global(void)
if (unlikely(in_nmi()))
goto out;
- __raw_spin_lock(&trace_clock_lock);
+ arch_spin_lock(&trace_clock_struct.lock);
/*
* TODO: if this happens often then maybe we should reset
- * my_scd->clock to prev_trace_clock_time+1, to make sure
+ * my_scd->clock to prev_time+1, to make sure
* we start ticking with the local clock from now on?
*/
- if ((s64)(now - prev_trace_clock_time) < 0)
- now = prev_trace_clock_time + 1;
+ if ((s64)(now - trace_clock_struct.prev_time) < 0)
+ now = trace_clock_struct.prev_time + 1;
- prev_trace_clock_time = now;
+ trace_clock_struct.prev_time = now;
- __raw_spin_unlock(&trace_clock_lock);
+ arch_spin_unlock(&trace_clock_struct.lock);
out:
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);
return now;
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 00000000000..e3dfecaf13e
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,276 @@
+/*
+ * This file defines the trace event structures that go into the ring
+ * buffer directly. They are created via macros so that changes for them
+ * appear in the format file. Using macros will automate this process.
+ *
+ * The macro used to create a ftrace data structure is:
+ *
+ * FTRACE_ENTRY( name, struct_name, id, structure, print )
+ *
+ * @name: the name used the event name, as well as the name of
+ * the directory that holds the format file.
+ *
+ * @struct_name: the name of the structure that is created.
+ *
+ * @id: The event identifier that is used to detect what event
+ * this is from the ring buffer.
+ *
+ * @structure: the structure layout
+ *
+ * - __field( type, item )
+ * This is equivalent to declaring
+ * type item;
+ * in the structure.
+ * - __array( type, item, size )
+ * This is equivalent to declaring
+ * type item[size];
+ * in the structure.
+ *
+ * * for structures within structures, the format of the internal
+ * structure is layed out. This allows the internal structure
+ * to be deciphered for the format file. Although these macros
+ * may become out of sync with the internal structure, they
+ * will create a compile error if it happens. Since the
+ * internel structures are just tracing helpers, this is not
+ * an issue.
+ *
+ * When an internal structure is used, it should use:
+ *
+ * __field_struct( type, item )
+ *
+ * instead of __field. This will prevent it from being shown in
+ * the output file. The fields in the structure should use.
+ *
+ * __field_desc( type, container, item )
+ * __array_desc( type, container, item, len )
+ *
+ * type, item and len are the same as __field and __array, but
+ * container is added. This is the name of the item in
+ * __field_struct that this is describing.
+ *
+ *
+ * @print: the print format shown to users in the format file.
+ */
+
+/*
+ * Function trace entry - function address and parent function addres:
+ */
+FTRACE_ENTRY(function, ftrace_entry,
+
+ TRACE_FN,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( unsigned long, parent_ip )
+ ),
+
+ F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
+);
+
+/* Function call entry */
+FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+
+ TRACE_GRAPH_ENT,
+
+ F_STRUCT(
+ __field_struct( struct ftrace_graph_ent, graph_ent )
+ __field_desc( unsigned long, graph_ent, func )
+ __field_desc( int, graph_ent, depth )
+ ),
+
+ F_printk("--> %lx (%d)", __entry->func, __entry->depth)
+);
+
+/* Function return entry */
+FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+
+ TRACE_GRAPH_RET,
+
+ F_STRUCT(
+ __field_struct( struct ftrace_graph_ret, ret )
+ __field_desc( unsigned long, ret, func )
+ __field_desc( unsigned long long, ret, calltime)
+ __field_desc( unsigned long long, ret, rettime )
+ __field_desc( unsigned long, ret, overrun )
+ __field_desc( int, ret, depth )
+ ),
+
+ F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
+ __entry->func, __entry->depth,
+ __entry->calltime, __entry->rettime,
+ __entry->depth)
+);
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ *
+ * This is used for both wakeup and context switches. We only want
+ * to create one structure, but we need two outputs for it.
+ */
+#define FTRACE_CTX_FIELDS \
+ __field( unsigned int, prev_pid ) \
+ __field( unsigned char, prev_prio ) \
+ __field( unsigned char, prev_state ) \
+ __field( unsigned int, next_pid ) \
+ __field( unsigned char, next_prio ) \
+ __field( unsigned char, next_state ) \
+ __field( unsigned int, next_cpu )
+
+FTRACE_ENTRY(context_switch, ctx_switch_entry,
+
+ TRACE_CTX,
+
+ F_STRUCT(
+ FTRACE_CTX_FIELDS
+ ),
+
+ F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
+ __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
+ __entry->next_pid, __entry->next_prio, __entry->next_state,
+ __entry->next_cpu
+ )
+);
+
+/*
+ * FTRACE_ENTRY_DUP only creates the format file, it will not
+ * create another structure.
+ */
+FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
+
+ TRACE_WAKE,
+
+ F_STRUCT(
+ FTRACE_CTX_FIELDS
+ ),
+
+ F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
+ __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
+ __entry->next_pid, __entry->next_prio, __entry->next_state,
+ __entry->next_cpu
+ )
+);
+
+/*
+ * Stack-trace entry:
+ */
+
+#define FTRACE_STACK_ENTRIES 8
+
+FTRACE_ENTRY(kernel_stack, stack_entry,
+
+ TRACE_STACK,
+
+ F_STRUCT(
+ __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
+ ),
+
+ F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+ "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
+ __entry->caller[0], __entry->caller[1], __entry->caller[2],
+ __entry->caller[3], __entry->caller[4], __entry->caller[5],
+ __entry->caller[6], __entry->caller[7])
+);
+
+FTRACE_ENTRY(user_stack, userstack_entry,
+
+ TRACE_USER_STACK,
+
+ F_STRUCT(
+ __field( unsigned int, tgid )
+ __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
+ ),
+
+ F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+ "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
+ __entry->caller[0], __entry->caller[1], __entry->caller[2],
+ __entry->caller[3], __entry->caller[4], __entry->caller[5],
+ __entry->caller[6], __entry->caller[7])
+);
+
+/*
+ * trace_printk entry:
+ */
+FTRACE_ENTRY(bprint, bprint_entry,
+
+ TRACE_BPRINT,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( const char *, fmt )
+ __dynamic_array( u32, buf )
+ ),
+
+ F_printk("%08lx fmt:%p",
+ __entry->ip, __entry->fmt)
+);
+
+FTRACE_ENTRY(print, print_entry,
+
+ TRACE_PRINT,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __dynamic_array( char, buf )
+ ),
+
+ F_printk("%08lx %s",
+ __entry->ip, __entry->buf)
+);
+
+FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
+
+ TRACE_MMIO_RW,
+
+ F_STRUCT(
+ __field_struct( struct mmiotrace_rw, rw )
+ __field_desc( resource_size_t, rw, phys )
+ __field_desc( unsigned long, rw, value )
+ __field_desc( unsigned long, rw, pc )
+ __field_desc( int, rw, map_id )
+ __field_desc( unsigned char, rw, opcode )
+ __field_desc( unsigned char, rw, width )
+ ),
+
+ F_printk("%lx %lx %lx %d %x %x",
+ (unsigned long)__entry->phys, __entry->value, __entry->pc,
+ __entry->map_id, __entry->opcode, __entry->width)
+);
+
+FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
+
+ TRACE_MMIO_MAP,
+
+ F_STRUCT(
+ __field_struct( struct mmiotrace_map, map )
+ __field_desc( resource_size_t, map, phys )
+ __field_desc( unsigned long, map, virt )
+ __field_desc( unsigned long, map, len )
+ __field_desc( int, map, map_id )
+ __field_desc( unsigned char, map, opcode )
+ ),
+
+ F_printk("%lx %lx %lx %d %x",
+ (unsigned long)__entry->phys, __entry->virt, __entry->len,
+ __entry->map_id, __entry->opcode)
+);
+
+
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+
+FTRACE_ENTRY(branch, trace_branch,
+
+ TRACE_BRANCH,
+
+ F_STRUCT(
+ __field( unsigned int, line )
+ __array( char, func, TRACE_FUNC_SIZE+1 )
+ __array( char, file, TRACE_FILE_SIZE+1 )
+ __field( char, correct )
+ ),
+
+ F_printk("%u:%s:%s (%u)",
+ __entry->line,
+ __entry->func, __entry->file, __entry->correct)
+);
+
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
new file mode 100644
index 00000000000..19a359d5e6d
--- /dev/null
+++ b/kernel/trace/trace_event_perf.c
@@ -0,0 +1,216 @@
+/*
+ * trace event based perf event profiling/tracing
+ *
+ * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include "trace.h"
+
+static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
+
+/*
+ * Force it to be aligned to unsigned long to avoid misaligned accesses
+ * suprises
+ */
+typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
+ perf_trace_t;
+
+/* Count the events in use (per event id, not per instance) */
+static int total_ref_count;
+
+static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ /* No tracing, just counting, so no obvious leak */
+ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+ return 0;
+
+ /* Some events are ok to be traced by non-root users... */
+ if (p_event->attach_state == PERF_ATTACH_TASK) {
+ if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
+ return 0;
+ }
+
+ /*
+ * ...otherwise raw tracepoint data can be a severe data leak,
+ * only allow root to have these.
+ */
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return 0;
+}
+
+static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ struct hlist_head __percpu *list;
+ int ret;
+ int cpu;
+
+ ret = perf_trace_event_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+
+ p_event->tp_event = tp_event;
+ if (tp_event->perf_refcount++ > 0)
+ return 0;
+
+ ret = -ENOMEM;
+
+ list = alloc_percpu(struct hlist_head);
+ if (!list)
+ goto fail;
+
+ for_each_possible_cpu(cpu)
+ INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
+
+ tp_event->perf_events = list;
+
+ if (!total_ref_count) {
+ char __percpu *buf;
+ int i;
+
+ for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+ buf = (char __percpu *)alloc_percpu(perf_trace_t);
+ if (!buf)
+ goto fail;
+
+ perf_trace_buf[i] = buf;
+ }
+ }
+
+ ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
+ if (ret)
+ goto fail;
+
+ total_ref_count++;
+ return 0;
+
+fail:
+ if (!total_ref_count) {
+ int i;
+
+ for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
+ }
+ }
+
+ if (!--tp_event->perf_refcount) {
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
+ }
+
+ return ret;
+}
+
+int perf_trace_init(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event;
+ int event_id = p_event->attr.config;
+ int ret = -EINVAL;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ if (tp_event->event.type == event_id &&
+ tp_event->class && tp_event->class->reg &&
+ try_module_get(tp_event->mod)) {
+ ret = perf_trace_event_init(tp_event, p_event);
+ if (ret)
+ module_put(tp_event->mod);
+ break;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+int perf_trace_add(struct perf_event *p_event, int flags)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct hlist_head __percpu *pcpu_list;
+ struct hlist_head *list;
+
+ pcpu_list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!pcpu_list))
+ return -EINVAL;
+
+ if (!(flags & PERF_EF_START))
+ p_event->hw.state = PERF_HES_STOPPED;
+
+ list = this_cpu_ptr(pcpu_list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
+
+ return 0;
+}
+
+void perf_trace_del(struct perf_event *p_event, int flags)
+{
+ hlist_del_rcu(&p_event->hlist_entry);
+}
+
+void perf_trace_destroy(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ int i;
+
+ mutex_lock(&event_mutex);
+ if (--tp_event->perf_refcount > 0)
+ goto out;
+
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
+
+ /*
+ * Ensure our callback won't be called anymore. The buffers
+ * will be freed after that.
+ */
+ tracepoint_synchronize_unregister();
+
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
+
+ if (!--total_ref_count) {
+ for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
+ }
+ }
+out:
+ module_put(tp_event->mod);
+ mutex_unlock(&event_mutex);
+}
+
+__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
+ struct pt_regs *regs, int *rctxp)
+{
+ struct trace_entry *entry;
+ unsigned long flags;
+ char *raw_data;
+ int pc;
+
+ BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
+
+ pc = preempt_count();
+
+ *rctxp = perf_swevent_get_recursion_context();
+ if (*rctxp < 0)
+ return NULL;
+
+ raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
+
+ /* zero the dead bytes from align to not leak stack to user */
+ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+
+ entry = (struct trace_entry *)raw_data;
+ local_save_flags(flags);
+ tracing_generic_entry_update(entry, flags, pc);
+ entry->type = type;
+
+ return raw_data;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
deleted file mode 100644
index 5b5895afecf..00000000000
--- a/kernel/trace/trace_event_profile.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * trace event based perf counter profiling
- *
- * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
- *
- */
-
-#include "trace.h"
-
-int ftrace_profile_enable(int event_id)
-{
- struct ftrace_event_call *event;
- int ret = -EINVAL;
-
- mutex_lock(&event_mutex);
- list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id) {
- ret = event->profile_enable(event);
- break;
- }
- }
- mutex_unlock(&event_mutex);
-
- return ret;
-}
-
-void ftrace_profile_disable(int event_id)
-{
- struct ftrace_event_call *event;
-
- mutex_lock(&event_mutex);
- list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id) {
- event->profile_disable(event);
- break;
- }
- }
- mutex_unlock(&event_mutex);
-}
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 5e32e375134..00000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM ftrace
-
-/*
- * We cheat and use the proto type field as the ID
- * and args as the entry type (minus 'struct')
- */
-TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned long, ip, ip)
- TRACE_FIELD(unsigned long, parent_ip, parent_ip)
- ),
- TP_RAW_FMT(" %lx <-- %lx")
-);
-
-TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
- ftrace_graph_ent_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned long, graph_ent.func, func)
- TRACE_FIELD(int, graph_ent.depth, depth)
- ),
- TP_RAW_FMT("--> %lx (%d)")
-);
-
-TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
- ftrace_graph_ret_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned long, ret.func, func)
- TRACE_FIELD(int, ret.depth, depth)
- ),
- TP_RAW_FMT("<-- %lx (%d)")
-);
-
-TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned int, prev_pid, prev_pid)
- TRACE_FIELD(unsigned char, prev_prio, prev_prio)
- TRACE_FIELD(unsigned char, prev_state, prev_state)
- TRACE_FIELD(unsigned int, next_pid, next_pid)
- TRACE_FIELD(unsigned char, next_prio, next_prio)
- TRACE_FIELD(unsigned char, next_state, next_state)
- TRACE_FIELD(unsigned int, next_cpu, next_cpu)
- ),
- TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
-);
-
-TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned int, prev_pid, prev_pid)
- TRACE_FIELD(unsigned char, prev_prio, prev_prio)
- TRACE_FIELD(unsigned char, prev_state, prev_state)
- TRACE_FIELD(unsigned int, next_pid, next_pid)
- TRACE_FIELD(unsigned char, next_prio, next_prio)
- TRACE_FIELD(unsigned char, next_state, next_state)
- TRACE_FIELD(unsigned int, next_cpu, next_cpu)
- ),
- TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
-);
-
-TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned long, arg1, arg1)
- TRACE_FIELD(unsigned long, arg2, arg2)
- TRACE_FIELD(unsigned long, arg3, arg3)
- ),
- TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
-);
-
-/*
- * Stack-trace entry:
- */
-
-/* #define FTRACE_STACK_ENTRIES 8 */
-
-TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned long, caller[0], stack0)
- TRACE_FIELD(unsigned long, caller[1], stack1)
- TRACE_FIELD(unsigned long, caller[2], stack2)
- TRACE_FIELD(unsigned long, caller[3], stack3)
- TRACE_FIELD(unsigned long, caller[4], stack4)
- TRACE_FIELD(unsigned long, caller[5], stack5)
- TRACE_FIELD(unsigned long, caller[6], stack6)
- TRACE_FIELD(unsigned long, caller[7], stack7)
- ),
- TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
- "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
-);
-
-TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned long, caller[0], stack0)
- TRACE_FIELD(unsigned long, caller[1], stack1)
- TRACE_FIELD(unsigned long, caller[2], stack2)
- TRACE_FIELD(unsigned long, caller[3], stack3)
- TRACE_FIELD(unsigned long, caller[4], stack4)
- TRACE_FIELD(unsigned long, caller[5], stack5)
- TRACE_FIELD(unsigned long, caller[6], stack6)
- TRACE_FIELD(unsigned long, caller[7], stack7)
- ),
- TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
- "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
-);
-
-TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned long, ip, ip)
- TRACE_FIELD(char *, fmt, fmt)
- TRACE_FIELD_ZERO_CHAR(buf)
- ),
- TP_RAW_FMT("%08lx (%d) fmt:%p %s")
-);
-
-TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned long, ip, ip)
- TRACE_FIELD_ZERO_CHAR(buf)
- ),
- TP_RAW_FMT("%08lx (%d) fmt:%p %s")
-);
-
-TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(unsigned int, line, line)
- TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
- TRACE_FUNC_SIZE+1, func)
- TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
- TRACE_FUNC_SIZE+1, file)
- TRACE_FIELD(char, correct, correct)
- ),
- TP_RAW_FMT("%u:%s:%s (%u)")
-);
-
-TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(u64, from, from)
- TRACE_FIELD(u64, to, to)
- ),
- TP_RAW_FMT("from: %llx to: %llx")
-);
-
-TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
- TRACE_STRUCT(
- TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
- TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
- TRACE_FIELD(int, state_data.type, type)
- TRACE_FIELD(int, state_data.state, state)
- ),
- TP_RAW_FMT("%llx->%llx type:%u state:%u")
-);
-
-TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
- TRACE_FIELD(unsigned long, call_site, call_site)
- TRACE_FIELD(const void *, ptr, ptr)
- TRACE_FIELD(size_t, bytes_req, bytes_req)
- TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
- TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
- TRACE_FIELD(int, node, node)
- ),
- TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
- " flags:%x node:%d")
-);
-
-TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
- TRACE_STRUCT(
- TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
- TRACE_FIELD(unsigned long, call_site, call_site)
- TRACE_FIELD(const void *, ptr, ptr)
- ),
- TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
-);
-
-#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b..35fde09b81d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,18 +15,38 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
+#include <linux/slab.h>
#include <linux/delay.h>
+#include <asm/setup.h>
+
#include "trace_output.h"
+#undef TRACE_SYSTEM
#define TRACE_SYSTEM "TRACE_SYSTEM"
DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_storage_mutex);
+EXPORT_SYMBOL_GPL(event_storage_mutex);
+
+char event_storage[EVENT_STORAGE_SIZE];
+EXPORT_SYMBOL_GPL(event_storage);
+
LIST_HEAD(ftrace_events);
+LIST_HEAD(ftrace_common_fields);
-int trace_define_field(struct ftrace_event_call *call, char *type,
- char *name, int offset, int size, int is_signed)
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call)
+{
+ if (!event_call->class->get_fields)
+ return &event_call->class->fields;
+ return event_call->class->get_fields(event_call);
+}
+
+static int __trace_define_field(struct list_head *head, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
{
struct ftrace_event_field *field;
@@ -42,31 +62,72 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
if (!field->type)
goto err;
+ if (filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(type);
+ else
+ field->filter_type = filter_type;
+
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
- list_add(&field->link, &call->fields);
+
+ list_add(&field->link, head);
return 0;
err:
- if (field) {
+ if (field)
kfree(field->name);
- kfree(field->type);
- }
kfree(field);
return -ENOMEM;
}
+
+int trace_define_field(struct ftrace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type)
+{
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+ is_signed, filter_type);
+}
EXPORT_SYMBOL_GPL(trace_define_field);
-#ifdef CONFIG_MODULES
+#define __common_field(type, item) \
+ ret = __trace_define_field(&ftrace_common_fields, #type, \
+ "common_" #item, \
+ offsetof(typeof(ent), item), \
+ sizeof(ent.item), \
+ is_signed_type(type), FILTER_OTHER); \
+ if (ret) \
+ return ret;
+
+static int trace_define_common_fields(void)
+{
+ int ret;
+ struct trace_entry ent;
+
+ __common_field(unsigned short, type);
+ __common_field(unsigned char, flags);
+ __common_field(unsigned char, preempt_count);
+ __common_field(int, pid);
+ __common_field(int, lock_depth);
+
+ return ret;
+}
-static void trace_destroy_fields(struct ftrace_event_call *call)
+void trace_destroy_fields(struct ftrace_event_call *call)
{
struct ftrace_event_field *field, *next;
+ struct list_head *head;
- list_for_each_entry_safe(field, next, &call->fields, link) {
+ head = trace_get_fields(call);
+ list_for_each_entry_safe(field, next, head, link) {
list_del(&field->link);
kfree(field->type);
kfree(field->name);
@@ -74,27 +135,102 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
}
}
-#endif /* CONFIG_MODULES */
+int trace_event_raw_init(struct ftrace_event_call *call)
+{
+ int id;
+
+ id = register_ftrace_event(&call->event);
+ if (!id)
+ return -ENODEV;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(trace_event_raw_init);
+
+int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return tracepoint_probe_register(call->name,
+ call->class->probe,
+ call);
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->name,
+ call->class->probe,
+ call);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return tracepoint_probe_register(call->name,
+ call->class->perf_probe,
+ call);
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->name,
+ call->class->perf_probe,
+ call);
+ return 0;
+#endif
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_event_reg);
+
+void trace_event_enable_cmd_record(bool enable)
+{
+ struct ftrace_event_call *call;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (!(call->flags & TRACE_EVENT_FL_ENABLED))
+ continue;
+
+ if (enable) {
+ tracing_start_cmdline_record();
+ call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+ } else {
+ tracing_stop_cmdline_record();
+ call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+ }
+ }
+ mutex_unlock(&event_mutex);
+}
-static void ftrace_event_enable_disable(struct ftrace_event_call *call,
+static int ftrace_event_enable_disable(struct ftrace_event_call *call,
int enable)
{
+ int ret = 0;
+
switch (enable) {
case 0:
- if (call->enabled) {
- call->enabled = 0;
- tracing_stop_cmdline_record();
- call->unregfunc();
+ if (call->flags & TRACE_EVENT_FL_ENABLED) {
+ call->flags &= ~TRACE_EVENT_FL_ENABLED;
+ if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
+ tracing_stop_cmdline_record();
+ call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+ }
+ call->class->reg(call, TRACE_REG_UNREGISTER);
}
break;
case 1:
- if (!call->enabled) {
- call->enabled = 1;
- tracing_start_cmdline_record();
- call->regfunc();
+ if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
+ if (trace_flags & TRACE_ITER_RECORD_CMD) {
+ tracing_start_cmdline_record();
+ call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+ }
+ ret = call->class->reg(call, TRACE_REG_REGISTER);
+ if (ret) {
+ tracing_stop_cmdline_record();
+ pr_info("event trace: Could not enable event "
+ "%s\n", call->name);
+ break;
+ }
+ call->flags |= TRACE_EVENT_FL_ENABLED;
}
break;
}
+
+ return ret;
}
static void ftrace_clear_events(void)
@@ -120,15 +256,15 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->regfunc)
+ if (!call->name || !call->class || !call->class->reg)
continue;
if (match &&
strcmp(match, call->name) != 0 &&
- strcmp(match, call->system) != 0)
+ strcmp(match, call->class->system) != 0)
continue;
- if (sub && strcmp(sub, call->system) != 0)
+ if (sub && strcmp(sub, call->class->system) != 0)
continue;
if (event && strcmp(event, call->name) != 0)
@@ -198,73 +334,38 @@ static ssize_t
ftrace_event_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- size_t read = 0;
- int i, set = 1;
- ssize_t ret;
- char *buf;
- char ch;
+ struct trace_parser parser;
+ ssize_t read, ret;
- if (!cnt || cnt < 0)
+ if (!cnt)
return 0;
ret = tracing_update_buffers();
if (ret < 0)
return ret;
- ret = get_user(ch, ubuf++);
- if (ret)
- return ret;
- read++;
- cnt--;
-
- /* skip white space */
- while (cnt && isspace(ch)) {
- ret = get_user(ch, ubuf++);
- if (ret)
- return ret;
- read++;
- cnt--;
- }
-
- /* Only white space found? */
- if (isspace(ch)) {
- file->f_pos += read;
- ret = read;
- return ret;
- }
-
- buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
- if (!buf)
+ if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
return -ENOMEM;
- if (cnt > EVENT_BUF_SIZE)
- cnt = EVENT_BUF_SIZE;
+ read = trace_get_user(&parser, ubuf, cnt, ppos);
- i = 0;
- while (cnt && !isspace(ch)) {
- if (!i && ch == '!')
+ if (read >= 0 && trace_parser_loaded((&parser))) {
+ int set = 1;
+
+ if (*parser.buffer == '!')
set = 0;
- else
- buf[i++] = ch;
- ret = get_user(ch, ubuf++);
+ parser.buffer[parser.idx] = 0;
+
+ ret = ftrace_set_clr_event(parser.buffer + !set, set);
if (ret)
- goto out_free;
- read++;
- cnt--;
+ goto out_put;
}
- buf[i] = 0;
-
- file->f_pos += read;
-
- ret = ftrace_set_clr_event(buf, set);
- if (ret)
- goto out_free;
ret = read;
- out_free:
- kfree(buf);
+ out_put:
+ trace_parser_put(&parser);
return ret;
}
@@ -272,78 +373,75 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct list_head *list = m->private;
- struct ftrace_event_call *call;
+ struct ftrace_event_call *call = v;
(*pos)++;
- for (;;) {
- if (list == &ftrace_events)
- return NULL;
-
- call = list_entry(list, struct ftrace_event_call, list);
-
+ list_for_each_entry_continue(call, &ftrace_events, list) {
/*
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
- if (call->regfunc)
- break;
-
- list = list->next;
+ if (call->class && call->class->reg)
+ return call;
}
- m->private = list->next;
-
- return call;
+ return NULL;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
+ struct ftrace_event_call *call;
+ loff_t l;
+
mutex_lock(&event_mutex);
- if (*pos == 0)
- m->private = ftrace_events.next;
- return t_next(m, NULL, pos);
+
+ call = list_entry(&ftrace_events, struct ftrace_event_call, list);
+ for (l = 0; l <= *pos; ) {
+ call = t_next(m, call, &l);
+ if (!call)
+ break;
+ }
+ return call;
}
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct list_head *list = m->private;
- struct ftrace_event_call *call;
+ struct ftrace_event_call *call = v;
(*pos)++;
- retry:
- if (list == &ftrace_events)
- return NULL;
-
- call = list_entry(list, struct ftrace_event_call, list);
-
- if (!call->enabled) {
- list = list->next;
- goto retry;
+ list_for_each_entry_continue(call, &ftrace_events, list) {
+ if (call->flags & TRACE_EVENT_FL_ENABLED)
+ return call;
}
- m->private = list->next;
-
- return call;
+ return NULL;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
+ struct ftrace_event_call *call;
+ loff_t l;
+
mutex_lock(&event_mutex);
- if (*pos == 0)
- m->private = ftrace_events.next;
- return s_next(m, NULL, pos);
+
+ call = list_entry(&ftrace_events, struct ftrace_event_call, list);
+ for (l = 0; l <= *pos; ) {
+ call = s_next(m, call, &l);
+ if (!call)
+ break;
+ }
+ return call;
}
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_event_call *call = v;
- if (strcmp(call->system, TRACE_SYSTEM) != 0)
- seq_printf(m, "%s:", call->system);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ seq_printf(m, "%s:", call->class->system);
seq_printf(m, "%s\n", call->name);
return 0;
@@ -360,7 +458,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
const struct seq_operations *seq_ops;
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND))
+ (file->f_flags & O_TRUNC))
ftrace_clear_events();
seq_ops = inode->i_private;
@@ -374,7 +472,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
struct ftrace_event_call *call = filp->private_data;
char *buf;
- if (call->enabled)
+ if (call->flags & TRACE_EVENT_FL_ENABLED)
buf = "1\n";
else
buf = "0\n";
@@ -411,7 +509,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
case 0:
case 1:
mutex_lock(&event_mutex);
- ftrace_event_enable_disable(call, val);
+ ret = ftrace_event_enable_disable(call, val);
mutex_unlock(&event_mutex);
break;
@@ -421,7 +519,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
*ppos += cnt;
- return cnt;
+ return ret ? ret : cnt;
}
static ssize_t
@@ -437,10 +535,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->regfunc)
+ if (!call->name || !call->class || !call->class->reg)
continue;
- if (system && strcmp(call->system, system) != 0)
+ if (system && strcmp(call->class->system, system) != 0)
continue;
/*
@@ -448,7 +546,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
* or if all events or cleared, or if we have
* a mixture.
*/
- set |= (1 << !!call->enabled);
+ set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
/*
* If we have a mixture, no need to look further.
@@ -506,74 +604,146 @@ out:
return ret;
}
-extern char *__bad_type_size(void);
+enum {
+ FORMAT_HEADER = 1,
+ FORMAT_FIELD_SEPERATOR = 2,
+ FORMAT_PRINTFMT = 3,
+};
+
+static void *f_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_event_call *call = m->private;
+ struct ftrace_event_field *field;
+ struct list_head *common_head = &ftrace_common_fields;
+ struct list_head *head = trace_get_fields(call);
+
+ (*pos)++;
+
+ switch ((unsigned long)v) {
+ case FORMAT_HEADER:
+ if (unlikely(list_empty(common_head)))
+ return NULL;
+
+ field = list_entry(common_head->prev,
+ struct ftrace_event_field, link);
+ return field;
+
+ case FORMAT_FIELD_SEPERATOR:
+ if (unlikely(list_empty(head)))
+ return NULL;
+
+ field = list_entry(head->prev, struct ftrace_event_field, link);
+ return field;
-#undef FIELD
-#define FIELD(type, name) \
- sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
- #type, "common_" #name, offsetof(typeof(field), name), \
- sizeof(field.name)
+ case FORMAT_PRINTFMT:
+ /* all done */
+ return NULL;
+ }
+
+ field = v;
+ if (field->link.prev == common_head)
+ return (void *)FORMAT_FIELD_SEPERATOR;
+ else if (field->link.prev == head)
+ return (void *)FORMAT_PRINTFMT;
-static int trace_write_header(struct trace_seq *s)
+ field = list_entry(field->link.prev, struct ftrace_event_field, link);
+
+ return field;
+}
+
+static void *f_start(struct seq_file *m, loff_t *pos)
{
- struct trace_entry field;
-
- /* struct trace_entry */
- return trace_seq_printf(s,
- "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
- "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
- "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
- "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
- "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
- "\n",
- FIELD(unsigned short, type),
- FIELD(unsigned char, flags),
- FIELD(unsigned char, preempt_count),
- FIELD(int, pid),
- FIELD(int, tgid));
+ loff_t l = 0;
+ void *p;
+
+ /* Start by showing the header */
+ if (!*pos)
+ return (void *)FORMAT_HEADER;
+
+ p = (void *)FORMAT_HEADER;
+ do {
+ p = f_next(m, p, &l);
+ } while (p && l < *pos);
+
+ return p;
}
-static ssize_t
-event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
- loff_t *ppos)
+static int f_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = filp->private_data;
- struct trace_seq *s;
- char *buf;
- int r;
+ struct ftrace_event_call *call = m->private;
+ struct ftrace_event_field *field;
+ const char *array_descriptor;
- if (*ppos)
+ switch ((unsigned long)v) {
+ case FORMAT_HEADER:
+ seq_printf(m, "name: %s\n", call->name);
+ seq_printf(m, "ID: %d\n", call->event.type);
+ seq_printf(m, "format:\n");
return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
+ case FORMAT_FIELD_SEPERATOR:
+ seq_putc(m, '\n');
+ return 0;
- trace_seq_init(s);
+ case FORMAT_PRINTFMT:
+ seq_printf(m, "\nprint fmt: %s\n",
+ call->print_fmt);
+ return 0;
+ }
- /* If any of the first writes fail, so will the show_format. */
+ field = v;
- trace_seq_printf(s, "name: %s\n", call->name);
- trace_seq_printf(s, "ID: %d\n", call->id);
- trace_seq_printf(s, "format:\n");
- trace_write_header(s);
+ /*
+ * Smartly shows the array type(except dynamic array).
+ * Normal:
+ * field:TYPE VAR
+ * If TYPE := TYPE[LEN], it is shown:
+ * field:TYPE VAR[LEN]
+ */
+ array_descriptor = strchr(field->type, '[');
- r = call->show_format(s);
- if (!r) {
- /*
- * ug! The format output is bigger than a PAGE!!
- */
- buf = "FORMAT TOO BIG\n";
- r = simple_read_from_buffer(ubuf, cnt, ppos,
- buf, strlen(buf));
- goto out;
- }
+ if (!strncmp(field->type, "__data_loc", 10))
+ array_descriptor = NULL;
- r = simple_read_from_buffer(ubuf, cnt, ppos,
- s->buffer, s->len);
- out:
- kfree(s);
- return r;
+ if (!array_descriptor)
+ seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ field->type, field->name, field->offset,
+ field->size, !!field->is_signed);
+ else
+ seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ (int)(array_descriptor - field->type),
+ field->type, field->name,
+ array_descriptor, field->offset,
+ field->size, !!field->is_signed);
+
+ return 0;
+}
+
+static void f_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations trace_format_seq_ops = {
+ .start = f_start,
+ .next = f_next,
+ .stop = f_stop,
+ .show = f_show,
+};
+
+static int trace_format_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_event_call *call = inode->i_private;
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &trace_format_seq_ops);
+ if (ret < 0)
+ return ret;
+
+ m = file->private_data;
+ m->private = call;
+
+ return 0;
}
static ssize_t
@@ -591,7 +761,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return -ENOMEM;
trace_seq_init(s);
- trace_seq_printf(s, "%d\n", call->id);
+ trace_seq_printf(s, "%d\n", call->event.type);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
@@ -768,39 +938,47 @@ static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.read = event_enable_read,
.write = event_enable_write,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_event_format_fops = {
- .open = tracing_open_generic,
- .read = event_format_read,
+ .open = trace_format_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
};
static const struct file_operations ftrace_event_id_fops = {
.open = tracing_open_generic,
.read = event_id_read,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_generic,
.read = event_filter_read,
.write = event_filter_write,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_subsystem_filter_fops = {
.open = tracing_open_generic,
.read = subsystem_filter_read,
.write = subsystem_filter_write,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_system_enable_fops = {
.open = tracing_open_generic,
.read = system_enable_read,
.write = system_enable_write,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_show_header_fops = {
.open = tracing_open_generic,
.read = show_header,
+ .llseek = default_llseek,
};
static struct dentry *event_trace_events_dir(void)
@@ -833,8 +1011,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
/* First see if we did not already create this dir */
list_for_each_entry(system, &event_subsystems, list) {
- if (strcmp(system->name, name) == 0)
+ if (strcmp(system->name, name) == 0) {
+ system->nr_events++;
return system->entry;
+ }
}
/* need to create new entry */
@@ -853,6 +1033,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
return d_events;
}
+ system->nr_events = 1;
system->name = kstrdup(name, GFP_KERNEL);
if (!system->name) {
debugfs_remove(system->entry);
@@ -880,9 +1061,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
"'%s/filter' entry\n", name);
}
- entry = trace_create_file("enable", 0644, system->entry,
- (void *)system->name,
- &ftrace_system_enable_fops);
+ trace_create_file("enable", 0644, system->entry,
+ (void *)system->name,
+ &ftrace_system_enable_fops);
return system->entry;
}
@@ -894,24 +1075,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
const struct file_operations *filter,
const struct file_operations *format)
{
- struct dentry *entry;
+ struct list_head *head;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
- if (strcmp(call->system, TRACE_SYSTEM) != 0)
- d_events = event_subsystem_dir(call->system, d_events);
-
- if (call->raw_init) {
- ret = call->raw_init();
- if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
- return ret;
- }
- }
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ d_events = event_subsystem_dir(call->class->system, d_events);
call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
@@ -920,35 +1092,138 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return -1;
}
- if (call->regfunc)
- entry = trace_create_file("enable", 0644, call->dir, call,
- enable);
+ if (call->class->reg)
+ trace_create_file("enable", 0644, call->dir, call,
+ enable);
- if (call->id)
- entry = trace_create_file("id", 0444, call->dir, call,
- id);
+#ifdef CONFIG_PERF_EVENTS
+ if (call->event.type && call->class->reg)
+ trace_create_file("id", 0444, call->dir, call,
+ id);
+#endif
- if (call->define_fields) {
- ret = call->define_fields();
+ /*
+ * Other events may have the same class. Only update
+ * the fields if they are not already defined.
+ */
+ head = trace_get_fields(call);
+ if (list_empty(head)) {
+ ret = call->class->define_fields(call);
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
return ret;
}
- entry = trace_create_file("filter", 0644, call->dir, call,
- filter);
}
+ trace_create_file("filter", 0644, call->dir, call,
+ filter);
- /* A trace may not want to export its format */
- if (!call->show_format)
- return 0;
-
- entry = trace_create_file("format", 0444, call->dir, call,
- format);
+ trace_create_file("format", 0444, call->dir, call,
+ format);
return 0;
}
+static int
+__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
+ const struct file_operations *id,
+ const struct file_operations *enable,
+ const struct file_operations *filter,
+ const struct file_operations *format)
+{
+ struct dentry *d_events;
+ int ret;
+
+ /* The linker may leave blanks */
+ if (!call->name)
+ return -EINVAL;
+
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
+ if (ret < 0) {
+ if (ret != -ENOSYS)
+ pr_warning("Could not initialize trace events/%s\n",
+ call->name);
+ return ret;
+ }
+ }
+
+ d_events = event_trace_events_dir();
+ if (!d_events)
+ return -ENOENT;
+
+ ret = event_create_dir(call, d_events, id, enable, filter, format);
+ if (!ret)
+ list_add(&call->list, &ftrace_events);
+ call->mod = mod;
+
+ return ret;
+}
+
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct ftrace_event_call *call)
+{
+ int ret;
+ mutex_lock(&event_mutex);
+ ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
+ &ftrace_enable_fops,
+ &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+ mutex_unlock(&event_mutex);
+ return ret;
+}
+
+static void remove_subsystem_dir(const char *name)
+{
+ struct event_subsystem *system;
+
+ if (strcmp(name, TRACE_SYSTEM) == 0)
+ return;
+
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (strcmp(system->name, name) == 0) {
+ if (!--system->nr_events) {
+ struct event_filter *filter = system->filter;
+
+ debugfs_remove_recursive(system->entry);
+ list_del(&system->list);
+ if (filter) {
+ kfree(filter->filter_string);
+ kfree(filter);
+ }
+ kfree(system->name);
+ kfree(system);
+ }
+ break;
+ }
+ }
+}
+
+/*
+ * Must be called under locking both of event_mutex and trace_event_mutex.
+ */
+static void __trace_remove_event_call(struct ftrace_event_call *call)
+{
+ ftrace_event_enable_disable(call, 0);
+ if (call->event.funcs)
+ __unregister_ftrace_event(&call->event);
+ debugfs_remove_recursive(call->dir);
+ list_del(&call->list);
+ trace_destroy_fields(call);
+ destroy_preds(call);
+ remove_subsystem_dir(call->class->system);
+}
+
+/* Remove an event_call */
+void trace_remove_event_call(struct ftrace_event_call *call)
+{
+ mutex_lock(&event_mutex);
+ down_write(&trace_event_mutex);
+ __trace_remove_event_call(call);
+ up_write(&trace_event_mutex);
+ mutex_unlock(&event_mutex);
+}
+
#define for_each_event(event, start, end) \
for (event = start; \
(unsigned long)event < (unsigned long)end; \
@@ -1010,7 +1285,6 @@ static void trace_module_add_events(struct module *mod)
{
struct ftrace_module_file_ops *file_ops = NULL;
struct ftrace_event_call *call, *start, *end;
- struct dentry *d_events;
start = mod->trace_events;
end = mod->trace_events + mod->num_trace_events;
@@ -1018,29 +1292,14 @@ static void trace_module_add_events(struct module *mod)
if (start == end)
return;
- d_events = event_trace_events_dir();
- if (!d_events)
+ file_ops = trace_create_file_ops(mod);
+ if (!file_ops)
return;
for_each_event(call, start, end) {
- /* The linker may leave blanks */
- if (!call->name)
- continue;
-
- /*
- * This module has events, create file ops for this module
- * if not already done.
- */
- if (!file_ops) {
- file_ops = trace_create_file_ops(mod);
- if (!file_ops)
- return;
- }
- call->mod = mod;
- list_add(&call->list, &ftrace_events);
- event_create_dir(call, d_events,
- &file_ops->id, &file_ops->enable,
- &file_ops->filter, &file_ops->format);
+ __trace_add_event_call(call, mod,
+ &file_ops->id, &file_ops->enable,
+ &file_ops->filter, &file_ops->format);
}
}
@@ -1054,13 +1313,7 @@ static void trace_module_remove_events(struct module *mod)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
if (call->mod == mod) {
found = true;
- ftrace_event_enable_disable(call, 0);
- if (call->event)
- __unregister_ftrace_event(call->event);
- debugfs_remove_recursive(call->dir);
- list_del(&call->list);
- trace_destroy_fields(call);
- destroy_preds(call);
+ __trace_remove_event_call(call);
}
}
@@ -1109,7 +1362,7 @@ static int trace_module_notify(struct notifier_block *self,
}
#endif /* CONFIG_MODULES */
-struct notifier_block trace_module_nb = {
+static struct notifier_block trace_module_nb = {
.notifier_call = trace_module_notify,
.priority = 0,
};
@@ -1117,6 +1370,18 @@ struct notifier_block trace_module_nb = {
extern struct ftrace_event_call __start_ftrace_events[];
extern struct ftrace_event_call __stop_ftrace_events[];
+static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
+
+static __init int setup_trace_event(char *str)
+{
+ strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ ring_buffer_expanded = 1;
+ tracing_selftest_disabled = 1;
+
+ return 1;
+}
+__setup("trace_event=", setup_trace_event);
+
static __init int event_trace_init(void)
{
struct ftrace_event_call *call;
@@ -1124,6 +1389,8 @@ static __init int event_trace_init(void)
struct dentry *entry;
struct dentry *d_events;
int ret;
+ char *buf = bootup_event_buf;
+ char *token;
d_tracer = tracing_init_dentry();
if (!d_tracer)
@@ -1159,14 +1426,27 @@ static __init int event_trace_init(void)
trace_create_file("enable", 0644, d_events,
NULL, &ftrace_system_enable_fops);
+ if (trace_define_common_fields())
+ pr_warning("tracing: Failed to allocate common fields");
+
for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
- /* The linker may leave blanks */
- if (!call->name)
+ __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
+ &ftrace_enable_fops,
+ &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+ }
+
+ while (true) {
+ token = strsep(&buf, ",");
+
+ if (!token)
+ break;
+ if (!*token)
continue;
- list_add(&call->list, &ftrace_events);
- event_create_dir(call, d_events, &ftrace_event_id_fops,
- &ftrace_enable_fops, &ftrace_event_filter_fops,
- &ftrace_event_format_fops);
+
+ ret = ftrace_set_clr_event(token, 1);
+ if (ret)
+ pr_warning("Failed to enable trace event: %s\n", token);
}
ret = register_module_notifier(&trace_module_nb);
@@ -1241,17 +1521,29 @@ static __init void event_trace_self_tests(void)
list_for_each_entry(call, &ftrace_events, list) {
- /* Only test those that have a regfunc */
- if (!call->regfunc)
+ /* Only test those that have a probe */
+ if (!call->class || !call->class->probe)
continue;
+/*
+ * Testing syscall events here is pretty useless, but
+ * we still do it if configured. But this is time consuming.
+ * What we really need is a user thread to perform the
+ * syscalls as we test.
+ */
+#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
+ if (call->class->system &&
+ strcmp(call->class->system, "syscalls") == 0)
+ continue;
+#endif
+
pr_info("Testing event %s: ", call->name);
/*
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
- if (call->enabled) {
+ if (call->flags & TRACE_EVENT_FL_ENABLED) {
pr_warning("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
@@ -1318,30 +1610,31 @@ static __init void event_trace_self_tests(void)
#ifdef CONFIG_FUNCTION_TRACER
-static DEFINE_PER_CPU(atomic_t, test_event_disable);
+static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
static void
function_test_events_call(unsigned long ip, unsigned long parent_ip)
{
struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
struct ftrace_entry *entry;
unsigned long flags;
long disabled;
- int resched;
int cpu;
int pc;
pc = preempt_count();
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+ disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
if (disabled != 1)
goto out;
local_save_flags(flags);
- event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+ event = trace_current_buffer_lock_reserve(&buffer,
+ TRACE_FN, sizeof(*entry),
flags, pc);
if (!event)
goto out;
@@ -1349,11 +1642,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
entry->ip = ip;
entry->parent_ip = parent_ip;
- trace_nowake_buffer_unlock_commit(event, flags, pc);
+ trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
out:
- atomic_dec(&per_cpu(test_event_disable, cpu));
- ftrace_preempt_enable(resched);
+ atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_ops __initdata =
@@ -1376,10 +1669,10 @@ static __init void event_trace_self_test_with_function(void)
static __init int event_trace_self_tests_init(void)
{
-
- event_trace_self_tests();
-
- event_trace_self_test_with_function();
+ if (!tracing_selftest_disabled) {
+ event_trace_self_tests();
+ event_trace_self_test_with_function();
+ }
return 0;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 936c621bbf4..36d40104b17 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,11 @@
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*/
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
+#include <linux/perf_event.h>
+#include <linux/slab.h>
#include "trace.h"
#include "trace_output.h"
@@ -31,6 +31,7 @@ enum filter_op_ids
{
OP_OR,
OP_AND,
+ OP_GLOB,
OP_NE,
OP_EQ,
OP_LT,
@@ -48,16 +49,17 @@ struct filter_op {
};
static struct filter_op filter_ops[] = {
- { OP_OR, "||", 1 },
- { OP_AND, "&&", 2 },
- { OP_NE, "!=", 4 },
- { OP_EQ, "==", 4 },
- { OP_LT, "<", 5 },
- { OP_LE, "<=", 5 },
- { OP_GT, ">", 5 },
- { OP_GE, ">=", 5 },
- { OP_NONE, "OP_NONE", 0 },
- { OP_OPEN_PAREN, "(", 0 },
+ { OP_OR, "||", 1 },
+ { OP_AND, "&&", 2 },
+ { OP_GLOB, "~", 4 },
+ { OP_NE, "!=", 4 },
+ { OP_EQ, "==", 4 },
+ { OP_LT, "<", 5 },
+ { OP_LE, "<=", 5 },
+ { OP_GT, ">", 5 },
+ { OP_GE, ">=", 5 },
+ { OP_NONE, "OP_NONE", 0 },
+ { OP_OPEN_PAREN, "(", 0 },
};
enum {
@@ -121,6 +123,47 @@ struct filter_parse_state {
} operand;
};
+#define DEFINE_COMPARISON_PRED(type) \
+static int filter_pred_##type(struct filter_pred *pred, void *event, \
+ int val1, int val2) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = 0; \
+ \
+ switch (pred->op) { \
+ case OP_LT: \
+ match = (*addr < val); \
+ break; \
+ case OP_LE: \
+ match = (*addr <= val); \
+ break; \
+ case OP_GT: \
+ match = (*addr > val); \
+ break; \
+ case OP_GE: \
+ match = (*addr >= val); \
+ break; \
+ default: \
+ break; \
+ } \
+ \
+ return match; \
+}
+
+#define DEFINE_EQUALITY_PRED(size) \
+static int filter_pred_##size(struct filter_pred *pred, void *event, \
+ int val1, int val2) \
+{ \
+ u##size *addr = (u##size *)(event + pred->offset); \
+ u##size val = (u##size)pred->val; \
+ int match; \
+ \
+ match = (val == *addr) ^ pred->not; \
+ \
+ return match; \
+}
+
DEFINE_COMPARISON_PRED(s64);
DEFINE_COMPARISON_PRED(u64);
DEFINE_COMPARISON_PRED(s32);
@@ -156,9 +199,24 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
char *addr = (char *)(event + pred->offset);
int cmp, match;
- cmp = strncmp(addr, pred->str_val, pred->str_len);
+ cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
- match = (!cmp) ^ pred->not;
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
+/* Filter predicate for char * pointers */
+static int filter_pred_pchar(struct filter_pred *pred, void *event,
+ int val1, int val2)
+{
+ char **addr = (char **)(event + pred->offset);
+ int cmp, match;
+ int len = strlen(*addr) + 1; /* including tailing '\0' */
+
+ cmp = pred->regex.match(*addr, &pred->regex, len);
+
+ match = cmp ^ pred->not;
return match;
}
@@ -176,13 +234,15 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
static int filter_pred_strloc(struct filter_pred *pred, void *event,
int val1, int val2)
{
- unsigned short str_loc = *(unsigned short *)(event + pred->offset);
+ u32 str_item = *(u32 *)(event + pred->offset);
+ int str_loc = str_item & 0xffff;
+ int str_len = str_item >> 16;
char *addr = (char *)(event + str_loc);
int cmp, match;
- cmp = strncmp(addr, pred->str_val, pred->str_len);
+ cmp = pred->regex.match(addr, &pred->regex, str_len);
- match = (!cmp) ^ pred->not;
+ match = cmp ^ pred->not;
return match;
}
@@ -193,10 +253,133 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
return 0;
}
+/*
+ * regex_match_foo - Basic regex callbacks
+ *
+ * @str: the string to be searched
+ * @r: the regex structure containing the pattern string
+ * @len: the length of the string to be searched (including '\0')
+ *
+ * Note:
+ * - @str might not be NULL-terminated if it's of type DYN_STRING
+ * or STATIC_STRING
+ */
+
+static int regex_match_full(char *str, struct regex *r, int len)
+{
+ if (strncmp(str, r->pattern, len) == 0)
+ return 1;
+ return 0;
+}
+
+static int regex_match_front(char *str, struct regex *r, int len)
+{
+ if (strncmp(str, r->pattern, r->len) == 0)
+ return 1;
+ return 0;
+}
+
+static int regex_match_middle(char *str, struct regex *r, int len)
+{
+ if (strnstr(str, r->pattern, len))
+ return 1;
+ return 0;
+}
+
+static int regex_match_end(char *str, struct regex *r, int len)
+{
+ int strlen = len - 1;
+
+ if (strlen >= r->len &&
+ memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
+ return 1;
+ return 0;
+}
+
+/**
+ * filter_parse_regex - parse a basic regex
+ * @buff: the raw regex
+ * @len: length of the regex
+ * @search: will point to the beginning of the string to compare
+ * @not: tell whether the match will have to be inverted
+ *
+ * This passes in a buffer containing a regex and this function will
+ * set search to point to the search part of the buffer and
+ * return the type of search it is (see enum above).
+ * This does modify buff.
+ *
+ * Returns enum type.
+ * search returns the pointer to use for comparison.
+ * not returns 1 if buff started with a '!'
+ * 0 otherwise.
+ */
+enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
+{
+ int type = MATCH_FULL;
+ int i;
+
+ if (buff[0] == '!') {
+ *not = 1;
+ buff++;
+ len--;
+ } else
+ *not = 0;
+
+ *search = buff;
+
+ for (i = 0; i < len; i++) {
+ if (buff[i] == '*') {
+ if (!i) {
+ *search = buff + 1;
+ type = MATCH_END_ONLY;
+ } else {
+ if (type == MATCH_END_ONLY)
+ type = MATCH_MIDDLE_ONLY;
+ else
+ type = MATCH_FRONT_ONLY;
+ buff[i] = 0;
+ break;
+ }
+ }
+ }
+
+ return type;
+}
+
+static void filter_build_regex(struct filter_pred *pred)
+{
+ struct regex *r = &pred->regex;
+ char *search;
+ enum regex_type type = MATCH_FULL;
+ int not = 0;
+
+ if (pred->op == OP_GLOB) {
+ type = filter_parse_regex(r->pattern, r->len, &search, &not);
+ r->len = strlen(search);
+ memmove(r->pattern, search, r->len+1);
+ }
+
+ switch (type) {
+ case MATCH_FULL:
+ r->match = regex_match_full;
+ break;
+ case MATCH_FRONT_ONLY:
+ r->match = regex_match_front;
+ break;
+ case MATCH_MIDDLE_ONLY:
+ r->match = regex_match_middle;
+ break;
+ case MATCH_END_ONLY:
+ r->match = regex_match_end;
+ break;
+ }
+
+ pred->not ^= not;
+}
+
/* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+int filter_match_preds(struct event_filter *filter, void *rec)
{
- struct event_filter *filter = call->filter;
int match, top = 0, val1 = 0, val2 = 0;
int stack[MAX_FILTER_PRED];
struct filter_pred *pred;
@@ -293,7 +476,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
struct event_filter *filter = call->filter;
mutex_lock(&event_mutex);
- if (filter->filter_string)
+ if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
trace_seq_printf(s, "none\n");
@@ -306,7 +489,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
struct event_filter *filter = system->filter;
mutex_lock(&event_mutex);
- if (filter->filter_string)
+ if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
trace_seq_printf(s, "none\n");
@@ -314,11 +497,11 @@ void print_subsystem_event_filter(struct event_subsystem *system,
}
static struct ftrace_event_field *
-find_event_field(struct ftrace_event_call *call, char *name)
+__find_event_field(struct list_head *head, char *name)
{
struct ftrace_event_field *field;
- list_for_each_entry(field, &call->fields, link) {
+ list_for_each_entry(field, head, link) {
if (!strcmp(field->name, name))
return field;
}
@@ -326,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name)
return NULL;
}
+static struct ftrace_event_field *
+find_event_field(struct ftrace_event_call *call, char *name)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ field = __find_event_field(&ftrace_common_fields, name);
+ if (field)
+ return field;
+
+ head = trace_get_fields(call);
+ return __find_event_field(head, name);
+}
+
static void filter_free_pred(struct filter_pred *pred)
{
if (!pred)
@@ -339,7 +536,7 @@ static void filter_clear_pred(struct filter_pred *pred)
{
kfree(pred->field_name);
pred->field_name = NULL;
- pred->str_len = 0;
+ pred->regex.len = 0;
}
static int filter_set_pred(struct filter_pred *dest,
@@ -362,18 +559,20 @@ static void filter_disable_preds(struct ftrace_event_call *call)
struct event_filter *filter = call->filter;
int i;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
filter->n_preds = 0;
for (i = 0; i < MAX_FILTER_PRED; i++)
filter->preds[i]->fn = filter_pred_none;
}
-void destroy_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
{
- struct event_filter *filter = call->filter;
int i;
+ if (!filter)
+ return;
+
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (filter->preds[i])
filter_free_pred(filter->preds[i]);
@@ -381,20 +580,25 @@ void destroy_preds(struct ftrace_event_call *call)
kfree(filter->preds);
kfree(filter->filter_string);
kfree(filter);
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+ __free_preds(call->filter);
call->filter = NULL;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
-int init_preds(struct ftrace_event_call *call)
+static struct event_filter *__alloc_preds(void)
{
struct event_filter *filter;
struct filter_pred *pred;
int i;
- filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
- if (!call->filter)
- return -ENOMEM;
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return ERR_PTR(-ENOMEM);
- call->filter_active = 0;
filter->n_preds = 0;
filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
@@ -409,46 +613,62 @@ int init_preds(struct ftrace_event_call *call)
filter->preds[i] = pred;
}
- return 0;
+ return filter;
oom:
- destroy_preds(call);
+ __free_preds(filter);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int init_preds(struct ftrace_event_call *call)
+{
+ if (call->filter)
+ return 0;
+
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
+ call->filter = __alloc_preds();
+ if (IS_ERR(call->filter))
+ return PTR_ERR(call->filter);
- return -ENOMEM;
+ return 0;
}
-EXPORT_SYMBOL_GPL(init_preds);
-static void filter_free_subsystem_preds(struct event_subsystem *system)
+static int init_subsystem_preds(struct event_subsystem *system)
{
- struct event_filter *filter = system->filter;
struct ftrace_event_call *call;
- int i;
+ int err;
- if (filter->n_preds) {
- for (i = 0; i < filter->n_preds; i++)
- filter_free_pred(filter->preds[i]);
- kfree(filter->preds);
- filter->preds = NULL;
- filter->n_preds = 0;
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (strcmp(call->class->system, system->name) != 0)
+ continue;
+
+ err = init_preds(call);
+ if (err)
+ return err;
}
+ return 0;
+}
+
+static void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+ struct ftrace_event_call *call;
+
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->define_fields)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
- if (!strcmp(call->system, system->name)) {
- filter_disable_preds(call);
- remove_filter_string(call->filter);
- }
+ filter_disable_preds(call);
+ remove_filter_string(call->filter);
}
}
static int filter_add_pred_fn(struct filter_parse_state *ps,
struct ftrace_event_call *call,
+ struct event_filter *filter,
struct filter_pred *pred,
filter_pred_fn_t fn)
{
- struct event_filter *filter = call->filter;
int idx, err;
if (filter->n_preds == MAX_FILTER_PRED) {
@@ -463,17 +683,11 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
return err;
filter->n_preds++;
- call->filter_active = 1;
return 0;
}
-enum {
- FILTER_STATIC_STRING = 1,
- FILTER_DYN_STRING
-};
-
-static int is_string_field(const char *type)
+int filter_assign_type(const char *type)
{
if (strstr(type, "__data_loc") && strstr(type, "char"))
return FILTER_DYN_STRING;
@@ -481,12 +695,22 @@ static int is_string_field(const char *type)
if (strchr(type, '[') && strstr(type, "char"))
return FILTER_STATIC_STRING;
- return 0;
+ return FILTER_OTHER;
+}
+
+static bool is_string_field(struct ftrace_event_field *field)
+{
+ return field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_STATIC_STRING ||
+ field->filter_type == FILTER_PTR_STRING;
}
static int is_legal_op(struct ftrace_event_field *field, int op)
{
- if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
+ if (is_string_field(field) &&
+ (op != OP_EQ && op != OP_NE && op != OP_GLOB))
+ return 0;
+ if (!is_string_field(field) && op == OP_GLOB)
return 0;
return 1;
@@ -537,22 +761,25 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
static int filter_add_pred(struct filter_parse_state *ps,
struct ftrace_event_call *call,
- struct filter_pred *pred)
+ struct event_filter *filter,
+ struct filter_pred *pred,
+ bool dry_run)
{
struct ftrace_event_field *field;
filter_pred_fn_t fn;
unsigned long long val;
- int string_type;
int ret;
pred->fn = filter_pred_none;
if (pred->op == OP_AND) {
pred->pop_n = 2;
- return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+ fn = filter_pred_and;
+ goto add_pred_fn;
} else if (pred->op == OP_OR) {
pred->pop_n = 2;
- return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+ fn = filter_pred_or;
+ goto add_pred_fn;
}
field = find_event_field(call, pred->field_name);
@@ -568,83 +795,42 @@ static int filter_add_pred(struct filter_parse_state *ps,
return -EINVAL;
}
- string_type = is_string_field(field->type);
- if (string_type) {
- if (string_type == FILTER_STATIC_STRING)
+ if (is_string_field(field)) {
+ filter_build_regex(pred);
+
+ if (field->filter_type == FILTER_STATIC_STRING) {
fn = filter_pred_string;
- else
+ pred->regex.field_len = field->size;
+ } else if (field->filter_type == FILTER_DYN_STRING)
fn = filter_pred_strloc;
- pred->str_len = field->size;
- if (pred->op == OP_NE)
- pred->not = 1;
- return filter_add_pred_fn(ps, call, pred, fn);
+ else
+ fn = filter_pred_pchar;
} else {
if (field->is_signed)
- ret = strict_strtoll(pred->str_val, 0, &val);
+ ret = strict_strtoll(pred->regex.pattern, 0, &val);
else
- ret = strict_strtoull(pred->str_val, 0, &val);
+ ret = strict_strtoull(pred->regex.pattern, 0, &val);
if (ret) {
parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
return -EINVAL;
}
pred->val = val;
- }
- fn = select_comparison_fn(pred->op, field->size, field->is_signed);
- if (!fn) {
- parse_error(ps, FILT_ERR_INVALID_OP, 0);
- return -EINVAL;
+ fn = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
+ if (!fn) {
+ parse_error(ps, FILT_ERR_INVALID_OP, 0);
+ return -EINVAL;
+ }
}
if (pred->op == OP_NE)
pred->not = 1;
- return filter_add_pred_fn(ps, call, pred, fn);
-}
-
-static int filter_add_subsystem_pred(struct filter_parse_state *ps,
- struct event_subsystem *system,
- struct filter_pred *pred,
- char *filter_string)
-{
- struct event_filter *filter = system->filter;
- struct ftrace_event_call *call;
- int err = 0;
-
- if (!filter->preds) {
- filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
- GFP_KERNEL);
-
- if (!filter->preds)
- return -ENOMEM;
- }
-
- if (filter->n_preds == MAX_FILTER_PRED) {
- parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- return -ENOSPC;
- }
-
- filter->preds[filter->n_preds] = pred;
- filter->n_preds++;
-
- list_for_each_entry(call, &ftrace_events, list) {
-
- if (!call->define_fields)
- continue;
-
- if (strcmp(call->system, system->name))
- continue;
-
- err = filter_add_pred(ps, call, pred);
- if (err) {
- filter_free_subsystem_preds(system);
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- goto out;
- }
- replace_filter_string(call->filter, filter_string);
- }
-out:
- return err;
+add_pred_fn:
+ if (!dry_run)
+ return filter_add_pred_fn(ps, call, filter, pred, fn);
+ return 0;
}
static void parse_init(struct filter_parse_state *ps,
@@ -844,8 +1030,9 @@ static void postfix_clear(struct filter_parse_state *ps)
while (!list_empty(&ps->postfix)) {
elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
- kfree(elt->operand);
list_del(&elt->list);
+ kfree(elt->operand);
+ kfree(elt);
}
}
@@ -955,8 +1142,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
return NULL;
}
- strcpy(pred->str_val, operand2);
- pred->str_len = strlen(operand2);
+ strcpy(pred->regex.pattern, operand2);
+ pred->regex.len = strlen(pred->regex.pattern);
pred->op = op;
@@ -1000,15 +1187,17 @@ static int check_preds(struct filter_parse_state *ps)
return 0;
}
-static int replace_preds(struct event_subsystem *system,
- struct ftrace_event_call *call,
+static int replace_preds(struct ftrace_event_call *call,
+ struct event_filter *filter,
struct filter_parse_state *ps,
- char *filter_string)
+ char *filter_string,
+ bool dry_run)
{
char *operand1 = NULL, *operand2 = NULL;
struct filter_pred *pred;
struct postfix_elt *elt;
int err;
+ int n_preds = 0;
err = check_preds(ps);
if (err)
@@ -1027,19 +1216,14 @@ static int replace_preds(struct event_subsystem *system,
continue;
}
+ if (n_preds++ == MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
if (elt->op == OP_AND || elt->op == OP_OR) {
pred = create_logical_pred(elt->op);
- if (call) {
- err = filter_add_pred(ps, call, pred);
- filter_free_pred(pred);
- } else
- err = filter_add_subsystem_pred(ps, system,
- pred, filter_string);
- if (err)
- return err;
-
- operand1 = operand2 = NULL;
- continue;
+ goto add_pred;
}
if (!operand1 || !operand2) {
@@ -1048,12 +1232,11 @@ static int replace_preds(struct event_subsystem *system,
}
pred = create_pred(elt->op, operand1, operand2);
- if (call) {
- err = filter_add_pred(ps, call, pred);
- filter_free_pred(pred);
- } else
- err = filter_add_subsystem_pred(ps, system, pred,
- filter_string);
+add_pred:
+ if (!pred)
+ return -ENOMEM;
+ err = filter_add_pred(ps, call, filter, pred, dry_run);
+ filter_free_pred(pred);
if (err)
return err;
@@ -1063,19 +1246,59 @@ static int replace_preds(struct event_subsystem *system,
return 0;
}
-int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+static int replace_system_preds(struct event_subsystem *system,
+ struct filter_parse_state *ps,
+ char *filter_string)
{
+ struct ftrace_event_call *call;
+ bool fail = true;
int err;
+ list_for_each_entry(call, &ftrace_events, list) {
+ struct event_filter *filter = call->filter;
+
+ if (strcmp(call->class->system, system->name) != 0)
+ continue;
+
+ /* try to see if the filter can be applied */
+ err = replace_preds(call, filter, ps, filter_string, true);
+ if (err)
+ continue;
+
+ /* really apply the filter */
+ filter_disable_preds(call);
+ err = replace_preds(call, filter, ps, filter_string, false);
+ if (err)
+ filter_disable_preds(call);
+ else {
+ call->flags |= TRACE_EVENT_FL_FILTERED;
+ replace_filter_string(filter, filter_string);
+ }
+ fail = false;
+ }
+
+ if (fail) {
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+ int err;
struct filter_parse_state *ps;
mutex_lock(&event_mutex);
+ err = init_preds(call);
+ if (err)
+ goto out_unlock;
+
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable_preds(call);
remove_filter_string(call->filter);
- mutex_unlock(&event_mutex);
- return 0;
+ goto out_unlock;
}
err = -ENOMEM;
@@ -1093,10 +1316,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
goto out;
}
- err = replace_preds(NULL, call, ps, filter_string);
+ err = replace_preds(call, call->filter, ps, filter_string, false);
if (err)
append_filter_err(ps, call->filter);
-
+ else
+ call->flags |= TRACE_EVENT_FL_FILTERED;
out:
filter_opstack_clear(ps);
postfix_clear(ps);
@@ -1111,16 +1335,18 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
char *filter_string)
{
int err;
-
struct filter_parse_state *ps;
mutex_lock(&event_mutex);
+ err = init_subsystem_preds(system);
+ if (err)
+ goto out_unlock;
+
if (!strcmp(strstrip(filter_string), "0")) {
filter_free_subsystem_preds(system);
remove_filter_string(system->filter);
- mutex_unlock(&event_mutex);
- return 0;
+ goto out_unlock;
}
err = -ENOMEM;
@@ -1128,7 +1354,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
if (!ps)
goto out_unlock;
- filter_free_subsystem_preds(system);
replace_filter_string(system->filter, filter_string);
parse_init(ps, filter_ops, filter_string);
@@ -1138,7 +1363,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
goto out;
}
- err = replace_preds(system, NULL, ps, filter_string);
+ err = replace_system_preds(system, ps, filter_string);
if (err)
append_filter_err(ps, system->filter);
@@ -1152,3 +1377,73 @@ out_unlock:
return err;
}
+#ifdef CONFIG_PERF_EVENTS
+
+void ftrace_profile_free_filter(struct perf_event *event)
+{
+ struct event_filter *filter = event->filter;
+
+ event->filter = NULL;
+ __free_preds(filter);
+}
+
+int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+ char *filter_str)
+{
+ int err;
+ struct event_filter *filter;
+ struct filter_parse_state *ps;
+ struct ftrace_event_call *call = NULL;
+
+ mutex_lock(&event_mutex);
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call->event.type == event_id)
+ break;
+ }
+
+ err = -EINVAL;
+ if (&call->list == &ftrace_events)
+ goto out_unlock;
+
+ err = -EEXIST;
+ if (event->filter)
+ goto out_unlock;
+
+ filter = __alloc_preds();
+ if (IS_ERR(filter)) {
+ err = PTR_ERR(filter);
+ goto out_unlock;
+ }
+
+ err = -ENOMEM;
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto free_preds;
+
+ parse_init(ps, filter_ops, filter_str);
+ err = filter_parse(ps);
+ if (err)
+ goto free_ps;
+
+ err = replace_preds(call, filter, ps, filter_str, false);
+ if (!err)
+ event->filter = filter;
+
+free_ps:
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+
+free_preds:
+ if (err)
+ __free_preds(filter);
+
+out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return err;
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc8..4b74d71705c 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,192 +15,159 @@
#include "trace_output.h"
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ftrace
-#undef TRACE_STRUCT
-#define TRACE_STRUCT(args...) args
-
-extern void __bad_type_size(void);
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign) \
- if (sizeof(type) != sizeof(field.item)) \
- __bad_type_size(); \
- ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
- "offset:%u;\tsize:%u;\n", \
- (unsigned int)offsetof(typeof(field), item), \
- (unsigned int)sizeof(field.item)); \
- if (!ret) \
- return 0;
-
-
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
- ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
- "offset:%u;\tsize:%u;\n", \
- (unsigned int)offsetof(typeof(field), item), \
- (unsigned int)sizeof(field.item)); \
- if (!ret) \
- return 0;
-
-#undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item) \
- ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \
- "offset:%u;\tsize:0;\n", \
- (unsigned int)offsetof(typeof(field), item)); \
- if (!ret) \
- return 0;
-
-#undef TRACE_FIELD_SIGN
-#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
- TRACE_FIELD(type, item, assign)
-
-#undef TP_RAW_FMT
-#define TP_RAW_FMT(args...) args
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
-static int \
-ftrace_format_##call(struct trace_seq *s) \
-{ \
- struct args field; \
- int ret; \
- \
- tstruct; \
- \
- trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
- \
- return ret; \
-}
+/* not needed for this file */
+#undef __field_struct
+#define __field_struct(type, item)
-#undef TRACE_EVENT_FORMAT_NOFILTER
-#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
- tpfmt) \
-static int \
-ftrace_format_##call(struct trace_seq *s) \
-{ \
- struct args field; \
- int ret; \
- \
- tstruct; \
- \
- trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
- \
- return ret; \
-}
+#undef __field
+#define __field(type, item) type item;
-#include "trace_event_types.h"
+#undef __field_desc
+#define __field_desc(type, container, item) type item;
-#undef TRACE_ZERO_CHAR
-#define TRACE_ZERO_CHAR(arg)
+#undef __array
+#define __array(type, item, size) type item[size];
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)\
- entry->item = assign;
+#undef __array_desc
+#define __array_desc(type, container, item, size) type item[size];
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)\
- entry->item = assign;
+#undef __dynamic_array
+#define __dynamic_array(type, item) type item[];
-#undef TRACE_FIELD_SIGN
-#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
- TRACE_FIELD(type, item, assign)
+#undef F_STRUCT
+#define F_STRUCT(args...) args
-#undef TP_CMD
-#define TP_CMD(cmd...) cmd
+#undef F_printk
+#define F_printk(fmt, args...) fmt, args
-#undef TRACE_ENTRY
-#define TRACE_ENTRY entry
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
+struct ____ftrace_##name { \
+ tstruct \
+}; \
+static void __always_unused ____ftrace_check_##name(void) \
+{ \
+ struct ____ftrace_##name *__entry = NULL; \
+ \
+ /* force compile-time check on F_printk() */ \
+ printk(print); \
+}
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
- cmd;
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
-int ftrace_define_fields_##call(void); \
-static int ftrace_raw_init_event_##call(void); \
- \
-struct ftrace_event_call __used \
-__attribute__((__aligned__(4))) \
-__attribute__((section("_ftrace_events"))) event_##call = { \
- .name = #call, \
- .id = proto, \
- .system = __stringify(TRACE_SYSTEM), \
- .raw_init = ftrace_raw_init_event_##call, \
- .show_format = ftrace_format_##call, \
- .define_fields = ftrace_define_fields_##call, \
-}; \
-static int ftrace_raw_init_event_##call(void) \
-{ \
- INIT_LIST_HEAD(&event_##call.fields); \
- init_preds(&event_##call); \
- return 0; \
-} \
-
-#undef TRACE_EVENT_FORMAT_NOFILTER
-#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
- tpfmt) \
- \
-struct ftrace_event_call __used \
-__attribute__((__aligned__(4))) \
-__attribute__((section("_ftrace_events"))) event_##call = { \
- .name = #call, \
- .id = proto, \
- .system = __stringify(TRACE_SYSTEM), \
- .show_format = ftrace_format_##call, \
-};
+#include "trace_entries.h"
-#include "trace_event_types.h"
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign) \
+#undef __field
+#define __field(type, item) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), item), \
- sizeof(field.item), is_signed_type(type)); \
+ sizeof(field.item), \
+ is_signed_type(type), FILTER_OTHER); \
if (ret) \
return ret;
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
- ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+#undef __field_desc
+#define __field_desc(type, container, item) \
+ ret = trace_define_field(event_call, #type, #item, \
+ offsetof(typeof(field), \
+ container.item), \
+ sizeof(field.container.item), \
+ is_signed_type(type), FILTER_OTHER); \
+ if (ret) \
+ return ret;
+
+#undef __array
+#define __array(type, item, len) \
+ do { \
+ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
+ mutex_lock(&event_storage_mutex); \
+ snprintf(event_storage, sizeof(event_storage), \
+ "%s[%d]", #type, len); \
+ ret = trace_define_field(event_call, event_storage, #item, \
offsetof(typeof(field), item), \
- sizeof(field.item), 0); \
+ sizeof(field.item), \
+ is_signed_type(type), FILTER_OTHER); \
+ mutex_unlock(&event_storage_mutex); \
+ if (ret) \
+ return ret; \
+ } while (0);
+
+#undef __array_desc
+#define __array_desc(type, container, item, len) \
+ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
+ ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+ offsetof(typeof(field), \
+ container.item), \
+ sizeof(field.container.item), \
+ is_signed_type(type), FILTER_OTHER); \
if (ret) \
return ret;
-#undef TRACE_FIELD_SIGN
-#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
+#undef __dynamic_array
+#define __dynamic_array(type, item) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), item), \
- sizeof(field.item), is_signed); \
+ 0, is_signed_type(type), FILTER_OTHER);\
if (ret) \
return ret;
-#undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
int \
-ftrace_define_fields_##call(void) \
+ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
{ \
- struct ftrace_event_call *event_call = &event_##call; \
- struct args field; \
+ struct struct_name field; \
int ret; \
\
- __common_field(unsigned char, type, 0); \
- __common_field(unsigned char, flags, 0); \
- __common_field(unsigned char, preempt_count, 0); \
- __common_field(int, pid, 1); \
- __common_field(int, tgid, 1); \
- \
tstruct; \
\
return ret; \
}
-#undef TRACE_EVENT_FORMAT_NOFILTER
-#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
- tpfmt)
+#include "trace_entries.h"
+
+#undef __entry
+#define __entry REC
+
+#undef __field
+#define __field(type, item)
+
+#undef __field_desc
+#define __field_desc(type, container, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __array_desc
+#define __array_desc(type, container, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item)
+
+#undef F_printk
+#define F_printk(fmt, args...) #fmt ", " __stringify(args)
+
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
+ \
+struct ftrace_event_class event_class_ftrace_##call = { \
+ .system = __stringify(TRACE_SYSTEM), \
+ .define_fields = ftrace_define_fields_##call, \
+ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
+}; \
+ \
+struct ftrace_event_call __used \
+__attribute__((__aligned__(4))) \
+__attribute__((section("_ftrace_events"))) event_##call = { \
+ .name = #call, \
+ .event.type = etype, \
+ .class = &event_class_ftrace_##call, \
+ .print_fmt = print, \
+}; \
-#include "trace_event_types.h"
+#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 90f13476483..16aee4d44e8 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
- int cpu, resched;
+ int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
pc = preempt_count();
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
local_save_flags(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
trace_function(tr, ip, parent_ip, flags, pc);
atomic_dec(&data->disabled);
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static void
@@ -288,11 +288,9 @@ static int
ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
- char str[KSYM_SYMBOL_LEN];
long count = (long)data;
- kallsyms_lookup(ip, NULL, NULL, NULL, str);
- seq_printf(m, "%s:", str);
+ seq_printf(m, "%ps:", (void *)ip);
if (ops == &traceon_probe_ops)
seq_printf(m, "traceon");
@@ -302,8 +300,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
if (count == -1)
seq_printf(m, ":unlimited\n");
else
- seq_printf(m, ":count=%ld", count);
- seq_putc(m, '\n');
+ seq_printf(m, ":count=%ld\n", count);
return 0;
}
@@ -364,7 +361,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
out_reg:
ret = register_ftrace_function_probe(glob, ops, count);
- return ret;
+ return ret < 0 ? ret : 0;
}
static struct ftrace_func_command ftrace_traceon_cmd = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d2249abafb5..76b05980225 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,14 +9,31 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/slab.h>
#include <linux/fs.h>
#include "trace.h"
#include "trace_output.h"
-struct fgraph_data {
+/* When set, irq functions will be ignored */
+static int ftrace_graph_skip_irqs;
+
+struct fgraph_cpu_data {
pid_t last_pid;
int depth;
+ int depth_irq;
+ int ignore;
+ unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
+};
+
+struct fgraph_data {
+ struct fgraph_cpu_data __percpu *cpu_data;
+
+ /* Place to preserve last processed entry. */
+ struct ftrace_graph_ent_entry ent;
+ struct ftrace_graph_ret_entry ret;
+ int failed;
+ int cpu;
};
#define TRACE_GRAPH_INDENT 2
@@ -27,7 +44,8 @@ struct fgraph_data {
#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
-#define TRACE_GRAPH_PRINT_ABS_TIME 0X20
+#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
+#define TRACE_GRAPH_PRINT_IRQS 0x40
static struct tracer_opt trace_opts[] = {
/* Display overruns? (for self-debug purpose) */
@@ -42,17 +60,19 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
/* Display absolute time of an entry */
{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
+ /* Display interrupts */
+ { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
/* Don't display overruns and proc by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
- TRACE_GRAPH_PRINT_DURATION,
+ TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
.opts = trace_opts
};
-/* pid on the last trace processed */
+static struct trace_array *graph_array;
/* Add a function return address to the trace stack on thread info.*/
@@ -124,7 +144,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
ftrace_graph_stop();
WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
- " from func %pF return to %lx\n",
+ " from func %ps return to %lx\n",
current->ret_stack[index].fp,
frame_pointer,
(void *)current->ret_stack[index].func,
@@ -166,10 +186,183 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
return ret;
}
+int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_funcgraph_entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer = tr->buffer;
+ struct ftrace_graph_ent_entry *entry;
+
+ if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
+ return 0;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return 0;
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent = *trace;
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ ring_buffer_unlock_commit(buffer, event);
+
+ return 1;
+}
+
+static inline int ftrace_graph_ignore_irqs(void)
+{
+ if (!ftrace_graph_skip_irqs)
+ return 0;
+
+ return in_irq();
+}
+
+int trace_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = graph_array;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int ret;
+ int cpu;
+ int pc;
+
+ if (!ftrace_trace_task(current))
+ return 0;
+
+ /* trace it when it is-nested-in or is a function enabled. */
+ if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
+ ftrace_graph_ignore_irqs())
+ return 0;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ } else {
+ ret = 0;
+ }
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+{
+ if (tracing_thresh)
+ return 1;
+ else
+ return trace_graph_entry(trace);
+}
+
+static void
+__trace_graph_function(struct trace_array *tr,
+ unsigned long ip, unsigned long flags, int pc)
+{
+ u64 time = trace_clock_local();
+ struct ftrace_graph_ent ent = {
+ .func = ip,
+ .depth = 0,
+ };
+ struct ftrace_graph_ret ret = {
+ .func = ip,
+ .depth = 0,
+ .calltime = time,
+ .rettime = time,
+ };
+
+ __trace_graph_entry(tr, &ent, flags, pc);
+ __trace_graph_return(tr, &ret, flags, pc);
+}
+
+void
+trace_graph_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ __trace_graph_function(tr, ip, flags, pc);
+}
+
+void __trace_graph_return(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_funcgraph_exit;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer = tr->buffer;
+ struct ftrace_graph_ret_entry *entry;
+
+ if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
+ return;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->ret = *trace;
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ ring_buffer_unlock_commit(buffer, event);
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = graph_array;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ __trace_graph_return(tr, trace, flags, pc);
+ }
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
+void set_graph_array(struct trace_array *tr)
+{
+ graph_array = tr;
+
+ /* Make graph_array visible before we start tracing */
+
+ smp_mb();
+}
+
+void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+{
+ if (tracing_thresh &&
+ (trace->rettime - trace->calltime < tracing_thresh))
+ return;
+ else
+ trace_graph_return(trace);
+}
+
static int graph_trace_init(struct trace_array *tr)
{
- int ret = register_ftrace_graph(&trace_graph_return,
- &trace_graph_entry);
+ int ret;
+
+ set_graph_array(tr);
+ if (tracing_thresh)
+ ret = register_ftrace_graph(&trace_graph_thresh_return,
+ &trace_graph_thresh_entry);
+ else
+ ret = register_ftrace_graph(&trace_graph_return,
+ &trace_graph_entry);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -183,43 +376,19 @@ static void graph_trace_reset(struct trace_array *tr)
unregister_ftrace_graph();
}
-static inline int log10_cpu(int nb)
-{
- if (nb / 100)
- return 3;
- if (nb / 10)
- return 2;
- return 1;
-}
+static int max_bytes_for_cpu;
static enum print_line_t
print_graph_cpu(struct trace_seq *s, int cpu)
{
- int i;
int ret;
- int log10_this = log10_cpu(cpu);
- int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
-
/*
* Start with a space character - to make it stand out
* to the right a bit when trace output is pasted into
* email:
*/
- ret = trace_seq_printf(s, " ");
-
- /*
- * Tricky - we space the CPU field according to the max
- * number of online CPUs. On a 2-cpu system it would take
- * a maximum of 1 digit - on a 128 cpu system it would
- * take up to 3 digits:
- */
- for (i = 0; i < log10_all - log10_this; i++) {
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- ret = trace_seq_printf(s, "%d) ", cpu);
+ ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -270,6 +439,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
}
+static enum print_line_t
+print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
+{
+ if (!trace_seq_putc(s, ' '))
+ return 0;
+
+ return trace_print_lat_fmt(s, entry);
+}
+
/* If the pid changed since the last trace, output this event */
static enum print_line_t
verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -281,7 +459,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
if (!data)
return TRACE_TYPE_HANDLED;
- last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
+ last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
if (*last_pid == pid)
return TRACE_TYPE_HANDLED;
@@ -332,27 +510,59 @@ static struct ftrace_graph_ret_entry *
get_return_for_leaf(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *curr)
{
- struct ring_buffer_iter *ring_iter;
+ struct fgraph_data *data = iter->private;
+ struct ring_buffer_iter *ring_iter = NULL;
struct ring_buffer_event *event;
struct ftrace_graph_ret_entry *next;
- ring_iter = iter->buffer_iter[iter->cpu];
+ /*
+ * If the previous output failed to write to the seq buffer,
+ * then we just reuse the data from before.
+ */
+ if (data && data->failed) {
+ curr = &data->ent;
+ next = &data->ret;
+ } else {
+
+ ring_iter = iter->buffer_iter[iter->cpu];
+
+ /* First peek to compare current entry and the next one */
+ if (ring_iter)
+ event = ring_buffer_iter_peek(ring_iter, NULL);
+ else {
+ /*
+ * We need to consume the current entry to see
+ * the next one.
+ */
+ ring_buffer_consume(iter->tr->buffer, iter->cpu,
+ NULL, NULL);
+ event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+ NULL, NULL);
+ }
- /* First peek to compare current entry and the next one */
- if (ring_iter)
- event = ring_buffer_iter_peek(ring_iter, NULL);
- else {
- /* We need to consume the current entry to see the next one */
- ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
- event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
- NULL);
+ if (!event)
+ return NULL;
+
+ next = ring_buffer_event_data(event);
+
+ if (data) {
+ /*
+ * Save current and next entries for later reference
+ * if the output fails.
+ */
+ data->ent = *curr;
+ /*
+ * If the next event is not a return type, then
+ * we only care about what type it is. Otherwise we can
+ * safely copy the entire event.
+ */
+ if (next->ent.type == TRACE_GRAPH_RET)
+ data->ret = *next;
+ else
+ data->ret.ent.type = next->ent.type;
+ }
}
- if (!event)
- return NULL;
-
- next = ring_buffer_event_data(event);
-
if (next->ent.type != TRACE_GRAPH_RET)
return NULL;
@@ -369,17 +579,18 @@ get_return_for_leaf(struct trace_iterator *iter,
/* Signal a overhead of time execution to the output */
static int
-print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+print_graph_overhead(unsigned long long duration, struct trace_seq *s,
+ u32 flags)
{
/* If duration disappear, we don't need anything */
- if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
+ if (!(flags & TRACE_GRAPH_PRINT_DURATION))
return 1;
/* Non nested entry or return */
if (duration == -1)
return trace_seq_printf(s, " ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+ if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
/* Duration exceeded 100 msecs */
if (duration > 100000ULL)
return trace_seq_printf(s, "! ");
@@ -405,7 +616,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
static enum print_line_t
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
- enum trace_type type, int cpu, pid_t pid)
+ enum trace_type type, int cpu, pid_t pid, u32 flags)
{
int ret;
struct trace_seq *s = &iter->seq;
@@ -415,20 +626,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
return TRACE_TYPE_UNHANDLED;
/* Absolute time */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Cpu */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+ if (flags & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
+
/* Proc */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+ if (flags & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -438,7 +650,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
}
/* No overhead */
- ret = print_graph_overhead(-1, s);
+ ret = print_graph_overhead(-1, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -451,7 +663,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
return TRACE_TYPE_PARTIAL_LINE;
/* Don't close the duration column if haven't one */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ if (flags & TRACE_GRAPH_PRINT_DURATION)
trace_seq_printf(s, " |");
ret = trace_seq_printf(s, "\n");
@@ -481,7 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
/* Print nsecs (we don't want to exceed 7 numbers) */
if (len < 7) {
- snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
+ size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
+
+ snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
ret = trace_seq_printf(s, ".%s", nsecs_str);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -521,7 +735,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
static enum print_line_t
print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *entry,
- struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
+ struct ftrace_graph_ret_entry *ret_entry,
+ struct trace_seq *s, u32 flags)
{
struct fgraph_data *data = iter->private;
struct ftrace_graph_ret *graph_ret;
@@ -535,24 +750,30 @@ print_graph_entry_leaf(struct trace_iterator *iter,
duration = graph_ret->rettime - graph_ret->calltime;
if (data) {
+ struct fgraph_cpu_data *cpu_data;
int cpu = iter->cpu;
- int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+ cpu_data = per_cpu_ptr(data->cpu_data, cpu);
/*
* Comments display at + 1 to depth. Since
* this is a leaf function, keep the comments
* equal to this depth.
*/
- *depth = call->depth - 1;
+ cpu_data->depth = call->depth - 1;
+
+ /* No need to keep this function around for this depth */
+ if (call->depth < FTRACE_RETFUNC_DEPTH)
+ cpu_data->enter_funcs[call->depth] = 0;
}
/* Overhead */
- ret = print_graph_overhead(duration, s);
+ ret = print_graph_overhead(duration, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ if (flags & TRACE_GRAPH_PRINT_DURATION) {
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -565,11 +786,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = seq_print_ip_sym(s, call->func, 0);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_printf(s, "();\n");
+ ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -579,7 +796,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
static enum print_line_t
print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *entry,
- struct trace_seq *s, int cpu)
+ struct trace_seq *s, int cpu, u32 flags)
{
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
@@ -587,19 +804,24 @@ print_graph_entry_nested(struct trace_iterator *iter,
int i;
if (data) {
+ struct fgraph_cpu_data *cpu_data;
int cpu = iter->cpu;
- int *depth = &(per_cpu_ptr(data, cpu)->depth);
- *depth = call->depth;
+ cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+ cpu_data->depth = call->depth;
+
+ /* Save this function pointer to see if the exit matches */
+ if (call->depth < FTRACE_RETFUNC_DEPTH)
+ cpu_data->enter_funcs[call->depth] = call->func;
}
/* No overhead */
- ret = print_graph_overhead(-1, s);
+ ret = print_graph_overhead(-1, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ if (flags & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -612,11 +834,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = seq_print_ip_sym(s, call->func, 0);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_printf(s, "() {\n");
+ ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -629,7 +847,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
static enum print_line_t
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
- int type, unsigned long addr)
+ int type, unsigned long addr, u32 flags)
{
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
@@ -642,27 +860,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
if (type) {
/* Interrupt */
- ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
+ ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Absolute time */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Cpu */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+ if (flags & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+ if (flags & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, ent->pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -672,61 +890,201 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
+ /* Latency format */
+ if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+ ret = print_graph_lat_fmt(s, ent);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
return 0;
}
+/*
+ * Entry check for irq code
+ *
+ * returns 1 if
+ * - we are inside irq code
+ * - we just extered irq code
+ *
+ * retunns 0 if
+ * - funcgraph-interrupts option is set
+ * - we are not inside irq code
+ */
+static int
+check_irq_entry(struct trace_iterator *iter, u32 flags,
+ unsigned long addr, int depth)
+{
+ int cpu = iter->cpu;
+ int *depth_irq;
+ struct fgraph_data *data = iter->private;
+
+ /*
+ * If we are either displaying irqs, or we got called as
+ * a graph event and private data does not exist,
+ * then we bypass the irq check.
+ */
+ if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+ (!data))
+ return 0;
+
+ depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+ /*
+ * We are inside the irq code
+ */
+ if (*depth_irq >= 0)
+ return 1;
+
+ if ((addr < (unsigned long)__irqentry_text_start) ||
+ (addr >= (unsigned long)__irqentry_text_end))
+ return 0;
+
+ /*
+ * We are entering irq code.
+ */
+ *depth_irq = depth;
+ return 1;
+}
+
+/*
+ * Return check for irq code
+ *
+ * returns 1 if
+ * - we are inside irq code
+ * - we just left irq code
+ *
+ * returns 0 if
+ * - funcgraph-interrupts option is set
+ * - we are not inside irq code
+ */
+static int
+check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
+{
+ int cpu = iter->cpu;
+ int *depth_irq;
+ struct fgraph_data *data = iter->private;
+
+ /*
+ * If we are either displaying irqs, or we got called as
+ * a graph event and private data does not exist,
+ * then we bypass the irq check.
+ */
+ if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+ (!data))
+ return 0;
+
+ depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+ /*
+ * We are not inside the irq code.
+ */
+ if (*depth_irq == -1)
+ return 0;
+
+ /*
+ * We are inside the irq code, and this is returning entry.
+ * Let's not trace it and clear the entry depth, since
+ * we are out of irq code.
+ *
+ * This condition ensures that we 'leave the irq code' once
+ * we are out of the entry depth. Thus protecting us from
+ * the RETURN entry loss.
+ */
+ if (*depth_irq >= depth) {
+ *depth_irq = -1;
+ return 1;
+ }
+
+ /*
+ * We are inside the irq code, and this is not the entry.
+ */
+ return 1;
+}
+
static enum print_line_t
print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
- struct trace_iterator *iter)
+ struct trace_iterator *iter, u32 flags)
{
- int cpu = iter->cpu;
+ struct fgraph_data *data = iter->private;
struct ftrace_graph_ent *call = &field->graph_ent;
struct ftrace_graph_ret_entry *leaf_ret;
+ static enum print_line_t ret;
+ int cpu = iter->cpu;
+
+ if (check_irq_entry(iter, flags, call->func, call->depth))
+ return TRACE_TYPE_HANDLED;
- if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
+ if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
return TRACE_TYPE_PARTIAL_LINE;
leaf_ret = get_return_for_leaf(iter, field);
if (leaf_ret)
- return print_graph_entry_leaf(iter, field, leaf_ret, s);
+ ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
else
- return print_graph_entry_nested(iter, field, s, cpu);
+ ret = print_graph_entry_nested(iter, field, s, cpu, flags);
+
+ if (data) {
+ /*
+ * If we failed to write our output, then we need to make
+ * note of it. Because we already consumed our entry.
+ */
+ if (s->full) {
+ data->failed = 1;
+ data->cpu = cpu;
+ } else
+ data->failed = 0;
+ }
+ return ret;
}
static enum print_line_t
print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
- struct trace_entry *ent, struct trace_iterator *iter)
+ struct trace_entry *ent, struct trace_iterator *iter,
+ u32 flags)
{
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
pid_t pid = ent->pid;
int cpu = iter->cpu;
+ int func_match = 1;
int ret;
int i;
+ if (check_irq_return(iter, flags, trace->depth))
+ return TRACE_TYPE_HANDLED;
+
if (data) {
+ struct fgraph_cpu_data *cpu_data;
int cpu = iter->cpu;
- int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+ cpu_data = per_cpu_ptr(data->cpu_data, cpu);
/*
* Comments display at + 1 to depth. This is the
* return from a function, we now want the comments
* to display at the same level of the bracket.
*/
- *depth = trace->depth - 1;
+ cpu_data->depth = trace->depth - 1;
+
+ if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+ if (cpu_data->enter_funcs[trace->depth] != trace->func)
+ func_match = 0;
+ cpu_data->enter_funcs[trace->depth] = 0;
+ }
}
- if (print_graph_prologue(iter, s, 0, 0))
+ if (print_graph_prologue(iter, s, 0, 0, flags))
return TRACE_TYPE_PARTIAL_LINE;
/* Overhead */
- ret = print_graph_overhead(duration, s);
+ ret = print_graph_overhead(duration, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ if (flags & TRACE_GRAPH_PRINT_DURATION) {
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -739,19 +1097,32 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = trace_seq_printf(s, "}\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ /*
+ * If the return function does not have a matching entry,
+ * then the entry was lost. Instead of just printing
+ * the '}' and letting the user guess what function this
+ * belongs to, write out the function name.
+ */
+ if (func_match) {
+ ret = trace_seq_printf(s, "}\n");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ } else {
+ ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
/* Overrun */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+ if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
ret = trace_seq_printf(s, " (Overruns: %lu)\n",
trace->overrun);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
+ ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+ cpu, pid, flags);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -759,8 +1130,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
static enum print_line_t
-print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
- struct trace_iterator *iter)
+print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
+ struct trace_iterator *iter, u32 flags)
{
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
struct fgraph_data *data = iter->private;
@@ -770,18 +1141,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
int i;
if (data)
- depth = per_cpu_ptr(data, iter->cpu)->depth;
+ depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
- if (print_graph_prologue(iter, s, 0, 0))
+ if (print_graph_prologue(iter, s, 0, 0, flags))
return TRACE_TYPE_PARTIAL_LINE;
/* No overhead */
- ret = print_graph_overhead(-1, s);
+ ret = print_graph_overhead(-1, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ if (flags & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -816,7 +1187,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (!event)
return TRACE_TYPE_UNHANDLED;
- ret = event->trace(iter, sym_flags);
+ ret = event->funcs->trace(iter, sym_flags, event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
@@ -836,90 +1207,253 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
+ struct ftrace_graph_ent_entry *field;
+ struct fgraph_data *data = iter->private;
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
+ int cpu = iter->cpu;
+ int ret;
+
+ if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
+ per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
+ return TRACE_TYPE_HANDLED;
+ }
+
+ /*
+ * If the last output failed, there's a possibility we need
+ * to print out the missing entry which would never go out.
+ */
+ if (data && data->failed) {
+ field = &data->ent;
+ iter->cpu = data->cpu;
+ ret = print_graph_entry(field, s, iter, flags);
+ if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
+ per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
+ ret = TRACE_TYPE_NO_CONSUME;
+ }
+ iter->cpu = cpu;
+ return ret;
+ }
switch (entry->type) {
case TRACE_GRAPH_ENT: {
- struct ftrace_graph_ent_entry *field;
+ /*
+ * print_graph_entry() may consume the current event,
+ * thus @field may become invalid, so we need to save it.
+ * sizeof(struct ftrace_graph_ent_entry) is very small,
+ * it can be safely saved at the stack.
+ */
+ struct ftrace_graph_ent_entry saved;
trace_assign_type(field, entry);
- return print_graph_entry(field, s, iter);
+ saved = *field;
+ return print_graph_entry(&saved, s, iter, flags);
}
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
- return print_graph_return(&field->ret, s, entry, iter);
+ return print_graph_return(&field->ret, s, entry, iter, flags);
}
+ case TRACE_STACK:
+ case TRACE_FN:
+ /* dont trace stack and functions as comments */
+ return TRACE_TYPE_UNHANDLED;
+
default:
- return print_graph_comment(s, entry, iter);
+ return print_graph_comment(s, entry, iter, flags);
}
return TRACE_TYPE_HANDLED;
}
-static void print_graph_headers(struct seq_file *s)
+static enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+ return __print_graph_function_flags(iter, tracer_flags.val);
+}
+
+enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
+ u32 flags)
+{
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ flags |= TRACE_GRAPH_PRINT_DURATION;
+ else
+ flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+ return __print_graph_function_flags(iter, flags);
+}
+
+static enum print_line_t
+print_graph_function_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ return print_graph_function(iter);
+}
+
+static void print_lat_header(struct seq_file *s, u32 flags)
+{
+ static const char spaces[] = " " /* 16 spaces */
+ " " /* 4 spaces */
+ " "; /* 17 spaces */
+ int size = 0;
+
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ size += 16;
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ size += 4;
+ if (flags & TRACE_GRAPH_PRINT_PROC)
+ size += 17;
+
+ seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
+ seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
+ seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
+ seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
+ seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
+ seq_printf(s, "#%.*s|||| / \n", size, spaces);
+}
+
+static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
{
+ int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
+
+ if (lat)
+ print_lat_header(s, flags);
+
/* 1st line */
- seq_printf(s, "# ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+ seq_printf(s, "#");
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
seq_printf(s, " TIME ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, "CPU");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " TASK/PID ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ seq_printf(s, " CPU");
+ if (flags & TRACE_GRAPH_PRINT_PROC)
+ seq_printf(s, " TASK/PID ");
+ if (lat)
+ seq_printf(s, "|||||");
+ if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " DURATION ");
seq_printf(s, " FUNCTION CALLS\n");
/* 2nd line */
- seq_printf(s, "# ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+ seq_printf(s, "#");
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
seq_printf(s, " | ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, "| ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " | | ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ seq_printf(s, " | ");
+ if (flags & TRACE_GRAPH_PRINT_PROC)
+ seq_printf(s, " | | ");
+ if (lat)
+ seq_printf(s, "|||||");
+ if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " | | ");
seq_printf(s, " | | | |\n");
}
-static void graph_trace_open(struct trace_iterator *iter)
+void print_graph_headers(struct seq_file *s)
+{
+ print_graph_headers_flags(s, tracer_flags.val);
+}
+
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+ struct trace_iterator *iter = s->private;
+
+ if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+
+ print_trace_header(s, iter);
+ flags |= TRACE_GRAPH_PRINT_DURATION;
+ } else
+ flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+ __print_graph_headers_flags(s, flags);
+}
+
+void graph_trace_open(struct trace_iterator *iter)
{
/* pid and depth on the last trace processed */
- struct fgraph_data *data = alloc_percpu(struct fgraph_data);
+ struct fgraph_data *data;
int cpu;
+ iter->private = NULL;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
- pr_warning("function graph tracer: not enough memory\n");
- else
- for_each_possible_cpu(cpu) {
- pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
- int *depth = &(per_cpu_ptr(data, cpu)->depth);
- *pid = -1;
- *depth = 0;
- }
+ goto out_err;
+
+ data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
+ if (!data->cpu_data)
+ goto out_err_free;
+
+ for_each_possible_cpu(cpu) {
+ pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
+ int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+ int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+ int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+ *pid = -1;
+ *depth = 0;
+ *ignore = 0;
+ *depth_irq = -1;
+ }
iter->private = data;
+
+ return;
+
+ out_err_free:
+ kfree(data);
+ out_err:
+ pr_warning("function graph tracer: not enough memory\n");
+}
+
+void graph_trace_close(struct trace_iterator *iter)
+{
+ struct fgraph_data *data = iter->private;
+
+ if (data) {
+ free_percpu(data->cpu_data);
+ kfree(data);
+ }
}
-static void graph_trace_close(struct trace_iterator *iter)
+static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
{
- free_percpu(iter->private);
+ if (bit == TRACE_GRAPH_PRINT_IRQS)
+ ftrace_graph_skip_irqs = !set;
+
+ return 0;
}
+static struct trace_event_functions graph_functions = {
+ .trace = print_graph_function_event,
+};
+
+static struct trace_event graph_trace_entry_event = {
+ .type = TRACE_GRAPH_ENT,
+ .funcs = &graph_functions,
+};
+
+static struct trace_event graph_trace_ret_event = {
+ .type = TRACE_GRAPH_RET,
+ .funcs = &graph_functions
+};
+
static struct tracer graph_trace __read_mostly = {
.name = "function_graph",
.open = graph_trace_open,
+ .pipe_open = graph_trace_open,
.close = graph_trace_close,
+ .pipe_close = graph_trace_close,
.wait_pipe = poll_wait_pipe,
.init = graph_trace_init,
.reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
.flags = &tracer_flags,
+ .set_flag = func_graph_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
@@ -927,6 +1461,18 @@ static struct tracer graph_trace __read_mostly = {
static __init int init_graph_trace(void)
{
+ max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+
+ if (!register_ftrace_event(&graph_trace_entry_event)) {
+ pr_warning("Warning: could not register graph trace events\n");
+ return 1;
+ }
+
+ if (!register_ftrace_event(&graph_trace_ret_event)) {
+ pr_warning("Warning: could not register graph trace events\n");
+ return 1;
+ }
+
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index ca7d7c4d0c2..00000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * h/w branch tracer for x86 based on BTS
- *
- * Copyright (C) 2008-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
- */
-#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-
-#include <asm/ds.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-
-#define BTS_BUFFER_SIZE (1 << 13)
-
-static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
-
-#define this_tracer per_cpu(tracer, smp_processor_id())
-
-static int trace_hw_branches_enabled __read_mostly;
-static int trace_hw_branches_suspended __read_mostly;
-static struct trace_array *hw_branch_trace __read_mostly;
-
-
-static void bts_trace_init_cpu(int cpu)
-{
- per_cpu(tracer, cpu) =
- ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
-
- if (IS_ERR(per_cpu(tracer, cpu)))
- per_cpu(tracer, cpu) = NULL;
-}
-
-static int bts_trace_init(struct trace_array *tr)
-{
- int cpu;
-
- hw_branch_trace = tr;
- trace_hw_branches_enabled = 0;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- bts_trace_init_cpu(cpu);
-
- if (likely(per_cpu(tracer, cpu)))
- trace_hw_branches_enabled = 1;
- }
- trace_hw_branches_suspended = 0;
- put_online_cpus();
-
- /* If we could not enable tracing on a single cpu, we fail. */
- return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- if (likely(per_cpu(tracer, cpu))) {
- ds_release_bts(per_cpu(tracer, cpu));
- per_cpu(tracer, cpu) = NULL;
- }
- }
- trace_hw_branches_enabled = 0;
- trace_hw_branches_suspended = 0;
- put_online_cpus();
-}
-
-static void bts_trace_start(struct trace_array *tr)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (likely(per_cpu(tracer, cpu)))
- ds_resume_bts(per_cpu(tracer, cpu));
- trace_hw_branches_suspended = 0;
- put_online_cpus();
-}
-
-static void bts_trace_stop(struct trace_array *tr)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (likely(per_cpu(tracer, cpu)))
- ds_suspend_bts(per_cpu(tracer, cpu));
- trace_hw_branches_suspended = 1;
- put_online_cpus();
-}
-
-static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- int cpu = (long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- /* The notification is sent with interrupts enabled. */
- if (trace_hw_branches_enabled) {
- bts_trace_init_cpu(cpu);
-
- if (trace_hw_branches_suspended &&
- likely(per_cpu(tracer, cpu)))
- ds_suspend_bts(per_cpu(tracer, cpu));
- }
- break;
-
- case CPU_DOWN_PREPARE:
- /* The notification is sent with interrupts enabled. */
- if (likely(per_cpu(tracer, cpu))) {
- ds_release_bts(per_cpu(tracer, cpu));
- per_cpu(tracer, cpu) = NULL;
- }
- }
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
- .notifier_call = bts_hotcpu_handler
-};
-
-static void bts_trace_print_header(struct seq_file *m)
-{
- seq_puts(m, "# CPU# TO <- FROM\n");
-}
-
-static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
-{
- unsigned long symflags = TRACE_ITER_SYM_OFFSET;
- struct trace_entry *entry = iter->ent;
- struct trace_seq *seq = &iter->seq;
- struct hw_branch_entry *it;
-
- trace_assign_type(it, entry);
-
- if (entry->type == TRACE_HW_BRANCHES) {
- if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
- seq_print_ip_sym(seq, it->to, symflags) &&
- trace_seq_printf(seq, "\t <- ") &&
- seq_print_ip_sym(seq, it->from, symflags) &&
- trace_seq_printf(seq, "\n"))
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;;
- }
- return TRACE_TYPE_UNHANDLED;
-}
-
-void trace_hw_branch(u64 from, u64 to)
-{
- struct ftrace_event_call *call = &event_hw_branch;
- struct trace_array *tr = hw_branch_trace;
- struct ring_buffer_event *event;
- struct hw_branch_entry *entry;
- unsigned long irq1;
- int cpu;
-
- if (unlikely(!tr))
- return;
-
- if (unlikely(!trace_hw_branches_enabled))
- return;
-
- local_irq_save(irq1);
- cpu = raw_smp_processor_id();
- if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
- goto out;
-
- event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, from);
- entry->ent.type = TRACE_HW_BRANCHES;
- entry->from = from;
- entry->to = to;
- if (!filter_check_discard(call, entry, tr->buffer, event))
- trace_buffer_unlock_commit(tr, event, 0, 0);
-
- out:
- atomic_dec(&tr->data[cpu]->disabled);
- local_irq_restore(irq1);
-}
-
-static void trace_bts_at(const struct bts_trace *trace, void *at)
-{
- struct bts_struct bts;
- int err = 0;
-
- WARN_ON_ONCE(!trace->read);
- if (!trace->read)
- return;
-
- err = trace->read(this_tracer, at, &bts);
- if (err < 0)
- return;
-
- switch (bts.qualifier) {
- case BTS_BRANCH:
- trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
- break;
- }
-}
-
-/*
- * Collect the trace on the current cpu and write it into the ftrace buffer.
- *
- * pre: tracing must be suspended on the current cpu
- */
-static void trace_bts_cpu(void *arg)
-{
- struct trace_array *tr = (struct trace_array *)arg;
- const struct bts_trace *trace;
- unsigned char *at;
-
- if (unlikely(!tr))
- return;
-
- if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
- return;
-
- if (unlikely(!this_tracer))
- return;
-
- trace = ds_read_bts(this_tracer);
- if (!trace)
- return;
-
- for (at = trace->ds.top; (void *)at < trace->ds.end;
- at += trace->ds.size)
- trace_bts_at(trace, at);
-
- for (at = trace->ds.begin; (void *)at < trace->ds.top;
- at += trace->ds.size)
- trace_bts_at(trace, at);
-}
-
-static void trace_bts_prepare(struct trace_iterator *iter)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (likely(per_cpu(tracer, cpu)))
- ds_suspend_bts(per_cpu(tracer, cpu));
- /*
- * We need to collect the trace on the respective cpu since ftrace
- * implicitly adds the record for the current cpu.
- * Once that is more flexible, we could collect the data from any cpu.
- */
- on_each_cpu(trace_bts_cpu, iter->tr, 1);
-
- for_each_online_cpu(cpu)
- if (likely(per_cpu(tracer, cpu)))
- ds_resume_bts(per_cpu(tracer, cpu));
- put_online_cpus();
-}
-
-static void trace_bts_close(struct trace_iterator *iter)
-{
- tracing_reset_online_cpus(iter->tr);
-}
-
-void trace_hw_branch_oops(void)
-{
- if (this_tracer) {
- ds_suspend_bts_noirq(this_tracer);
- trace_bts_cpu(hw_branch_trace);
- ds_resume_bts_noirq(this_tracer);
- }
-}
-
-struct tracer bts_tracer __read_mostly =
-{
- .name = "hw-branch-tracer",
- .init = bts_trace_init,
- .reset = bts_trace_reset,
- .print_header = bts_trace_print_header,
- .print_line = bts_trace_print_line,
- .start = bts_trace_start,
- .stop = bts_trace_stop,
- .open = trace_bts_prepare,
- .close = trace_bts_close,
-#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_hw_branches,
-#endif /* CONFIG_FTRACE_SELFTEST */
-};
-
-__init static int init_bts_trace(void)
-{
- register_hotcpu_notifier(&bts_hotcpu_notifier);
- return register_tracer(&bts_tracer);
-}
-device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b923d13e2fa..5cf8c602b88 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
static int save_lat_flag;
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
+static int start_irqsoff_tracer(struct trace_array *tr, int graph);
+
#ifdef CONFIG_PREEMPT_TRACER
static inline int
preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
# define irq_trace() (0)
#endif
+#define TRACE_DISPLAY_GRAPH 1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* display latency trace as call graph */
+ { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+ { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+ .val = 0,
+ .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
/*
* Sequence count - we record it when starting a measurement and
* skip the latency if the sequence has changed - some other section
@@ -67,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence;
#ifdef CONFIG_FUNCTION_TRACER
/*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ * incremented.
+ * 0 if the trace is to be ignored, and data->disabled
+ * is kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ * inside the #ifdef of the function graph tracer below.
+ * This is OK, since the function graph tracer is
+ * dependent on the function tracer.
*/
-static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int func_prolog_dec(struct trace_array *tr,
+ struct trace_array_cpu **data,
+ unsigned long *flags)
{
- struct trace_array *tr = irqsoff_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
long disabled;
int cpu;
@@ -86,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
*/
cpu = raw_smp_processor_id();
if (likely(!per_cpu(tracing_cpu, cpu)))
- return;
+ return 0;
- local_save_flags(flags);
+ local_save_flags(*flags);
/* slight chance to get a false positive on tracing_cpu */
- if (!irqs_disabled_flags(flags))
- return;
+ if (!irqs_disabled_flags(*flags))
+ return 0;
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ *data = tr->data[cpu];
+ disabled = atomic_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ return 1;
+
+ atomic_dec(&(*data)->disabled);
+
+ return 0;
+}
+
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+
+ if (!func_prolog_dec(tr, &data, &flags))
+ return;
+
+ trace_function(tr, ip, parent_ip, flags, preempt_count());
atomic_dec(&data->disabled);
}
@@ -108,6 +156,132 @@ static struct ftrace_ops trace_ops __read_mostly =
};
#endif /* CONFIG_FUNCTION_TRACER */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+ int cpu;
+
+ if (!(bit & TRACE_DISPLAY_GRAPH))
+ return -EINVAL;
+
+ if (!(is_graph() ^ set))
+ return 0;
+
+ stop_irqsoff_tracer(irqsoff_trace, !set);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(tracing_cpu, cpu) = 0;
+
+ tracing_max_latency = 0;
+ tracing_reset_online_cpus(irqsoff_trace);
+
+ return start_irqsoff_tracer(irqsoff_trace, set);
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int ret;
+ int pc;
+
+ if (!func_prolog_dec(tr, &data, &flags))
+ return 0;
+
+ pc = preempt_count();
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+
+ return ret;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_dec(tr, &data, &flags))
+ return;
+
+ pc = preempt_count();
+ __trace_graph_return(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+}
+
+static void irqsoff_trace_open(struct trace_iterator *iter)
+{
+ if (is_graph())
+ graph_trace_open(iter);
+
+}
+
+static void irqsoff_trace_close(struct trace_iterator *iter)
+{
+ if (iter->private)
+ graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
+ TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+ /*
+ * In graph mode call the graph tracer output function,
+ * otherwise go with the TRACE_FN event handler
+ */
+ if (is_graph())
+ return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_print_header(struct seq_file *s)
+{
+ if (is_graph())
+ print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+ else
+ trace_default_header(s);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ if (is_graph())
+ trace_graph_function(tr, ip, parent_ip, flags, pc);
+ else
+ trace_function(tr, ip, parent_ip, flags, pc);
+}
+
+#else
+#define __trace_function trace_function
+
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+ return -EINVAL;
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+ return -1;
+}
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
+static void irqsoff_print_header(struct seq_file *s) { }
+static void irqsoff_trace_open(struct trace_iterator *iter) { }
+static void irqsoff_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
/*
* Should this new latency be reported/recorded?
*/
@@ -129,15 +303,10 @@ check_critical_timing(struct trace_array *tr,
unsigned long parent_ip,
int cpu)
{
- unsigned long latency, t0, t1;
cycle_t T0, T1, delta;
unsigned long flags;
int pc;
- /*
- * usecs conversion is slow so we try to delay the conversion
- * as long as possible:
- */
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
delta = T1-T0;
@@ -155,20 +324,19 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(delta))
goto out_unlock;
- trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
-
- latency = nsecs_to_usecs(delta);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ /* Skip 5 functions to get to the irq/preempt enable function */
+ __trace_stack(tr, flags, 5, pc);
if (data->critical_sequence != max_sequence)
goto out_unlock;
- tracing_max_latency = delta;
- t0 = nsecs_to_usecs(T0);
- t1 = nsecs_to_usecs(T1);
-
data->critical_end = parent_ip;
- update_max_tr_single(tr, current, cpu);
+ if (likely(!is_tracing_stopped())) {
+ tracing_max_latency = delta;
+ update_max_tr_single(tr, current, cpu);
+ }
max_sequence++;
@@ -178,8 +346,7 @@ out_unlock:
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
- tracing_reset(tr, cpu);
- trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
static inline void
@@ -208,11 +375,10 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
data->critical_start = parent_ip ? : ip;
- tracing_reset(tr, cpu);
local_save_flags(flags);
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ __trace_function(tr, ip, parent_ip, flags, preempt_count());
per_cpu(tracing_cpu, cpu) = 1;
@@ -246,7 +412,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
atomic_inc(&data->disabled);
local_save_flags(flags);
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ __trace_function(tr, ip, parent_ip, flags, preempt_count());
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@@ -355,19 +521,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
-static void start_irqsoff_tracer(struct trace_array *tr)
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
{
- register_ftrace_function(&trace_ops);
- if (tracing_is_enabled())
+ int ret = 0;
+
+ if (!graph)
+ ret = register_ftrace_function(&trace_ops);
+ else
+ ret = register_ftrace_graph(&irqsoff_graph_return,
+ &irqsoff_graph_entry);
+
+ if (!ret && tracing_is_enabled())
tracer_enabled = 1;
else
tracer_enabled = 0;
+
+ return ret;
}
-static void stop_irqsoff_tracer(struct trace_array *tr)
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
- unregister_ftrace_function(&trace_ops);
+
+ if (!graph)
+ unregister_ftrace_function(&trace_ops);
+ else
+ unregister_ftrace_graph();
}
static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -379,12 +558,15 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
- start_irqsoff_tracer(tr);
+ tracing_reset_online_cpus(tr);
+
+ if (start_irqsoff_tracer(tr, is_graph()))
+ printk(KERN_ERR "failed to start irqsoff tracer\n");
}
static void irqsoff_tracer_reset(struct trace_array *tr)
{
- stop_irqsoff_tracer(tr);
+ stop_irqsoff_tracer(tr, is_graph());
if (!save_lat_flag)
trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -416,9 +598,16 @@ static struct tracer irqsoff_tracer __read_mostly =
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = 1,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
+ .use_max_tr = 1,
};
# define register_irqsoff(trace) register_tracer(&trace)
#else
@@ -442,9 +631,16 @@ static struct tracer preemptoff_tracer __read_mostly =
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = 1,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
+ .use_max_tr = 1,
};
# define register_preemptoff(trace) register_tracer(&trace)
#else
@@ -470,9 +666,16 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = 1,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
+ .use_max_tr = 1,
};
# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 00000000000..3c5c5dfea0b
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,135 @@
+/*
+ * kdb helper for dumping the ftrace buffer
+ *
+ * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
+ *
+ * ftrace_dump_buf based on ftrace_dump:
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ */
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+static void ftrace_dump_buf(int skip_lines, long cpu_file)
+{
+ /* use static because iter can be a bit big for the stack */
+ static struct trace_iterator iter;
+ unsigned int old_userobj;
+ int cnt = 0, cpu;
+
+ trace_init_global_iter(&iter);
+
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&iter.tr->data[cpu]->disabled);
+ }
+
+ old_userobj = trace_flags;
+
+ /* don't look at user memory in panic mode */
+ trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
+ kdb_printf("Dumping ftrace buffer:\n");
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter.seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter.iter_flags |= TRACE_FILE_LAT_FMT;
+ iter.pos = -1;
+
+ if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ for_each_tracing_cpu(cpu) {
+ iter.buffer_iter[cpu] =
+ ring_buffer_read_prepare(iter.tr->buffer, cpu);
+ ring_buffer_read_start(iter.buffer_iter[cpu]);
+ tracing_iter_reset(&iter, cpu);
+ }
+ } else {
+ iter.cpu_file = cpu_file;
+ iter.buffer_iter[cpu_file] =
+ ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
+ ring_buffer_read_start(iter.buffer_iter[cpu_file]);
+ tracing_iter_reset(&iter, cpu_file);
+ }
+ if (!trace_empty(&iter))
+ trace_find_next_entry_inc(&iter);
+ while (!trace_empty(&iter)) {
+ if (!cnt)
+ kdb_printf("---------------------------------\n");
+ cnt++;
+
+ if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
+ print_trace_line(&iter);
+ if (!skip_lines)
+ trace_printk_seq(&iter.seq);
+ else
+ skip_lines--;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ goto out;
+ }
+
+ if (!cnt)
+ kdb_printf(" (ftrace buffer empty)\n");
+ else
+ kdb_printf("---------------------------------\n");
+
+out:
+ trace_flags = old_userobj;
+
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&iter.tr->data[cpu]->disabled);
+ }
+
+ for_each_tracing_cpu(cpu)
+ if (iter.buffer_iter[cpu])
+ ring_buffer_read_finish(iter.buffer_iter[cpu]);
+}
+
+/*
+ * kdb_ftdump - Dump the ftrace log buffer
+ */
+static int kdb_ftdump(int argc, const char **argv)
+{
+ int skip_lines = 0;
+ long cpu_file;
+ char *cp;
+
+ if (argc > 2)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ skip_lines = simple_strtol(argv[1], &cp, 0);
+ if (*cp)
+ skip_lines = 0;
+ }
+
+ if (argc == 2) {
+ cpu_file = simple_strtol(argv[2], &cp, 0);
+ if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
+ !cpu_online(cpu_file))
+ return KDB_BADINT;
+ } else {
+ cpu_file = TRACE_PIPE_ALL_CPU;
+ }
+
+ kdb_trap_printk++;
+ ftrace_dump_buf(skip_lines, cpu_file);
+ kdb_trap_printk--;
+
+ return 0;
+}
+
+static __init int kdb_ftrace_register(void)
+{
+ kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
+ "Dump ftrace log", 0, KDB_REPEAT_NONE);
+ return 0;
+}
+
+late_initcall(kdb_ftrace_register);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 00000000000..2dec9bcde8b
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1847 @@
+/*
+ * Kprobes-based tracing events
+ *
+ * Created by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+#include <linux/perf_event.h>
+#include <linux/stringify.h>
+#include <linux/limits.h>
+#include <asm/bitsperlong.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define MAX_STRING_SIZE PATH_MAX
+#define KPROBE_EVENT_SYSTEM "kprobes"
+
+/* Reserved field names */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+
+const char *reserved_field_names[] = {
+ "common_type",
+ "common_flags",
+ "common_preempt_count",
+ "common_pid",
+ "common_tgid",
+ "common_lock_depth",
+ FIELD_STRING_IP,
+ FIELD_STRING_RETIP,
+ FIELD_STRING_FUNC,
+};
+
+/* Printing function type */
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
+ void *);
+#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
+#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
+
+/* Printing in basic type function template */
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
+static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
+ const char *name, \
+ void *data, void *ent)\
+{ \
+ return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
+} \
+static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
+
+/* data_rloc: data relative location, compatible with u32 */
+#define make_data_rloc(len, roffs) \
+ (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
+#define get_rloc_len(dl) ((u32)(dl) >> 16)
+#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
+
+static inline void *get_rloc_data(u32 *dl)
+{
+ return (u8 *)dl + get_rloc_offs(*dl);
+}
+
+/* For data_loc conversion */
+static inline void *get_loc_data(u32 *dl, void *ent)
+{
+ return (u8 *)ent + get_rloc_offs(*dl);
+}
+
+/*
+ * Convert data_rloc to data_loc:
+ * data_rloc stores the offset from data_rloc itself, but data_loc
+ * stores the offset from event entry.
+ */
+#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
+
+/* For defining macros, define string/string_size types */
+typedef u32 string;
+typedef u32 string_size;
+
+/* Print type function for string type */
+static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
+ const char *name,
+ void *data, void *ent)
+{
+ int len = *(u32 *)data >> 16;
+
+ if (!len)
+ return trace_seq_printf(s, " %s=(fault)", name);
+ else
+ return trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+}
+static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
+
+/* Data fetch function type */
+typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
+
+struct fetch_param {
+ fetch_func_t fn;
+ void *data;
+};
+
+static __kprobes void call_fetch(struct fetch_param *fprm,
+ struct pt_regs *regs, void *dest)
+{
+ return fprm->fn(regs, fprm->data, dest);
+}
+
+#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
+/*
+ * Define macro for basic types - we don't need to define s* types, because
+ * we have to care only about bitwidth at recording time.
+ */
+#define DEFINE_BASIC_FETCH_FUNCS(method) \
+DEFINE_FETCH_##method(u8) \
+DEFINE_FETCH_##method(u16) \
+DEFINE_FETCH_##method(u32) \
+DEFINE_FETCH_##method(u64)
+
+#define CHECK_FETCH_FUNCS(method, fn) \
+ (((FETCH_FUNC_NAME(method, u8) == fn) || \
+ (FETCH_FUNC_NAME(method, u16) == fn) || \
+ (FETCH_FUNC_NAME(method, u32) == fn) || \
+ (FETCH_FUNC_NAME(method, u64) == fn) || \
+ (FETCH_FUNC_NAME(method, string) == fn) || \
+ (FETCH_FUNC_NAME(method, string_size) == fn)) \
+ && (fn != NULL))
+
+/* Data fetch function templates */
+#define DEFINE_FETCH_reg(type) \
+static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_register(regs, \
+ (unsigned int)((unsigned long)offset)); \
+}
+DEFINE_BASIC_FETCH_FUNCS(reg)
+/* No string on the register */
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
+
+#define DEFINE_FETCH_stack(type) \
+static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
+ (unsigned int)((unsigned long)offset)); \
+}
+DEFINE_BASIC_FETCH_FUNCS(stack)
+/* No string on the stack entry */
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
+
+#define DEFINE_FETCH_retval(type) \
+static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
+ void *dummy, void *dest) \
+{ \
+ *(type *)dest = (type)regs_return_value(regs); \
+}
+DEFINE_BASIC_FETCH_FUNCS(retval)
+/* No string on the retval */
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
+
+#define DEFINE_FETCH_memory(type) \
+static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
+ void *addr, void *dest) \
+{ \
+ type retval; \
+ if (probe_kernel_address(addr, retval)) \
+ *(type *)dest = 0; \
+ else \
+ *(type *)dest = retval; \
+}
+DEFINE_BASIC_FETCH_FUNCS(memory)
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
+ * length and relative data location.
+ */
+static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ long ret;
+ int maxlen = get_rloc_len(*(u32 *)dest);
+ u8 *dst = get_rloc_data(dest);
+ u8 *src = addr;
+ mm_segment_t old_fs = get_fs();
+ if (!maxlen)
+ return;
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ do
+ ret = __copy_from_user_inatomic(dst++, src++, 1);
+ while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
+ dst[-1] = '\0';
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret < 0) { /* Failed to fetch string */
+ ((u8 *)get_rloc_data(dest))[0] = '\0';
+ *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
+ } else
+ *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
+ get_rloc_offs(*(u32 *)dest));
+}
+/* Return the length of string -- including null terminal byte */
+static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ int ret, len = 0;
+ u8 c;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ do {
+ ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret < 0) /* Failed to check the length */
+ *(u32 *)dest = 0;
+ else
+ *(u32 *)dest = len;
+}
+
+/* Memory fetching by symbol */
+struct symbol_cache {
+ char *symbol;
+ long offset;
+ unsigned long addr;
+};
+
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+ sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+ if (sc->addr)
+ sc->addr += sc->offset;
+ return sc->addr;
+}
+
+static void free_symbol_cache(struct symbol_cache *sc)
+{
+ kfree(sc->symbol);
+ kfree(sc);
+}
+
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+ struct symbol_cache *sc;
+
+ if (!sym || strlen(sym) == 0)
+ return NULL;
+ sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+ if (!sc)
+ return NULL;
+
+ sc->symbol = kstrdup(sym, GFP_KERNEL);
+ if (!sc->symbol) {
+ kfree(sc);
+ return NULL;
+ }
+ sc->offset = offset;
+
+ update_symbol_cache(sc);
+ return sc;
+}
+
+#define DEFINE_FETCH_symbol(type) \
+static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct symbol_cache *sc = data; \
+ if (sc->addr) \
+ fetch_memory_##type(regs, (void *)sc->addr, dest); \
+ else \
+ *(type *)dest = 0; \
+}
+DEFINE_BASIC_FETCH_FUNCS(symbol)
+DEFINE_FETCH_symbol(string)
+DEFINE_FETCH_symbol(string_size)
+
+/* Dereference memory access function */
+struct deref_fetch_param {
+ struct fetch_param orig;
+ long offset;
+};
+
+#define DEFINE_FETCH_deref(type) \
+static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct deref_fetch_param *dprm = data; \
+ unsigned long addr; \
+ call_fetch(&dprm->orig, regs, &addr); \
+ if (addr) { \
+ addr += dprm->offset; \
+ fetch_memory_##type(regs, (void *)addr, dest); \
+ } else \
+ *(type *)dest = 0; \
+}
+DEFINE_BASIC_FETCH_FUNCS(deref)
+DEFINE_FETCH_deref(string)
+DEFINE_FETCH_deref(string_size)
+
+static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
+{
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+ kfree(data);
+}
+
+/* Default (unsigned long) fetch type */
+#define __DEFAULT_FETCH_TYPE(t) u##t
+#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
+#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
+#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
+
+/* Fetch types */
+enum {
+ FETCH_MTD_reg = 0,
+ FETCH_MTD_stack,
+ FETCH_MTD_retval,
+ FETCH_MTD_memory,
+ FETCH_MTD_symbol,
+ FETCH_MTD_deref,
+ FETCH_MTD_END,
+};
+
+#define ASSIGN_FETCH_FUNC(method, type) \
+ [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
+
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
+ {.name = _name, \
+ .size = _size, \
+ .is_signed = sign, \
+ .print = PRINT_TYPE_FUNC_NAME(ptype), \
+ .fmt = PRINT_TYPE_FMT_NAME(ptype), \
+ .fmttype = _fmttype, \
+ .fetch = { \
+ASSIGN_FETCH_FUNC(reg, ftype), \
+ASSIGN_FETCH_FUNC(stack, ftype), \
+ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(memory, ftype), \
+ASSIGN_FETCH_FUNC(symbol, ftype), \
+ASSIGN_FETCH_FUNC(deref, ftype), \
+ } \
+ }
+
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
+ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+
+#define FETCH_TYPE_STRING 0
+#define FETCH_TYPE_STRSIZE 1
+
+/* Fetch type information table */
+static const struct fetch_type {
+ const char *name; /* Name of type */
+ size_t size; /* Byte size of type */
+ int is_signed; /* Signed flag */
+ print_type_func_t print; /* Print functions */
+ const char *fmt; /* Fromat string */
+ const char *fmttype; /* Name in format file */
+ /* Fetch functions */
+ fetch_func_t fetch[FETCH_MTD_END];
+} fetch_type_table[] = {
+ /* Special types */
+ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
+ sizeof(u32), 1, "__data_loc char[]"),
+ [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
+ string_size, sizeof(u32), 0, "u32"),
+ /* Basic types */
+ ASSIGN_FETCH_TYPE(u8, u8, 0),
+ ASSIGN_FETCH_TYPE(u16, u16, 0),
+ ASSIGN_FETCH_TYPE(u32, u32, 0),
+ ASSIGN_FETCH_TYPE(u64, u64, 0),
+ ASSIGN_FETCH_TYPE(s8, u8, 1),
+ ASSIGN_FETCH_TYPE(s16, u16, 1),
+ ASSIGN_FETCH_TYPE(s32, u32, 1),
+ ASSIGN_FETCH_TYPE(s64, u64, 1),
+};
+
+static const struct fetch_type *find_fetch_type(const char *type)
+{
+ int i;
+
+ if (!type)
+ type = DEFAULT_FETCH_TYPE_STR;
+
+ for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
+ if (strcmp(type, fetch_type_table[i].name) == 0)
+ return &fetch_type_table[i];
+ return NULL;
+}
+
+/* Special function : only accept unsigned long */
+static __kprobes void fetch_stack_address(struct pt_regs *regs,
+ void *dummy, void *dest)
+{
+ *(unsigned long *)dest = kernel_stack_pointer(regs);
+}
+
+static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
+ fetch_func_t orig_fn)
+{
+ int i;
+
+ if (type != &fetch_type_table[FETCH_TYPE_STRING])
+ return NULL; /* Only string type needs size function */
+ for (i = 0; i < FETCH_MTD_END; i++)
+ if (type->fetch[i] == orig_fn)
+ return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
+
+ WARN_ON(1); /* This should not happen */
+ return NULL;
+}
+
+/**
+ * Kprobe event core functions
+ */
+
+struct probe_arg {
+ struct fetch_param fetch;
+ struct fetch_param fetch_size;
+ unsigned int offset; /* Offset from argument entry */
+ const char *name; /* Name of this argument */
+ const char *comm; /* Command of this argument */
+ const struct fetch_type *type; /* Type of this argument */
+};
+
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE 1
+#define TP_FLAG_PROFILE 2
+
+struct trace_probe {
+ struct list_head list;
+ struct kretprobe rp; /* Use rp.kp for kprobe use */
+ unsigned long nhit;
+ unsigned int flags; /* For TP_FLAG_* */
+ const char *symbol; /* symbol name */
+ struct ftrace_event_class class;
+ struct ftrace_event_call call;
+ ssize_t size; /* trace entry size */
+ unsigned int nr_args;
+ struct probe_arg args[];
+};
+
+#define SIZEOF_TRACE_PROBE(n) \
+ (offsetof(struct trace_probe, args) + \
+ (sizeof(struct probe_arg) * (n)))
+
+
+static __kprobes int probe_is_return(struct trace_probe *tp)
+{
+ return tp->rp.handler != NULL;
+}
+
+static __kprobes const char *probe_symbol(struct trace_probe *tp)
+{
+ return tp->symbol ? tp->symbol : "unknown";
+}
+
+static int register_probe_event(struct trace_probe *tp);
+static void unregister_probe_event(struct trace_probe *tp);
+
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+ struct pt_regs *regs);
+
+/* Check the name is good for event/group/fields */
+static int is_good_name(const char *name)
+{
+ if (!isalpha(*name) && *name != '_')
+ return 0;
+ while (*++name != '\0') {
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Allocate new trace_probe and initialize it (including kprobes).
+ */
+static struct trace_probe *alloc_trace_probe(const char *group,
+ const char *event,
+ void *addr,
+ const char *symbol,
+ unsigned long offs,
+ int nargs, int is_return)
+{
+ struct trace_probe *tp;
+ int ret = -ENOMEM;
+
+ tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(ret);
+
+ if (symbol) {
+ tp->symbol = kstrdup(symbol, GFP_KERNEL);
+ if (!tp->symbol)
+ goto error;
+ tp->rp.kp.symbol_name = tp->symbol;
+ tp->rp.kp.offset = offs;
+ } else
+ tp->rp.kp.addr = addr;
+
+ if (is_return)
+ tp->rp.handler = kretprobe_dispatcher;
+ else
+ tp->rp.kp.pre_handler = kprobe_dispatcher;
+
+ if (!event || !is_good_name(event)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ tp->call.class = &tp->class;
+ tp->call.name = kstrdup(event, GFP_KERNEL);
+ if (!tp->call.name)
+ goto error;
+
+ if (!group || !is_good_name(group)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ tp->class.system = kstrdup(group, GFP_KERNEL);
+ if (!tp->class.system)
+ goto error;
+
+ INIT_LIST_HEAD(&tp->list);
+ return tp;
+error:
+ kfree(tp->call.name);
+ kfree(tp->symbol);
+ kfree(tp);
+ return ERR_PTR(ret);
+}
+
+static void free_probe_arg(struct probe_arg *arg)
+{
+ if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ free_deref_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
+ free_symbol_cache(arg->fetch.data);
+ kfree(arg->name);
+ kfree(arg->comm);
+}
+
+static void free_trace_probe(struct trace_probe *tp)
+{
+ int i;
+
+ for (i = 0; i < tp->nr_args; i++)
+ free_probe_arg(&tp->args[i]);
+
+ kfree(tp->call.class->system);
+ kfree(tp->call.name);
+ kfree(tp->symbol);
+ kfree(tp);
+}
+
+static struct trace_probe *find_probe_event(const char *event,
+ const char *group)
+{
+ struct trace_probe *tp;
+
+ list_for_each_entry(tp, &probe_list, list)
+ if (strcmp(tp->call.name, event) == 0 &&
+ strcmp(tp->call.class->system, group) == 0)
+ return tp;
+ return NULL;
+}
+
+/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+static void unregister_trace_probe(struct trace_probe *tp)
+{
+ if (probe_is_return(tp))
+ unregister_kretprobe(&tp->rp);
+ else
+ unregister_kprobe(&tp->rp.kp);
+ list_del(&tp->list);
+ unregister_probe_event(tp);
+}
+
+/* Register a trace_probe and probe_event */
+static int register_trace_probe(struct trace_probe *tp)
+{
+ struct trace_probe *old_tp;
+ int ret;
+
+ mutex_lock(&probe_lock);
+
+ /* register as an event */
+ old_tp = find_probe_event(tp->call.name, tp->call.class->system);
+ if (old_tp) {
+ /* delete old event */
+ unregister_trace_probe(old_tp);
+ free_trace_probe(old_tp);
+ }
+ ret = register_probe_event(tp);
+ if (ret) {
+ pr_warning("Failed to register probe event(%d)\n", ret);
+ goto end;
+ }
+
+ tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
+ if (probe_is_return(tp))
+ ret = register_kretprobe(&tp->rp);
+ else
+ ret = register_kprobe(&tp->rp.kp);
+
+ if (ret) {
+ pr_warning("Could not insert probe(%d)\n", ret);
+ if (ret == -EILSEQ) {
+ pr_warning("Probing address(0x%p) is not an "
+ "instruction boundary.\n",
+ tp->rp.kp.addr);
+ ret = -EINVAL;
+ }
+ unregister_probe_event(tp);
+ } else
+ list_add_tail(&tp->list, &probe_list);
+end:
+ mutex_unlock(&probe_lock);
+ return ret;
+}
+
+/* Split symbol and offset. */
+static int split_symbol_offset(char *symbol, unsigned long *offset)
+{
+ char *tmp;
+ int ret;
+
+ if (!offset)
+ return -EINVAL;
+
+ tmp = strchr(symbol, '+');
+ if (tmp) {
+ /* skip sign because strict_strtol doesn't accept '+' */
+ ret = strict_strtoul(tmp + 1, 0, offset);
+ if (ret)
+ return ret;
+ *tmp = '\0';
+ } else
+ *offset = 0;
+ return 0;
+}
+
+#define PARAM_MAX_ARGS 16
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+static int parse_probe_vars(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, int is_return)
+{
+ int ret = 0;
+ unsigned long param;
+
+ if (strcmp(arg, "retval") == 0) {
+ if (is_return)
+ f->fn = t->fetch[FETCH_MTD_retval];
+ else
+ ret = -EINVAL;
+ } else if (strncmp(arg, "stack", 5) == 0) {
+ if (arg[5] == '\0') {
+ if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
+ f->fn = fetch_stack_address;
+ else
+ ret = -EINVAL;
+ } else if (isdigit(arg[5])) {
+ ret = strict_strtoul(arg + 5, 10, &param);
+ if (ret || param > PARAM_MAX_STACK)
+ ret = -EINVAL;
+ else {
+ f->fn = t->fetch[FETCH_MTD_stack];
+ f->data = (void *)param;
+ }
+ } else
+ ret = -EINVAL;
+ } else
+ ret = -EINVAL;
+ return ret;
+}
+
+/* Recursive argument parser */
+static int __parse_probe_arg(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, int is_return)
+{
+ int ret = 0;
+ unsigned long param;
+ long offset;
+ char *tmp;
+
+ switch (arg[0]) {
+ case '$':
+ ret = parse_probe_vars(arg + 1, t, f, is_return);
+ break;
+ case '%': /* named register */
+ ret = regs_query_register_offset(arg + 1);
+ if (ret >= 0) {
+ f->fn = t->fetch[FETCH_MTD_reg];
+ f->data = (void *)(unsigned long)ret;
+ ret = 0;
+ }
+ break;
+ case '@': /* memory or symbol */
+ if (isdigit(arg[1])) {
+ ret = strict_strtoul(arg + 1, 0, &param);
+ if (ret)
+ break;
+ f->fn = t->fetch[FETCH_MTD_memory];
+ f->data = (void *)param;
+ } else {
+ ret = split_symbol_offset(arg + 1, &offset);
+ if (ret)
+ break;
+ f->data = alloc_symbol_cache(arg + 1, offset);
+ if (f->data)
+ f->fn = t->fetch[FETCH_MTD_symbol];
+ }
+ break;
+ case '+': /* deref memory */
+ case '-':
+ tmp = strchr(arg, '(');
+ if (!tmp)
+ break;
+ *tmp = '\0';
+ ret = strict_strtol(arg + 1, 0, &offset);
+ if (ret)
+ break;
+ if (arg[0] == '-')
+ offset = -offset;
+ arg = tmp + 1;
+ tmp = strrchr(arg, ')');
+ if (tmp) {
+ struct deref_fetch_param *dprm;
+ const struct fetch_type *t2 = find_fetch_type(NULL);
+ *tmp = '\0';
+ dprm = kzalloc(sizeof(struct deref_fetch_param),
+ GFP_KERNEL);
+ if (!dprm)
+ return -ENOMEM;
+ dprm->offset = offset;
+ ret = __parse_probe_arg(arg, t2, &dprm->orig,
+ is_return);
+ if (ret)
+ kfree(dprm);
+ else {
+ f->fn = t->fetch[FETCH_MTD_deref];
+ f->data = (void *)dprm;
+ }
+ }
+ break;
+ }
+ if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
+ pr_info("%s type has no corresponding fetch method.\n",
+ t->name);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/* String length checking wrapper */
+static int parse_probe_arg(char *arg, struct trace_probe *tp,
+ struct probe_arg *parg, int is_return)
+{
+ const char *t;
+ int ret;
+
+ if (strlen(arg) > MAX_ARGSTR_LEN) {
+ pr_info("Argument is too long.: %s\n", arg);
+ return -ENOSPC;
+ }
+ parg->comm = kstrdup(arg, GFP_KERNEL);
+ if (!parg->comm) {
+ pr_info("Failed to allocate memory for command '%s'.\n", arg);
+ return -ENOMEM;
+ }
+ t = strchr(parg->comm, ':');
+ if (t) {
+ arg[t - parg->comm] = '\0';
+ t++;
+ }
+ parg->type = find_fetch_type(t);
+ if (!parg->type) {
+ pr_info("Unsupported type: %s\n", t);
+ return -EINVAL;
+ }
+ parg->offset = tp->size;
+ tp->size += parg->type->size;
+ ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+ if (ret >= 0) {
+ parg->fetch_size.fn = get_fetch_size_function(parg->type,
+ parg->fetch.fn);
+ parg->fetch_size.data = parg->fetch.data;
+ }
+ return ret;
+}
+
+/* Return 1 if name is reserved or already used by another argument */
+static int conflict_field_name(const char *name,
+ struct probe_arg *args, int narg)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
+ if (strcmp(reserved_field_names[i], name) == 0)
+ return 1;
+ for (i = 0; i < narg; i++)
+ if (strcmp(args[i].name, name) == 0)
+ return 1;
+ return 0;
+}
+
+static int create_trace_probe(int argc, char **argv)
+{
+ /*
+ * Argument syntax:
+ * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
+ * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
+ * Fetch args:
+ * $retval : fetch return value
+ * $stack : fetch stack address
+ * $stackN : fetch Nth of stack (N:0-)
+ * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
+ * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+ * %REG : fetch register REG
+ * Dereferencing memory fetch:
+ * +|-offs(ARG) : fetch memory at ARG +|- offs address.
+ * Alias name of args:
+ * NAME=FETCHARG : set NAME as alias of FETCHARG.
+ * Type of args:
+ * FETCHARG:TYPE : use TYPE instead of unsigned long.
+ */
+ struct trace_probe *tp;
+ int i, ret = 0;
+ int is_return = 0, is_delete = 0;
+ char *symbol = NULL, *event = NULL, *group = NULL;
+ char *arg;
+ unsigned long offset = 0;
+ void *addr = NULL;
+ char buf[MAX_EVENT_NAME_LEN];
+
+ /* argc must be >= 1 */
+ if (argv[0][0] == 'p')
+ is_return = 0;
+ else if (argv[0][0] == 'r')
+ is_return = 1;
+ else if (argv[0][0] == '-')
+ is_delete = 1;
+ else {
+ pr_info("Probe definition must be started with 'p', 'r' or"
+ " '-'.\n");
+ return -EINVAL;
+ }
+
+ if (argv[0][1] == ':') {
+ event = &argv[0][2];
+ if (strchr(event, '/')) {
+ group = event;
+ event = strchr(group, '/') + 1;
+ event[-1] = '\0';
+ if (strlen(group) == 0) {
+ pr_info("Group name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (strlen(event) == 0) {
+ pr_info("Event name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (!group)
+ group = KPROBE_EVENT_SYSTEM;
+
+ if (is_delete) {
+ if (!event) {
+ pr_info("Delete command needs an event name.\n");
+ return -EINVAL;
+ }
+ mutex_lock(&probe_lock);
+ tp = find_probe_event(event, group);
+ if (!tp) {
+ mutex_unlock(&probe_lock);
+ pr_info("Event %s/%s doesn't exist.\n", group, event);
+ return -ENOENT;
+ }
+ /* delete an event */
+ unregister_trace_probe(tp);
+ free_trace_probe(tp);
+ mutex_unlock(&probe_lock);
+ return 0;
+ }
+
+ if (argc < 2) {
+ pr_info("Probe point is not specified.\n");
+ return -EINVAL;
+ }
+ if (isdigit(argv[1][0])) {
+ if (is_return) {
+ pr_info("Return probe point must be a symbol.\n");
+ return -EINVAL;
+ }
+ /* an address specified */
+ ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
+ if (ret) {
+ pr_info("Failed to parse address.\n");
+ return ret;
+ }
+ } else {
+ /* a symbol specified */
+ symbol = argv[1];
+ /* TODO: support .init module functions */
+ ret = split_symbol_offset(symbol, &offset);
+ if (ret) {
+ pr_info("Failed to parse symbol.\n");
+ return ret;
+ }
+ if (offset && is_return) {
+ pr_info("Return probe must be used without offset.\n");
+ return -EINVAL;
+ }
+ }
+ argc -= 2; argv += 2;
+
+ /* setup a probe */
+ if (!event) {
+ /* Make a new event name */
+ if (symbol)
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
+ is_return ? 'r' : 'p', symbol, offset);
+ else
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
+ is_return ? 'r' : 'p', addr);
+ event = buf;
+ }
+ tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
+ is_return);
+ if (IS_ERR(tp)) {
+ pr_info("Failed to allocate trace_probe.(%d)\n",
+ (int)PTR_ERR(tp));
+ return PTR_ERR(tp);
+ }
+
+ /* parse arguments */
+ ret = 0;
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ /* Increment count for freeing args in error case */
+ tp->nr_args++;
+
+ /* Parse argument name */
+ arg = strchr(argv[i], '=');
+ if (arg) {
+ *arg++ = '\0';
+ tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+ } else {
+ arg = argv[i];
+ /* If argument name is omitted, set "argN" */
+ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
+ tp->args[i].name = kstrdup(buf, GFP_KERNEL);
+ }
+
+ if (!tp->args[i].name) {
+ pr_info("Failed to allocate argument[%d] name.\n", i);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (!is_good_name(tp->args[i].name)) {
+ pr_info("Invalid argument[%d] name: %s\n",
+ i, tp->args[i].name);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (conflict_field_name(tp->args[i].name, tp->args, i)) {
+ pr_info("Argument[%d] name '%s' conflicts with "
+ "another field.\n", i, argv[i]);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /* Parse fetch argument */
+ ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
+ if (ret) {
+ pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+ goto error;
+ }
+ }
+
+ ret = register_trace_probe(tp);
+ if (ret)
+ goto error;
+ return 0;
+
+error:
+ free_trace_probe(tp);
+ return ret;
+}
+
+static void cleanup_all_probes(void)
+{
+ struct trace_probe *tp;
+
+ mutex_lock(&probe_lock);
+ /* TODO: Use batch unregistration */
+ while (!list_empty(&probe_list)) {
+ tp = list_entry(probe_list.next, struct trace_probe, list);
+ unregister_trace_probe(tp);
+ free_trace_probe(tp);
+ }
+ mutex_unlock(&probe_lock);
+}
+
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&probe_lock);
+ return seq_list_start(&probe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &probe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&probe_lock);
+}
+
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_probe *tp = v;
+ int i;
+
+ seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
+ seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
+
+ if (!tp->symbol)
+ seq_printf(m, " 0x%p", tp->rp.kp.addr);
+ else if (tp->rp.kp.offset)
+ seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
+ else
+ seq_printf(m, " %s", probe_symbol(tp));
+
+ for (i = 0; i < tp->nr_args; i++)
+ seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
+ seq_printf(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_seq_show
+};
+
+static int probes_open(struct inode *inode, struct file *file)
+{
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ cleanup_all_probes();
+
+ return seq_open(file, &probes_seq_op);
+}
+
+static int command_trace_probe(const char *buf)
+{
+ char **argv;
+ int argc = 0, ret = 0;
+
+ argv = argv_split(GFP_KERNEL, buf, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = create_trace_probe(argc, argv);
+
+ argv_free(argv);
+ return ret;
+}
+
+#define WRITE_BUFSIZE 128
+
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ char *kbuf, *tmp;
+ int ret;
+ size_t done;
+ size_t size;
+
+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ ret = done = 0;
+ while (done < count) {
+ size = count - done;
+ if (size >= WRITE_BUFSIZE)
+ size = WRITE_BUFSIZE - 1;
+ if (copy_from_user(kbuf, buffer + done, size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ kbuf[size] = '\0';
+ tmp = strchr(kbuf, '\n');
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - kbuf + 1;
+ } else if (done + size < count) {
+ pr_warning("Line length is too long: "
+ "Should be less than %d.", WRITE_BUFSIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+ done += size;
+ /* Remove comments */
+ tmp = strchr(kbuf, '#');
+ if (tmp)
+ *tmp = '\0';
+
+ ret = command_trace_probe(kbuf);
+ if (ret)
+ goto out;
+ }
+ ret = done;
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+static const struct file_operations kprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+};
+
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_probe *tp = v;
+
+ seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
+ tp->rp.kp.nmissed);
+
+ return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations kprobe_profile_ops = {
+ .owner = THIS_MODULE,
+ .open = profile_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* Sum up total data length for dynamic arraies (strings) */
+static __kprobes int __get_data_size(struct trace_probe *tp,
+ struct pt_regs *regs)
+{
+ int i, ret = 0;
+ u32 len;
+
+ for (i = 0; i < tp->nr_args; i++)
+ if (unlikely(tp->args[i].fetch_size.fn)) {
+ call_fetch(&tp->args[i].fetch_size, regs, &len);
+ ret += len;
+ }
+
+ return ret;
+}
+
+/* Store the value of each argument */
+static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
+ struct pt_regs *regs,
+ u8 *data, int maxlen)
+{
+ int i;
+ u32 end = tp->size;
+ u32 *dl; /* Data (relative) location */
+
+ for (i = 0; i < tp->nr_args; i++) {
+ if (unlikely(tp->args[i].fetch_size.fn)) {
+ /*
+ * First, we set the relative location and
+ * maximum data length to *dl
+ */
+ dl = (u32 *)(data + tp->args[i].offset);
+ *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
+ /* Then try to fetch string or dynamic array data */
+ call_fetch(&tp->args[i].fetch, regs, dl);
+ /* Reduce maximum length */
+ end += get_rloc_len(*dl);
+ maxlen -= get_rloc_len(*dl);
+ /* Trick here, convert data_rloc to data_loc */
+ *dl = convert_rloc_to_loc(*dl,
+ ent_size + tp->args[i].offset);
+ } else
+ /* Just fetching data normally */
+ call_fetch(&tp->args[i].fetch, regs,
+ data + tp->args[i].offset);
+ }
+}
+
+/* Kprobe handler */
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+ struct kprobe_trace_entry_head *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ int size, dsize, pc;
+ unsigned long irq_flags;
+ struct ftrace_event_call *call = &tp->call;
+
+ tp->nhit++;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ dsize = __get_data_size(tp, regs);
+ size = sizeof(*entry) + tp->size + dsize;
+
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = (unsigned long)kp->addr;
+ store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
+
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+}
+
+/* Kretprobe handler */
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+ struct kretprobe_trace_entry_head *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ int size, pc, dsize;
+ unsigned long irq_flags;
+ struct ftrace_event_call *call = &tp->call;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ dsize = __get_data_size(tp, regs);
+ size = sizeof(*entry) + tp->size + dsize;
+
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->func = (unsigned long)tp->rp.kp.addr;
+ entry->ret_ip = (unsigned long)ri->ret_addr;
+ store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
+
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+}
+
+/* Event entry printers */
+enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct kprobe_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+ u8 *data;
+ int i;
+
+ field = (struct kprobe_trace_entry_head *)iter->ent;
+ tp = container_of(event, struct trace_probe, call.event);
+
+ if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, ")"))
+ goto partial;
+
+ data = (u8 *)&field[1];
+ for (i = 0; i < tp->nr_args; i++)
+ if (!tp->args[i].type->print(s, tp->args[i].name,
+ data + tp->args[i].offset, field))
+ goto partial;
+
+ if (!trace_seq_puts(s, "\n"))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct kretprobe_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+ u8 *data;
+ int i;
+
+ field = (struct kretprobe_trace_entry_head *)iter->ent;
+ tp = container_of(event, struct trace_probe, call.event);
+
+ if (!trace_seq_printf(s, "%s: (", tp->call.name))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, " <- "))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ goto partial;
+
+ if (!trace_seq_puts(s, ")"))
+ goto partial;
+
+ data = (u8 *)&field[1];
+ for (i = 0; i < tp->nr_args; i++)
+ if (!tp->args[i].type->print(s, tp->args[i].name,
+ data + tp->args[i].offset, field))
+ goto partial;
+
+ if (!trace_seq_puts(s, "\n"))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int probe_event_enable(struct ftrace_event_call *call)
+{
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ tp->flags |= TP_FLAG_TRACE;
+ if (probe_is_return(tp))
+ return enable_kretprobe(&tp->rp);
+ else
+ return enable_kprobe(&tp->rp.kp);
+}
+
+static void probe_event_disable(struct ftrace_event_call *call)
+{
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ tp->flags &= ~TP_FLAG_TRACE;
+ if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
+ if (probe_is_return(tp))
+ disable_kretprobe(&tp->rp);
+ else
+ disable_kprobe(&tp->rp.kp);
+ }
+}
+
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed) \
+ do { \
+ ret = trace_define_field(event_call, #type, name, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed, \
+ FILTER_OTHER); \
+ if (ret) \
+ return ret; \
+ } while (0)
+
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct kprobe_trace_entry_head field;
+ struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+ DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+ /* Set argument names as fields */
+ for (i = 0; i < tp->nr_args; i++) {
+ ret = trace_define_field(event_call, tp->args[i].type->fmttype,
+ tp->args[i].name,
+ sizeof(field) + tp->args[i].offset,
+ tp->args[i].type->size,
+ tp->args[i].type->is_signed,
+ FILTER_OTHER);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct kretprobe_trace_entry_head field;
+ struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+ DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+ DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+ /* Set argument names as fields */
+ for (i = 0; i < tp->nr_args; i++) {
+ ret = trace_define_field(event_call, tp->args[i].type->fmttype,
+ tp->args[i].name,
+ sizeof(field) + tp->args[i].offset,
+ tp->args[i].type->size,
+ tp->args[i].type->is_signed,
+ FILTER_OTHER);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
+{
+ int i;
+ int pos = 0;
+
+ const char *fmt, *arg;
+
+ if (!probe_is_return(tp)) {
+ fmt = "(%lx)";
+ arg = "REC->" FIELD_STRING_IP;
+ } else {
+ fmt = "(%lx <- %lx)";
+ arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ }
+
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
+
+ for (i = 0; i < tp->nr_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
+ tp->args[i].name, tp->args[i].type->fmt);
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+
+ for (i = 0; i < tp->nr_args; i++) {
+ if (strcmp(tp->args[i].type->name, "string") == 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", __get_str(%s)",
+ tp->args[i].name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+ tp->args[i].name);
+ }
+
+#undef LEN_OR_ZERO
+
+ /* return the length of print_fmt */
+ return pos;
+}
+
+static int set_print_fmt(struct trace_probe *tp)
+{
+ int len;
+ char *print_fmt;
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_print_fmt(tp, NULL, 0);
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_print_fmt(tp, print_fmt, len + 1);
+ tp->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+#ifdef CONFIG_PERF_EVENTS
+
+/* Kprobe profile handler */
+static __kprobes void kprobe_perf_func(struct kprobe *kp,
+ struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+ struct ftrace_event_call *call = &tp->call;
+ struct kprobe_trace_entry_head *entry;
+ struct hlist_head *head;
+ int size, __size, dsize;
+ int rctx;
+
+ dsize = __get_data_size(tp, regs);
+ __size = sizeof(*entry) + tp->size + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+ "profile buffer not large enough"))
+ return;
+
+ entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ if (!entry)
+ return;
+
+ entry->ip = (unsigned long)kp->addr;
+ memset(&entry[1], 0, dsize);
+ store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
+
+ head = this_cpu_ptr(call->perf_events);
+ perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+}
+
+/* Kretprobe profile handler */
+static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+ struct ftrace_event_call *call = &tp->call;
+ struct kretprobe_trace_entry_head *entry;
+ struct hlist_head *head;
+ int size, __size, dsize;
+ int rctx;
+
+ dsize = __get_data_size(tp, regs);
+ __size = sizeof(*entry) + tp->size + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+ "profile buffer not large enough"))
+ return;
+
+ entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ if (!entry)
+ return;
+
+ entry->func = (unsigned long)tp->rp.kp.addr;
+ entry->ret_ip = (unsigned long)ri->ret_addr;
+ store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
+
+ head = this_cpu_ptr(call->perf_events);
+ perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+}
+
+static int probe_perf_enable(struct ftrace_event_call *call)
+{
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ tp->flags |= TP_FLAG_PROFILE;
+
+ if (probe_is_return(tp))
+ return enable_kretprobe(&tp->rp);
+ else
+ return enable_kprobe(&tp->rp.kp);
+}
+
+static void probe_perf_disable(struct ftrace_event_call *call)
+{
+ struct trace_probe *tp = (struct trace_probe *)call->data;
+
+ tp->flags &= ~TP_FLAG_PROFILE;
+
+ if (!(tp->flags & TP_FLAG_TRACE)) {
+ if (probe_is_return(tp))
+ disable_kretprobe(&tp->rp);
+ else
+ disable_kprobe(&tp->rp.kp);
+ }
+}
+#endif /* CONFIG_PERF_EVENTS */
+
+static __kprobes
+int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return probe_event_enable(event);
+ case TRACE_REG_UNREGISTER:
+ probe_event_disable(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return probe_perf_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ probe_perf_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static __kprobes
+int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+
+ if (tp->flags & TP_FLAG_TRACE)
+ kprobe_trace_func(kp, regs);
+#ifdef CONFIG_PERF_EVENTS
+ if (tp->flags & TP_FLAG_PROFILE)
+ kprobe_perf_func(kp, regs);
+#endif
+ return 0; /* We don't tweek kernel, so just return 0 */
+}
+
+static __kprobes
+int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+
+ if (tp->flags & TP_FLAG_TRACE)
+ kretprobe_trace_func(ri, regs);
+#ifdef CONFIG_PERF_EVENTS
+ if (tp->flags & TP_FLAG_PROFILE)
+ kretprobe_perf_func(ri, regs);
+#endif
+ return 0; /* We don't tweek kernel, so just return 0 */
+}
+
+static struct trace_event_functions kretprobe_funcs = {
+ .trace = print_kretprobe_event
+};
+
+static struct trace_event_functions kprobe_funcs = {
+ .trace = print_kprobe_event
+};
+
+static int register_probe_event(struct trace_probe *tp)
+{
+ struct ftrace_event_call *call = &tp->call;
+ int ret;
+
+ /* Initialize ftrace_event_call */
+ INIT_LIST_HEAD(&call->class->fields);
+ if (probe_is_return(tp)) {
+ call->event.funcs = &kretprobe_funcs;
+ call->class->define_fields = kretprobe_event_define_fields;
+ } else {
+ call->event.funcs = &kprobe_funcs;
+ call->class->define_fields = kprobe_event_define_fields;
+ }
+ if (set_print_fmt(tp) < 0)
+ return -ENOMEM;
+ ret = register_ftrace_event(&call->event);
+ if (!ret) {
+ kfree(call->print_fmt);
+ return -ENODEV;
+ }
+ call->flags = 0;
+ call->class->reg = kprobe_register;
+ call->data = tp;
+ ret = trace_add_event_call(call);
+ if (ret) {
+ pr_info("Failed to register kprobe event: %s\n", call->name);
+ kfree(call->print_fmt);
+ unregister_ftrace_event(&call->event);
+ }
+ return ret;
+}
+
+static void unregister_probe_event(struct trace_probe *tp)
+{
+ /* tp->event is unregistered in trace_remove_event_call() */
+ trace_remove_event_call(&tp->call);
+ kfree(tp->call.print_fmt);
+}
+
+/* Make a debugfs interface for controling probe points */
+static __init int init_kprobe_trace(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+ NULL, &kprobe_events_ops);
+
+ /* Event list interface */
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'kprobe_events' entry\n");
+
+ /* Profile interface */
+ entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
+ NULL, &kprobe_profile_ops);
+
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'kprobe_profile' entry\n");
+ return 0;
+}
+fs_initcall(init_kprobe_trace);
+
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static int kprobe_trace_selftest_target(int a1, int a2, int a3,
+ int a4, int a5, int a6)
+{
+ return a1 + a2 + a3 + a4 + a5 + a6;
+}
+
+static __init int kprobe_trace_self_tests_init(void)
+{
+ int ret, warn = 0;
+ int (*target)(int, int, int, int, int, int);
+ struct trace_probe *tp;
+
+ target = kprobe_trace_selftest_target;
+
+ pr_info("Testing kprobe tracing: ");
+
+ ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
+ "$stack $stack0 +0($stack)");
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error on probing function entry.\n");
+ warn++;
+ } else {
+ /* Enable trace point */
+ tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tp == NULL)) {
+ pr_warning("error on getting new probe.\n");
+ warn++;
+ } else
+ probe_event_enable(&tp->call);
+ }
+
+ ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
+ "$retval");
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error on probing function return.\n");
+ warn++;
+ } else {
+ /* Enable trace point */
+ tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tp == NULL)) {
+ pr_warning("error on getting new probe.\n");
+ warn++;
+ } else
+ probe_event_enable(&tp->call);
+ }
+
+ if (warn)
+ goto end;
+
+ ret = target(1, 2, 3, 4, 5, 6);
+
+ ret = command_trace_probe("-:testprobe");
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error on deleting a probe.\n");
+ warn++;
+ }
+
+ ret = command_trace_probe("-:testprobe2");
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error on deleting a probe.\n");
+ warn++;
+ }
+
+end:
+ cleanup_all_probes();
+ if (warn)
+ pr_cont("NG: Some tests are failed. Please check them.\n");
+ else
+ pr_cont("OK\n");
+ return 0;
+}
+
+late_initcall(kprobe_trace_self_tests_init);
+
+#endif
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d53b45ed080..017fa376505 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/mmiotrace.h>
#include <linux/pci.h>
+#include <linux/slab.h>
#include <linux/time.h>
#include <asm/atomic.h>
@@ -307,11 +308,13 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
+ struct ftrace_event_call *call = &event_mmiotrace_rw;
+ struct ring_buffer *buffer = tr->buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
int pc = preempt_count();
- event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW,
+ event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
sizeof(*entry), 0, pc);
if (!event) {
atomic_inc(&dropped_count);
@@ -319,7 +322,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
}
entry = ring_buffer_event_data(event);
entry->rw = *rw;
- trace_buffer_unlock_commit(tr, event, 0, pc);
+
+ if (!filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -333,11 +338,13 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
+ struct ftrace_event_call *call = &event_mmiotrace_map;
+ struct ring_buffer *buffer = tr->buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
int pc = preempt_count();
- event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP,
+ event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
sizeof(*entry), 0, pc);
if (!event) {
atomic_inc(&dropped_count);
@@ -345,7 +352,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
}
entry = ring_buffer_event_data(event);
entry->map = *map;
- trace_buffer_unlock_commit(tr, event, 0, pc);
+
+ if (!filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e..02272baa220 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -16,21 +16,25 @@
DECLARE_RWSEM(trace_event_mutex);
-DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
-EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
-
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
static int next_event_type = __TRACE_LAST_TYPE + 1;
-void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
{
int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+ int ret;
- s->buffer[len] = 0;
- seq_puts(m, s->buffer);
+ ret = seq_write(m, s->buffer, len);
- trace_seq_init(s);
+ /*
+ * Only reset this buffer if we successfully wrote to the
+ * seq_file buffer.
+ */
+ if (!ret)
+ trace_seq_init(s);
+
+ return ret;
}
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -70,6 +74,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
* @s: trace sequence descriptor
* @fmt: printf format string
*
+ * It returns 0 if the trace oversizes the buffer's free
+ * space, 1 otherwise.
+ *
* The tracer may use either sequence operations or its own
* copy to user routines. To simplify formating of a trace
* trace_seq_printf is used to store strings into a special
@@ -83,7 +90,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
va_list ap;
int ret;
- if (!len)
+ if (s->full || !len)
return 0;
va_start(ap, fmt);
@@ -91,12 +98,14 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
va_end(ap);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len)
+ if (ret >= len) {
+ s->full = 1;
return 0;
+ }
s->len += ret;
- return len;
+ return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_printf);
@@ -117,14 +126,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
int len = (PAGE_SIZE - 1) - s->len;
int ret;
- if (!len)
+ if (s->full || !len)
return 0;
ret = vsnprintf(s->buffer + s->len, len, fmt, args);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len)
+ if (ret >= len) {
+ s->full = 1;
return 0;
+ }
s->len += ret;
@@ -137,14 +148,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
int len = (PAGE_SIZE - 1) - s->len;
int ret;
- if (!len)
+ if (s->full || !len)
return 0;
ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len)
+ if (ret >= len) {
+ s->full = 1;
return 0;
+ }
s->len += ret;
@@ -165,9 +178,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
{
int len = strlen(str);
- if (len > ((PAGE_SIZE - 1) - s->len))
+ if (s->full)
return 0;
+ if (len > ((PAGE_SIZE - 1) - s->len)) {
+ s->full = 1;
+ return 0;
+ }
+
memcpy(s->buffer + s->len, str, len);
s->len += len;
@@ -176,19 +194,30 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
int trace_seq_putc(struct trace_seq *s, unsigned char c)
{
- if (s->len >= (PAGE_SIZE - 1))
+ if (s->full)
+ return 0;
+
+ if (s->len >= (PAGE_SIZE - 1)) {
+ s->full = 1;
return 0;
+ }
s->buffer[s->len++] = c;
return 1;
}
+EXPORT_SYMBOL(trace_seq_putc);
int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
{
- if (len > ((PAGE_SIZE - 1) - s->len))
+ if (s->full)
return 0;
+ if (len > ((PAGE_SIZE - 1) - s->len)) {
+ s->full = 1;
+ return 0;
+ }
+
memcpy(s->buffer + s->len, mem, len);
s->len += len;
@@ -201,6 +230,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
const unsigned char *data = mem;
int i, j;
+ if (s->full)
+ return 0;
+
#ifdef __BIG_ENDIAN
for (i = 0, j = 0; i < len; i++) {
#else
@@ -218,9 +250,14 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
{
void *ret;
- if (len > ((PAGE_SIZE - 1) - s->len))
+ if (s->full)
return NULL;
+ if (len > ((PAGE_SIZE - 1) - s->len)) {
+ s->full = 1;
+ return NULL;
+ }
+
ret = s->buffer + s->len;
s->len += len;
@@ -231,8 +268,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
{
unsigned char *p;
- if (s->len >= (PAGE_SIZE - 1))
+ if (s->full)
+ return 0;
+
+ if (s->len >= (PAGE_SIZE - 1)) {
+ s->full = 1;
return 0;
+ }
+
p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
if (!IS_ERR(p)) {
p = mangle_path(s->buffer + s->len, p, "\n");
@@ -245,6 +288,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
return 1;
}
+ s->full = 1;
return 0;
}
@@ -309,6 +353,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
}
EXPORT_SYMBOL(ftrace_print_symbols_seq);
+const char *
+ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+{
+ int i;
+ const char *ret = p->buffer + p->len;
+
+ for (i = 0; i < buf_len; i++)
+ trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_hex_seq);
+
#ifdef CONFIG_KRETPROBES
static inline const char *kretprobed(const char *name)
{
@@ -371,6 +430,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
unsigned long vmstart = 0;
int ret = 1;
+ if (s->full)
+ return 0;
+
if (mm) {
const struct vm_area_struct *vma;
@@ -408,7 +470,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
* since individual threads might have already quit!
*/
rcu_read_lock();
- task = find_task_by_vpid(entry->ent.tgid);
+ task = find_task_by_vpid(entry->tgid);
if (task)
mm = get_task_mm(task);
rcu_read_unlock();
@@ -461,18 +523,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
return ret;
}
-static int
-lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+/**
+ * trace_print_lat_fmt - print the irq, preempt and lockdep fields
+ * @s: trace seq struct to write to
+ * @entry: The trace entry field from the ring buffer
+ *
+ * Prints the generic fields of irqs off, in hard or softirq, preempt
+ * count and lock depth.
+ */
+int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
int hardirq, softirq;
- char comm[TASK_COMM_LEN];
+ int ret;
- trace_find_cmdline(entry->pid, comm);
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
- if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
- comm, entry->pid, cpu,
+ if (!trace_seq_printf(s, "%c%c%c",
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
'X' : '.',
@@ -483,8 +550,31 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
return 0;
if (entry->preempt_count)
- return trace_seq_printf(s, "%x", entry->preempt_count);
- return trace_seq_puts(s, ".");
+ ret = trace_seq_printf(s, "%x", entry->preempt_count);
+ else
+ ret = trace_seq_putc(s, '.');
+
+ if (!ret)
+ return 0;
+
+ if (entry->lock_depth < 0)
+ return trace_seq_putc(s, '.');
+
+ return trace_seq_printf(s, "%d", entry->lock_depth);
+}
+
+static int
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+ char comm[TASK_COMM_LEN];
+
+ trace_find_cmdline(entry->pid, comm);
+
+ if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
+ comm, entry->pid, cpu))
+ return 0;
+
+ return trace_print_lat_fmt(s, entry);
}
static unsigned long preempt_mark_thresh = 100;
@@ -649,6 +739,9 @@ int register_ftrace_event(struct trace_event *event)
if (WARN_ON(!event))
goto out;
+ if (WARN_ON(!event->funcs))
+ goto out;
+
INIT_LIST_HEAD(&event->list);
if (!event->type) {
@@ -681,14 +774,14 @@ int register_ftrace_event(struct trace_event *event)
goto out;
}
- if (event->trace == NULL)
- event->trace = trace_nop_print;
- if (event->raw == NULL)
- event->raw = trace_nop_print;
- if (event->hex == NULL)
- event->hex = trace_nop_print;
- if (event->binary == NULL)
- event->binary = trace_nop_print;
+ if (event->funcs->trace == NULL)
+ event->funcs->trace = trace_nop_print;
+ if (event->funcs->raw == NULL)
+ event->funcs->raw = trace_nop_print;
+ if (event->funcs->hex == NULL)
+ event->funcs->hex = trace_nop_print;
+ if (event->funcs->binary == NULL)
+ event->funcs->binary = trace_nop_print;
key = event->type & (EVENT_HASHSIZE - 1);
@@ -730,13 +823,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
* Standard events
*/
-enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return TRACE_TYPE_HANDLED;
}
/* TRACE_FN */
-static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -763,7 +858,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
-static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
@@ -777,7 +873,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -790,7 +887,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -803,14 +901,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_fn_event = {
- .type = TRACE_FN,
+static struct trace_event_functions trace_fn_funcs = {
.trace = trace_fn_trace,
.raw = trace_fn_raw,
.hex = trace_fn_hex,
.binary = trace_fn_bin,
};
+static struct trace_event trace_fn_event = {
+ .type = TRACE_FN,
+ .funcs = &trace_fn_funcs,
+};
+
/* TRACE_CTX an TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -839,13 +941,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_print(iter, "==>");
}
static enum print_line_t trace_wake_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
return trace_ctxwake_print(iter, " +");
}
@@ -858,7 +961,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- task_state_char(field->prev_state);
+ S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
@@ -873,12 +976,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_raw(iter, 0);
}
-static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_raw(iter, '+');
}
@@ -893,7 +998,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- task_state_char(field->prev_state);
+ S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
@@ -907,18 +1012,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_hex(iter, 0);
}
-static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_hex(iter, '+');
}
static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct ctx_switch_entry *field;
struct trace_seq *s = &iter->seq;
@@ -935,81 +1042,34 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_ctx_event = {
- .type = TRACE_CTX,
+static struct trace_event_functions trace_ctx_funcs = {
.trace = trace_ctx_print,
.raw = trace_ctx_raw,
.hex = trace_ctx_hex,
.binary = trace_ctxwake_bin,
};
-static struct trace_event trace_wake_event = {
- .type = TRACE_WAKE,
+static struct trace_event trace_ctx_event = {
+ .type = TRACE_CTX,
+ .funcs = &trace_ctx_funcs,
+};
+
+static struct trace_event_functions trace_wake_funcs = {
.trace = trace_wake_print,
.raw = trace_wake_raw,
.hex = trace_wake_hex,
.binary = trace_ctxwake_bin,
};
-/* TRACE_SPECIAL */
-static enum print_line_t trace_special_print(struct trace_iterator *iter,
- int flags)
-{
- struct special_entry *field;
-
- trace_assign_type(field, iter->ent);
-
- if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
- field->arg1,
- field->arg2,
- field->arg3))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t trace_special_hex(struct trace_iterator *iter,
- int flags)
-{
- struct special_entry *field;
- struct trace_seq *s = &iter->seq;
-
- trace_assign_type(field, iter->ent);
-
- SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
-
- return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t trace_special_bin(struct trace_iterator *iter,
- int flags)
-{
- struct special_entry *field;
- struct trace_seq *s = &iter->seq;
-
- trace_assign_type(field, iter->ent);
-
- SEQ_PUT_FIELD_RET(s, field->arg1);
- SEQ_PUT_FIELD_RET(s, field->arg2);
- SEQ_PUT_FIELD_RET(s, field->arg3);
-
- return TRACE_TYPE_HANDLED;
-}
-
-static struct trace_event trace_special_event = {
- .type = TRACE_SPECIAL,
- .trace = trace_special_print,
- .raw = trace_special_print,
- .hex = trace_special_hex,
- .binary = trace_special_bin,
+static struct trace_event trace_wake_event = {
+ .type = TRACE_WAKE,
+ .funcs = &trace_wake_funcs,
};
/* TRACE_STACK */
static enum print_line_t trace_stack_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct stack_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1037,17 +1097,18 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
+static struct trace_event_functions trace_stack_funcs = {
+ .trace = trace_stack_print,
+};
+
static struct trace_event trace_stack_event = {
.type = TRACE_STACK,
- .trace = trace_stack_print,
- .raw = trace_special_print,
- .hex = trace_special_hex,
- .binary = trace_special_bin,
+ .funcs = &trace_stack_funcs,
};
/* TRACE_USER_STACK */
static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct userstack_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1066,17 +1127,19 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
+static struct trace_event_functions trace_user_stack_funcs = {
+ .trace = trace_user_stack_print,
+};
+
static struct trace_event trace_user_stack_event = {
.type = TRACE_USER_STACK,
- .trace = trace_user_stack_print,
- .raw = trace_special_print,
- .hex = trace_special_hex,
- .binary = trace_special_bin,
+ .funcs = &trace_user_stack_funcs,
};
/* TRACE_BPRINT */
static enum print_line_t
-trace_bprint_print(struct trace_iterator *iter, int flags)
+trace_bprint_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
@@ -1101,7 +1164,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
static enum print_line_t
-trace_bprint_raw(struct trace_iterator *iter, int flags)
+trace_bprint_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct bprint_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1120,16 +1184,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
+static struct trace_event_functions trace_bprint_funcs = {
+ .trace = trace_bprint_print,
+ .raw = trace_bprint_raw,
+};
static struct trace_event trace_bprint_event = {
.type = TRACE_BPRINT,
- .trace = trace_bprint_print,
- .raw = trace_bprint_raw,
+ .funcs = &trace_bprint_funcs,
};
/* TRACE_PRINT */
static enum print_line_t trace_print_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1148,7 +1215,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct print_entry *field;
@@ -1163,18 +1231,21 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
-static struct trace_event trace_print_event = {
- .type = TRACE_PRINT,
+static struct trace_event_functions trace_print_funcs = {
.trace = trace_print_print,
.raw = trace_print_raw,
};
+static struct trace_event trace_print_event = {
+ .type = TRACE_PRINT,
+ .funcs = &trace_print_funcs,
+};
+
static struct trace_event *events[] __initdata = {
&trace_fn_event,
&trace_ctx_event,
&trace_wake_event,
- &trace_special_event,
&trace_stack_event,
&trace_user_stack_event,
&trace_bprint_event,
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c3..c038eba0492 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,9 @@ extern void trace_event_read_unlock(void);
extern struct trace_event *ftrace_find_event(int type);
extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
- int flags);
+ int flags, struct trace_event *event);
+extern int
+trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
/* used by module unregistering */
extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index 8a30d9874cd..00000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * ring buffer based C-state tracer
- *
- * Arjan van de Ven <arjan@linux.intel.com>
- * Copyright (C) 2008 Intel Corporation
- *
- * Much is borrowed from trace_boot.c which is
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <trace/power.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-static struct trace_array *power_trace;
-static int __read_mostly trace_power_enabled;
-
-static void probe_power_start(struct power_trace *it, unsigned int type,
- unsigned int level)
-{
- if (!trace_power_enabled)
- return;
-
- memset(it, 0, sizeof(struct power_trace));
- it->state = level;
- it->type = type;
- it->stamp = ktime_get();
-}
-
-
-static void probe_power_end(struct power_trace *it)
-{
- struct ftrace_event_call *call = &event_power;
- struct ring_buffer_event *event;
- struct trace_power *entry;
- struct trace_array_cpu *data;
- struct trace_array *tr = power_trace;
-
- if (!trace_power_enabled)
- return;
-
- preempt_disable();
- it->end = ktime_get();
- data = tr->data[smp_processor_id()];
-
- event = trace_buffer_lock_reserve(tr, TRACE_POWER,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->state_data = *it;
- if (!filter_check_discard(call, entry, tr->buffer, event))
- trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
- preempt_enable();
-}
-
-static void probe_power_mark(struct power_trace *it, unsigned int type,
- unsigned int level)
-{
- struct ftrace_event_call *call = &event_power;
- struct ring_buffer_event *event;
- struct trace_power *entry;
- struct trace_array_cpu *data;
- struct trace_array *tr = power_trace;
-
- if (!trace_power_enabled)
- return;
-
- memset(it, 0, sizeof(struct power_trace));
- it->state = level;
- it->type = type;
- it->stamp = ktime_get();
- preempt_disable();
- it->end = it->stamp;
- data = tr->data[smp_processor_id()];
-
- event = trace_buffer_lock_reserve(tr, TRACE_POWER,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->state_data = *it;
- if (!filter_check_discard(call, entry, tr->buffer, event))
- trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
- preempt_enable();
-}
-
-static int tracing_power_register(void)
-{
- int ret;
-
- ret = register_trace_power_start(probe_power_start);
- if (ret) {
- pr_info("power trace: Couldn't activate tracepoint"
- " probe to trace_power_start\n");
- return ret;
- }
- ret = register_trace_power_end(probe_power_end);
- if (ret) {
- pr_info("power trace: Couldn't activate tracepoint"
- " probe to trace_power_end\n");
- goto fail_start;
- }
- ret = register_trace_power_mark(probe_power_mark);
- if (ret) {
- pr_info("power trace: Couldn't activate tracepoint"
- " probe to trace_power_mark\n");
- goto fail_end;
- }
- return ret;
-fail_end:
- unregister_trace_power_end(probe_power_end);
-fail_start:
- unregister_trace_power_start(probe_power_start);
- return ret;
-}
-
-static void start_power_trace(struct trace_array *tr)
-{
- trace_power_enabled = 1;
-}
-
-static void stop_power_trace(struct trace_array *tr)
-{
- trace_power_enabled = 0;
-}
-
-static void power_trace_reset(struct trace_array *tr)
-{
- trace_power_enabled = 0;
- unregister_trace_power_start(probe_power_start);
- unregister_trace_power_end(probe_power_end);
- unregister_trace_power_mark(probe_power_mark);
-}
-
-
-static int power_trace_init(struct trace_array *tr)
-{
- int cpu;
- power_trace = tr;
-
- trace_power_enabled = 1;
- tracing_power_register();
-
- for_each_cpu(cpu, cpu_possible_mask)
- tracing_reset(tr, cpu);
- return 0;
-}
-
-static enum print_line_t power_print_line(struct trace_iterator *iter)
-{
- int ret = 0;
- struct trace_entry *entry = iter->ent;
- struct trace_power *field ;
- struct power_trace *it;
- struct trace_seq *s = &iter->seq;
- struct timespec stamp;
- struct timespec duration;
-
- trace_assign_type(field, entry);
- it = &field->state_data;
- stamp = ktime_to_timespec(it->stamp);
- duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
-
- if (entry->type == TRACE_POWER) {
- if (it->type == POWER_CSTATE)
- ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
- stamp.tv_sec,
- stamp.tv_nsec,
- it->state, iter->cpu,
- duration.tv_sec,
- duration.tv_nsec);
- if (it->type == POWER_PSTATE)
- ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
- stamp.tv_sec,
- stamp.tv_nsec,
- it->state, iter->cpu);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
- }
- return TRACE_TYPE_UNHANDLED;
-}
-
-static void power_print_header(struct seq_file *s)
-{
- seq_puts(s, "# TIMESTAMP STATE EVENT\n");
- seq_puts(s, "# | | |\n");
-}
-
-static struct tracer power_tracer __read_mostly =
-{
- .name = "power",
- .init = power_trace_init,
- .start = start_power_trace,
- .stop = stop_power_trace,
- .reset = power_trace_reset,
- .print_line = power_print_line,
- .print_header = power_print_header,
-};
-
-static int init_power_trace(void)
-{
- return register_tracer(&power_tracer);
-}
-device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 9bece9687b6..2547d8813cf 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/module.h>
-#include <linux/marker.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/list.h>
@@ -155,25 +154,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
EXPORT_SYMBOL_GPL(__ftrace_vprintk);
static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+t_start(struct seq_file *m, loff_t *pos)
{
- const char **fmt = m->private;
- const char **next = fmt;
-
- (*pos)++;
+ const char **fmt = __start___trace_bprintk_fmt + *pos;
if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
return NULL;
-
- next = fmt;
- m->private = ++next;
-
return fmt;
}
-static void *t_start(struct seq_file *m, loff_t *pos)
+static void *t_next(struct seq_file *m, void * v, loff_t *pos)
{
- return t_next(m, NULL, pos);
+ (*pos)++;
+ return t_start(m, pos);
}
static int t_show(struct seq_file *m, void *v)
@@ -182,7 +175,7 @@ static int t_show(struct seq_file *m, void *v)
const char *str = *fmt;
int i;
- seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
+ seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
/*
* Tabs and new lines need to be converted.
@@ -224,15 +217,7 @@ static const struct seq_operations show_format_seq_ops = {
static int
ftrace_formats_open(struct inode *inode, struct file *file)
{
- int ret;
-
- ret = seq_open(file, &show_format_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
-
- m->private = __start___trace_bprintk_fmt;
- }
- return ret;
+ return seq_open(file, &show_format_seq_ops);
}
static const struct file_operations ftrace_formats_fops = {
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979..8f758d070c4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,9 +20,37 @@ static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
static int sched_stopped;
+
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_context_switch;
+ struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+
+ if (!filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
static void
-probe_sched_switch(struct rq *__rq, struct task_struct *prev,
- struct task_struct *next)
+probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -49,8 +77,38 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
local_irq_restore(flags);
}
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_wakeup;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ struct ring_buffer *buffer = tr->buffer;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr->buffer, flags, 6, pc);
+ ftrace_trace_userstack(tr->buffer, flags, pc);
+}
+
static void
-probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
+probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -80,21 +138,21 @@ static int tracing_sched_register(void)
{
int ret;
- ret = register_trace_sched_wakeup(probe_sched_wakeup);
+ ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return ret;
}
- ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
+ ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = register_trace_sched_switch(probe_sched_switch);
+ ret = register_trace_sched_switch(probe_sched_switch, NULL);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
@@ -103,17 +161,17 @@ static int tracing_sched_register(void)
return ret;
fail_deprobe_wake_new:
- unregister_trace_sched_wakeup_new(probe_sched_wakeup);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
fail_deprobe:
- unregister_trace_sched_wakeup(probe_sched_wakeup);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
return ret;
}
static void tracing_sched_unregister(void)
{
- unregister_trace_sched_switch(probe_sched_switch);
- unregister_trace_sched_wakeup_new(probe_sched_wakeup);
- unregister_trace_sched_wakeup(probe_sched_wakeup);
+ unregister_trace_sched_switch(probe_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}
static void tracing_start_sched_switch(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index eacb2722517..7319559ed59 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,66 +24,106 @@ static int __read_mostly tracer_enabled;
static struct task_struct *wakeup_task;
static int wakeup_cpu;
+static int wakeup_current_cpu;
static unsigned wakeup_prio = -1;
static int wakeup_rt;
-static raw_spinlock_t wakeup_lock =
- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t wakeup_lock =
+ (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static void wakeup_reset(struct trace_array *tr);
static void __wakeup_reset(struct trace_array *tr);
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
static int save_lat_flag;
+#define TRACE_DISPLAY_GRAPH 1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* display latency trace as call graph */
+ { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+ { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+ .val = 0,
+ .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
#ifdef CONFIG_FUNCTION_TRACER
+
/*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the wakeup function tracers.
+ *
+ * Returns 1 if it is OK to continue, and preemption
+ * is disabled and data->disabled is incremented.
+ * 0 if the trace is to be ignored, and preemption
+ * is not disabled and data->disabled is
+ * kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ * inside the #ifdef of the function graph tracer below.
+ * This is OK, since the function graph tracer is
+ * dependent on the function tracer.
*/
-static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int
+func_prolog_preempt_disable(struct trace_array *tr,
+ struct trace_array_cpu **data,
+ int *pc)
{
- struct trace_array *tr = wakeup_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
long disabled;
- int resched;
int cpu;
- int pc;
if (likely(!wakeup_task))
- return;
+ return 0;
- pc = preempt_count();
- resched = ftrace_preempt_disable();
+ *pc = preempt_count();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ if (cpu != wakeup_current_cpu)
+ goto out_enable;
+
+ *data = tr->data[cpu];
+ disabled = atomic_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
- local_irq_save(flags);
- __raw_spin_lock(&wakeup_lock);
+ return 1;
- if (unlikely(!wakeup_task))
- goto unlock;
+out:
+ atomic_dec(&(*data)->disabled);
- /*
- * The task can't disappear because it needs to
- * wake up first, and we have the wakeup_lock.
- */
- if (task_cpu(wakeup_task) != cpu)
- goto unlock;
+out_enable:
+ preempt_enable_notrace();
+ return 0;
+}
- trace_function(tr, ip, parent_ip, flags, pc);
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return;
- unlock:
- __raw_spin_unlock(&wakeup_lock);
+ local_irq_save(flags);
+ trace_function(tr, ip, parent_ip, flags, pc);
local_irq_restore(flags);
- out:
atomic_dec(&data->disabled);
-
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_ops __read_mostly =
@@ -92,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
};
#endif /* CONFIG_FUNCTION_TRACER */
+static int start_func_tracer(int graph)
+{
+ int ret;
+
+ if (!graph)
+ ret = register_ftrace_function(&trace_ops);
+ else
+ ret = register_ftrace_graph(&wakeup_graph_return,
+ &wakeup_graph_entry);
+
+ if (!ret && tracing_is_enabled())
+ tracer_enabled = 1;
+ else
+ tracer_enabled = 0;
+
+ return ret;
+}
+
+static void stop_func_tracer(int graph)
+{
+ tracer_enabled = 0;
+
+ if (!graph)
+ unregister_ftrace_function(&trace_ops);
+ else
+ unregister_ftrace_graph();
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+
+ if (!(bit & TRACE_DISPLAY_GRAPH))
+ return -EINVAL;
+
+ if (!(is_graph() ^ set))
+ return 0;
+
+ stop_func_tracer(!set);
+
+ wakeup_reset(wakeup_trace);
+ tracing_max_latency = 0;
+
+ return start_func_tracer(set);
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc, ret = 0;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return 0;
+
+ local_save_flags(flags);
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+ preempt_enable_notrace();
+
+ return ret;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return;
+
+ local_save_flags(flags);
+ __trace_graph_return(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+
+ preempt_enable_notrace();
+ return;
+}
+
+static void wakeup_trace_open(struct trace_iterator *iter)
+{
+ if (is_graph())
+ graph_trace_open(iter);
+}
+
+static void wakeup_trace_close(struct trace_iterator *iter)
+{
+ if (iter->private)
+ graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+ /*
+ * In graph mode call the graph tracer output function,
+ * otherwise go with the TRACE_FN event handler
+ */
+ if (is_graph())
+ return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_print_header(struct seq_file *s)
+{
+ if (is_graph())
+ print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+ else
+ trace_default_header(s);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ if (is_graph())
+ trace_graph_function(tr, ip, parent_ip, flags, pc);
+ else
+ trace_function(tr, ip, parent_ip, flags, pc);
+}
+#else
+#define __trace_function trace_function
+
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+ return -EINVAL;
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+ return -1;
+}
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+static void wakeup_print_header(struct seq_file *s) { }
+static void wakeup_trace_open(struct trace_iterator *iter) { }
+static void wakeup_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
/*
* Should this new latency be reported/recorded?
*/
@@ -107,11 +297,19 @@ static int report_latency(cycle_t delta)
return 1;
}
+static void
+probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
+{
+ if (task != wakeup_task)
+ return;
+
+ wakeup_current_cpu = cpu;
+}
+
static void notrace
-probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
+probe_wakeup_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
{
- unsigned long latency = 0, t0 = 0, t1 = 0;
struct trace_array_cpu *data;
cycle_t T0, T1, delta;
unsigned long flags;
@@ -145,7 +343,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
goto out;
local_irq_save(flags);
- __raw_spin_lock(&wakeup_lock);
+ arch_spin_lock(&wakeup_lock);
/* We could race with grabbing wakeup_lock */
if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -154,13 +352,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
/* The task we are waiting for is waking up */
data = wakeup_trace->data[wakeup_cpu];
- trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
- /*
- * usecs conversion is slow so we try to delay the conversion
- * as long as possible:
- */
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
delta = T1-T0;
@@ -168,17 +362,14 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
if (!report_latency(delta))
goto out_unlock;
- latency = nsecs_to_usecs(delta);
-
- tracing_max_latency = delta;
- t0 = nsecs_to_usecs(T0);
- t1 = nsecs_to_usecs(T1);
-
- update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+ if (likely(!is_tracing_stopped())) {
+ tracing_max_latency = delta;
+ update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+ }
out_unlock:
__wakeup_reset(wakeup_trace);
- __raw_spin_unlock(&wakeup_lock);
+ arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -186,11 +377,6 @@ out:
static void __wakeup_reset(struct trace_array *tr)
{
- int cpu;
-
- for_each_possible_cpu(cpu)
- tracing_reset(tr, cpu);
-
wakeup_cpu = -1;
wakeup_prio = -1;
@@ -204,15 +390,17 @@ static void wakeup_reset(struct trace_array *tr)
{
unsigned long flags;
+ tracing_reset_online_cpus(tr);
+
local_irq_save(flags);
- __raw_spin_lock(&wakeup_lock);
+ arch_spin_lock(&wakeup_lock);
__wakeup_reset(tr);
- __raw_spin_unlock(&wakeup_lock);
+ arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
}
static void
-probe_wakeup(struct rq *rq, struct task_struct *p, int success)
+probe_wakeup(void *ignore, struct task_struct *p, int success)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
@@ -237,7 +425,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
goto out;
/* interrupts should be off from try_to_wake_up */
- __raw_spin_lock(&wakeup_lock);
+ arch_spin_lock(&wakeup_lock);
/* check for races. */
if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -247,6 +435,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
__wakeup_reset(wakeup_trace);
wakeup_cpu = task_cpu(p);
+ wakeup_current_cpu = wakeup_cpu;
wakeup_prio = p->prio;
wakeup_task = p;
@@ -263,10 +452,10 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
* is not called by an assembly function (where as schedule is)
* it should be safe to use it here.
*/
- trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
out_locked:
- __raw_spin_unlock(&wakeup_lock);
+ arch_spin_unlock(&wakeup_lock);
out:
atomic_dec(&wakeup_trace->data[cpu]->disabled);
}
@@ -275,27 +464,34 @@ static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
- ret = register_trace_sched_wakeup(probe_wakeup);
+ ret = register_trace_sched_wakeup(probe_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return;
}
- ret = register_trace_sched_wakeup_new(probe_wakeup);
+ ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = register_trace_sched_switch(probe_wakeup_sched_switch);
+ ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
goto fail_deprobe_wake_new;
}
+ ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_migrate_task\n");
+ return;
+ }
+
wakeup_reset(tr);
/*
@@ -307,27 +503,24 @@ static void start_wakeup_tracer(struct trace_array *tr)
*/
smp_wmb();
- register_ftrace_function(&trace_ops);
-
- if (tracing_is_enabled())
- tracer_enabled = 1;
- else
- tracer_enabled = 0;
+ if (start_func_tracer(is_graph()))
+ printk(KERN_ERR "failed to start wakeup tracer\n");
return;
fail_deprobe_wake_new:
- unregister_trace_sched_wakeup_new(probe_wakeup);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
- unregister_trace_sched_wakeup(probe_wakeup);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
}
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- unregister_ftrace_function(&trace_ops);
- unregister_trace_sched_switch(probe_wakeup_sched_switch);
- unregister_trace_sched_wakeup_new(probe_wakeup);
- unregister_trace_sched_wakeup(probe_wakeup);
+ stop_func_tracer(is_graph());
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
+ unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
}
static int __wakeup_tracer_init(struct trace_array *tr)
@@ -382,9 +575,16 @@ static struct tracer wakeup_tracer __read_mostly =
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
.print_max = 1,
+ .print_header = wakeup_print_header,
+ .print_line = wakeup_print_line,
+ .flags = &tracer_flags,
+ .set_flag = wakeup_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
+ .open = wakeup_trace_open,
+ .close = wakeup_trace_close,
+ .use_max_tr = 1,
};
static struct tracer wakeup_rt_tracer __read_mostly =
@@ -396,9 +596,16 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.stop = wakeup_tracer_stop,
.wait_pipe = poll_wait_pipe,
.print_max = 1,
+ .print_header = wakeup_print_header,
+ .print_line = wakeup_print_line,
+ .flags = &tracer_flags,
+ .set_flag = wakeup_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
+ .open = wakeup_trace_open,
+ .close = wakeup_trace_close,
+ .use_max_tr = 1,
};
__init static int init_wakeup_tracer(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd..659732eba07 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
#include <linux/stringify.h>
#include <linux/kthread.h>
#include <linux/delay.h>
+#include <linux/slab.h>
static inline int trace_valid_entry(struct trace_entry *entry)
{
@@ -12,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_WAKE:
case TRACE_STACK:
case TRACE_PRINT:
- case TRACE_SPECIAL:
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
case TRACE_GRAPH_RET:
- case TRACE_HW_BRANCHES:
return 1;
}
return 0;
@@ -28,7 +27,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
struct trace_entry *entry;
unsigned int loops = 0;
- while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
+ while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
entry = ring_buffer_event_data(event);
/*
@@ -66,7 +65,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
/* Don't allow flipping of max traces now */
local_irq_save(flags);
- __raw_spin_lock(&ftrace_max_lock);
+ arch_spin_lock(&ftrace_max_lock);
cnt = ring_buffer_entries(tr->buffer);
@@ -84,7 +83,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
break;
}
tracing_on();
- __raw_spin_unlock(&ftrace_max_lock);
+ arch_spin_unlock(&ftrace_max_lock);
local_irq_restore(flags);
if (count)
@@ -254,7 +253,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
-static void __ftrace_dump(bool disable_tracing);
+static void
+__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
@@ -265,7 +265,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
if (ftrace_dump_on_oops)
- __ftrace_dump(false);
+ __ftrace_dump(false, DUMP_ALL);
return 0;
}
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* to detect and recover from possible hangs
*/
tracing_reset_online_cpus(tr);
+ set_graph_array(tr);
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry_watchdog);
if (ret) {
@@ -557,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
static int trace_wakeup_test_thread(void *data)
{
/* Make this a RT thread, doesn't need to be too high */
- struct sched_param param = { .sched_priority = 5 };
+ static const struct sched_param param = { .sched_priority = 5 };
struct completion *x = data;
sched_setscheduler(current, SCHED_FIFO, &param);
@@ -688,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
}
#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-#ifdef CONFIG_SYSPROF_TRACER
-int
-trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
-{
- unsigned long count;
- int ret;
-
- /* start the tracing */
- ret = tracer_init(trace, tr);
- if (ret) {
- warn_failed_init_tracer(trace, ret);
- return ret;
- }
-
- /* Sleep for a 1/10 of a second */
- msleep(100);
- /* stop the tracing. */
- tracing_stop();
- /* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
- trace->reset(tr);
- tracing_start();
-
- if (!ret && !count) {
- printk(KERN_CONT ".. no entries found ..");
- ret = -1;
- }
-
- return ret;
-}
-#endif /* CONFIG_SYSPROF_TRACER */
-
#ifdef CONFIG_BRANCH_TRACER
int
trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
@@ -752,58 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
}
#endif /* CONFIG_BRANCH_TRACER */
-#ifdef CONFIG_HW_BRANCH_TRACER
-int
-trace_selftest_startup_hw_branches(struct tracer *trace,
- struct trace_array *tr)
-{
- struct trace_iterator *iter;
- struct tracer tracer;
- unsigned long count;
- int ret;
-
- if (!trace->open) {
- printk(KERN_CONT "missing open function...");
- return -1;
- }
-
- ret = tracer_init(trace, tr);
- if (ret) {
- warn_failed_init_tracer(trace, ret);
- return ret;
- }
-
- /*
- * The hw-branch tracer needs to collect the trace from the various
- * cpu trace buffers - before tracing is stopped.
- */
- iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
-
- memcpy(&tracer, trace, sizeof(tracer));
-
- iter->trace = &tracer;
- iter->tr = tr;
- iter->pos = -1;
- mutex_init(&iter->mutex);
-
- trace->open(iter);
-
- mutex_destroy(&iter->mutex);
- kfree(iter);
-
- tracing_stop();
-
- ret = trace_test_buffer(tr, &count);
- trace->reset(tr);
- tracing_start();
-
- if (!ret && !count) {
- printk(KERN_CONT "no entries found..");
- ret = -1;
- }
-
- return ret;
-}
-#endif /* CONFIG_HW_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71db..4c5dead0c23 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
};
static unsigned long max_stack_size;
-static raw_spinlock_t max_stack_lock =
- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t max_stack_lock =
+ (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
static int stack_trace_disabled __read_mostly;
static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
return;
local_irq_save(flags);
- __raw_spin_lock(&max_stack_lock);
+ arch_spin_lock(&max_stack_lock);
/* a race could have already updated it */
if (this_size <= max_stack_size)
@@ -103,19 +103,19 @@ static inline void check_stack(void)
}
out:
- __raw_spin_unlock(&max_stack_lock);
+ arch_spin_unlock(&max_stack_lock);
local_irq_restore(flags);
}
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip)
{
- int cpu, resched;
+ int cpu;
if (unlikely(!ftrace_enabled || stack_trace_disabled))
return;
- resched = ftrace_preempt_disable();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
/* no atomic needed, we only modify this variable by this cpu */
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
out:
per_cpu(trace_active, cpu)--;
/* prevent recursion in schedule */
- ftrace_preempt_enable(resched);
+ preempt_enable_notrace();
}
static struct ftrace_ops trace_ops __read_mostly =
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
unsigned long val, flags;
char buf[64];
int ret;
+ int cpu;
if (count >= sizeof(buf))
return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
return ret;
local_irq_save(flags);
- __raw_spin_lock(&max_stack_lock);
+
+ /*
+ * In case we trace inside arch_spin_lock() or after (NMI),
+ * we will cause circular lock, so we also need to increase
+ * the percpu trace_active here.
+ */
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)++;
+
+ arch_spin_lock(&max_stack_lock);
*ptr = val;
- __raw_spin_unlock(&max_stack_lock);
+ arch_spin_unlock(&max_stack_lock);
+
+ per_cpu(trace_active, cpu)--;
local_irq_restore(flags);
return count;
@@ -183,66 +195,62 @@ static const struct file_operations stack_max_size_fops = {
.open = tracing_open_generic,
.read = stack_max_size_read,
.write = stack_max_size_write,
+ .llseek = default_llseek,
};
static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+__next(struct seq_file *m, loff_t *pos)
{
- long i;
+ long n = *pos - 1;
- (*pos)++;
-
- if (v == SEQ_START_TOKEN)
- i = 0;
- else {
- i = *(long *)v;
- i++;
- }
-
- if (i >= max_stack_trace.nr_entries ||
- stack_dump_trace[i] == ULONG_MAX)
+ if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
return NULL;
- m->private = (void *)i;
-
+ m->private = (void *)n;
return &m->private;
}
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return __next(m, pos);
+}
+
static void *t_start(struct seq_file *m, loff_t *pos)
{
- void *t = SEQ_START_TOKEN;
- loff_t l = 0;
+ int cpu;
local_irq_disable();
- __raw_spin_lock(&max_stack_lock);
+
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)++;
+
+ arch_spin_lock(&max_stack_lock);
if (*pos == 0)
return SEQ_START_TOKEN;
- for (; t && l < *pos; t = t_next(m, t, &l))
- ;
-
- return t;
+ return __next(m, pos);
}
static void t_stop(struct seq_file *m, void *p)
{
- __raw_spin_unlock(&max_stack_lock);
+ int cpu;
+
+ arch_spin_unlock(&max_stack_lock);
+
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)--;
+
local_irq_enable();
}
static int trace_lookup_stack(struct seq_file *m, long i)
{
unsigned long addr = stack_dump_trace[i];
-#ifdef CONFIG_KALLSYMS
- char str[KSYM_SYMBOL_LEN];
-
- sprint_symbol(str, addr);
- return seq_printf(m, "%s\n", str);
-#else
- return seq_printf(m, "%p\n", (void*)addr);
-#endif
+ return seq_printf(m, "%pS\n", (void *)addr);
}
static void print_disabled(struct seq_file *m)
@@ -301,35 +309,32 @@ static const struct seq_operations stack_trace_seq_ops = {
static int stack_trace_open(struct inode *inode, struct file *file)
{
- int ret;
-
- ret = seq_open(file, &stack_trace_seq_ops);
-
- return ret;
+ return seq_open(file, &stack_trace_seq_ops);
}
static const struct file_operations stack_trace_fops = {
.open = stack_trace_open,
.read = seq_read,
.llseek = seq_lseek,
+ .release = seq_release,
};
int
stack_trace_sysctl(struct ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *lenp,
+ void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
mutex_lock(&stack_sysctl_mutex);
- ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write ||
- (last_stack_tracer_enabled == stack_tracer_enabled))
+ (last_stack_tracer_enabled == !!stack_tracer_enabled))
goto out;
- last_stack_tracer_enabled = stack_tracer_enabled;
+ last_stack_tracer_enabled = !!stack_tracer_enabled;
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index c00643733f4..96cffb269e7 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
#include <linux/list.h>
+#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/debugfs.h>
#include "trace_stat.h"
@@ -49,7 +50,8 @@ static struct dentry *stat_dir;
* but it will at least advance closer to the next one
* to be released.
*/
-static struct rb_node *release_next(struct rb_node *node)
+static struct rb_node *release_next(struct tracer_stat *ts,
+ struct rb_node *node)
{
struct stat_node *snode;
struct rb_node *parent = rb_parent(node);
@@ -67,26 +69,35 @@ static struct rb_node *release_next(struct rb_node *node)
parent->rb_right = NULL;
snode = container_of(node, struct stat_node, node);
+ if (ts->stat_release)
+ ts->stat_release(snode->stat);
kfree(snode);
return parent;
}
}
-static void reset_stat_session(struct stat_session *session)
+static void __reset_stat_session(struct stat_session *session)
{
struct rb_node *node = session->stat_root.rb_node;
while (node)
- node = release_next(node);
+ node = release_next(session->ts, node);
session->stat_root = RB_ROOT;
}
+static void reset_stat_session(struct stat_session *session)
+{
+ mutex_lock(&session->stat_mutex);
+ __reset_stat_session(session);
+ mutex_unlock(&session->stat_mutex);
+}
+
static void destroy_session(struct stat_session *session)
{
debugfs_remove(session->file);
- reset_stat_session(session);
+ __reset_stat_session(session);
mutex_destroy(&session->stat_mutex);
kfree(session);
}
@@ -150,7 +161,7 @@ static int stat_seq_init(struct stat_session *session)
int i;
mutex_lock(&session->stat_mutex);
- reset_stat_session(session);
+ __reset_stat_session(session);
if (!ts->stat_cmp)
ts->stat_cmp = dummy_cmp;
@@ -183,7 +194,7 @@ exit:
return ret;
exit_free_rbtree:
- reset_stat_session(session);
+ __reset_stat_session(session);
mutex_unlock(&session->stat_mutex);
return ret;
}
@@ -193,23 +204,23 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
{
struct stat_session *session = s->private;
struct rb_node *node;
+ int n = *pos;
int i;
/* Prevent from tracer switch or rbtree modification */
mutex_lock(&session->stat_mutex);
/* If we are in the beginning of the file, print the headers */
- if (!*pos && session->ts->stat_headers) {
- (*pos)++;
- return SEQ_START_TOKEN;
+ if (session->ts->stat_headers) {
+ if (n == 0)
+ return SEQ_START_TOKEN;
+ n--;
}
node = rb_first(&session->stat_root);
- for (i = 0; node && i < *pos; i++)
+ for (i = 0; node && i < n; i++)
node = rb_next(node);
- (*pos)++;
-
return node;
}
@@ -254,16 +265,21 @@ static const struct seq_operations trace_stat_seq_ops = {
static int tracing_stat_open(struct inode *inode, struct file *file)
{
int ret;
-
+ struct seq_file *m;
struct stat_session *session = inode->i_private;
+ ret = stat_seq_init(session);
+ if (ret)
+ return ret;
+
ret = seq_open(file, &trace_stat_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = session;
- ret = stat_seq_init(session);
+ if (ret) {
+ reset_stat_session(session);
+ return ret;
}
+ m = file->private_data;
+ m->private = session;
return ret;
}
@@ -274,11 +290,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)
{
struct stat_session *session = i->i_private;
- mutex_lock(&session->stat_mutex);
reset_stat_session(session);
- mutex_unlock(&session->stat_mutex);
- return 0;
+ return seq_release(i, f);
}
static const struct file_operations tracing_stat_fops = {
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd82..8f03914b9a6 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
int (*stat_cmp)(void *p1, void *p2);
/* Print a stat entry */
int (*stat_show)(struct seq_file *s, void *p);
+ /* Release an entry */
+ void (*stat_release)(void *stat);
/* Print the headers of your stat entries */
int (*stat_headers)(struct seq_file *s);
};
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac8..bac752f0cfb 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,33 +1,109 @@
#include <trace/syscall.h>
+#include <trace/events/syscalls.h>
+#include <linux/slab.h>
#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/perf_event.h>
#include <asm/syscall.h>
#include "trace_output.h"
#include "trace.h"
-/* Keep a counter of the syscall tracing users */
-static int refcount;
-
-/* Prevent from races on thread flags toggling */
static DEFINE_MUTEX(syscall_trace_lock);
+static int sys_refcount_enter;
+static int sys_refcount_exit;
+static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type);
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type);
+
+static int syscall_enter_define_fields(struct ftrace_event_call *call);
+static int syscall_exit_define_fields(struct ftrace_event_call *call);
+
+/* All syscall exit events have the same fields */
+static LIST_HEAD(syscall_exit_fields);
+
+static struct list_head *
+syscall_get_enter_fields(struct ftrace_event_call *call)
+{
+ struct syscall_metadata *entry = call->data;
+
+ return &entry->enter_fields;
+}
+
+static struct list_head *
+syscall_get_exit_fields(struct ftrace_event_call *call)
+{
+ return &syscall_exit_fields;
+}
+
+struct trace_event_functions enter_syscall_print_funcs = {
+ .trace = print_syscall_enter,
+};
-/* Option to display the parameters types */
-enum {
- TRACE_SYSCALLS_OPT_TYPES = 0x1,
+struct trace_event_functions exit_syscall_print_funcs = {
+ .trace = print_syscall_exit,
};
-static struct tracer_opt syscalls_opts[] = {
- { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
- { }
+struct ftrace_event_class event_class_syscall_enter = {
+ .system = "syscalls",
+ .reg = syscall_enter_register,
+ .define_fields = syscall_enter_define_fields,
+ .get_fields = syscall_get_enter_fields,
+ .raw_init = init_syscall_trace,
};
-static struct tracer_flags syscalls_flags = {
- .val = 0, /* By default: no parameters types */
- .opts = syscalls_opts
+struct ftrace_event_class event_class_syscall_exit = {
+ .system = "syscalls",
+ .reg = syscall_exit_register,
+ .define_fields = syscall_exit_define_fields,
+ .get_fields = syscall_get_exit_fields,
+ .raw_init = init_syscall_trace,
};
+extern unsigned long __start_syscalls_metadata[];
+extern unsigned long __stop_syscalls_metadata[];
+
+static struct syscall_metadata **syscalls_metadata;
+
+static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
+{
+ struct syscall_metadata *start;
+ struct syscall_metadata *stop;
+ char str[KSYM_SYMBOL_LEN];
+
+
+ start = (struct syscall_metadata *)__start_syscalls_metadata;
+ stop = (struct syscall_metadata *)__stop_syscalls_metadata;
+ kallsyms_lookup(syscall, NULL, NULL, NULL, str);
+
+ for ( ; start < stop; start++) {
+ /*
+ * Only compare after the "sys" prefix. Archs that use
+ * syscall wrappers may have syscalls symbols aliases prefixed
+ * with "SyS" instead of "sys", leading to an unwanted
+ * mismatch.
+ */
+ if (start->name && !strcmp(start->name + 3, str + 3))
+ return start;
+ }
+ return NULL;
+}
+
+static struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+ if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
+ return NULL;
+
+ return syscalls_metadata[nr];
+}
+
enum print_line_t
-print_syscall_enter(struct trace_iterator *iter, int flags)
+print_syscall_enter(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -35,40 +111,52 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
struct syscall_metadata *entry;
int i, ret, syscall;
- trace_assign_type(trace, ent);
-
+ trace = (typeof(trace))ent;
syscall = trace->nr;
-
entry = syscall_nr_to_meta(syscall);
+
if (!entry)
goto end;
+ if (entry->enter_event->event.type != ent->type) {
+ WARN_ON_ONCE(1);
+ goto end;
+ }
+
ret = trace_seq_printf(s, "%s(", entry->name);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
for (i = 0; i < entry->nb_args; i++) {
/* parameter types */
- if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) {
+ if (trace_flags & TRACE_ITER_VERBOSE) {
ret = trace_seq_printf(s, "%s ", entry->types[i]);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* parameter values */
- ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i],
+ ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
trace->args[i],
- i == entry->nb_args - 1 ? ")" : ",");
+ i == entry->nb_args - 1 ? "" : ", ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
+ ret = trace_seq_putc(s, ')');
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
end:
- trace_seq_printf(s, "\n");
+ ret = trace_seq_putc(s, '\n');
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
return TRACE_TYPE_HANDLED;
}
enum print_line_t
-print_syscall_exit(struct trace_iterator *iter, int flags)
+print_syscall_exit(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -77,16 +165,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
struct syscall_metadata *entry;
int ret;
- trace_assign_type(trace, ent);
-
+ trace = (typeof(trace))ent;
syscall = trace->nr;
-
entry = syscall_nr_to_meta(syscall);
+
if (!entry) {
trace_seq_printf(s, "\n");
return TRACE_TYPE_HANDLED;
}
+ if (entry->exit_event->event.type != ent->type) {
+ WARN_ON_ONCE(1);
+ return TRACE_TYPE_UNHANDLED;
+ }
+
ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
trace->ret);
if (!ret)
@@ -95,62 +187,127 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-void start_ftrace_syscalls(void)
+extern char *__bad_type_size(void);
+
+#define SYSCALL_FIELD(type, name) \
+ sizeof(type) != sizeof(trace.name) ? \
+ __bad_type_size() : \
+ #type, #name, offsetof(typeof(trace), name), \
+ sizeof(trace.name), is_signed_type(type)
+
+static
+int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ int i;
+ int pos = 0;
- mutex_lock(&syscall_trace_lock);
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+ for (i = 0; i < entry->nb_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
+ entry->args[i], sizeof(unsigned long),
+ i == entry->nb_args - 1 ? "" : ", ");
+ }
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ for (i = 0; i < entry->nb_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", ((unsigned long)(REC->%s))", entry->args[i]);
+ }
- /* Don't enable the flag on the tasks twice */
- if (++refcount != 1)
- goto unlock;
+#undef LEN_OR_ZERO
- arch_init_ftrace_syscalls();
- read_lock_irqsave(&tasklist_lock, flags);
+ /* return the length of print_fmt */
+ return pos;
+}
- do_each_thread(g, t) {
- set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
- } while_each_thread(g, t);
+static int set_syscall_print_fmt(struct ftrace_event_call *call)
+{
+ char *print_fmt;
+ int len;
+ struct syscall_metadata *entry = call->data;
- read_unlock_irqrestore(&tasklist_lock, flags);
+ if (entry->enter_event != call) {
+ call->print_fmt = "\"0x%lx\", REC->ret";
+ return 0;
+ }
-unlock:
- mutex_unlock(&syscall_trace_lock);
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_enter_print_fmt(entry, NULL, 0);
+
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_enter_print_fmt(entry, print_fmt, len + 1);
+ call->print_fmt = print_fmt;
+
+ return 0;
}
-void stop_ftrace_syscalls(void)
+static void free_syscall_print_fmt(struct ftrace_event_call *call)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ struct syscall_metadata *entry = call->data;
- mutex_lock(&syscall_trace_lock);
+ if (entry->enter_event == call)
+ kfree(call->print_fmt);
+}
- /* There are perhaps still some users */
- if (--refcount)
- goto unlock;
+static int syscall_enter_define_fields(struct ftrace_event_call *call)
+{
+ struct syscall_trace_enter trace;
+ struct syscall_metadata *meta = call->data;
+ int ret;
+ int i;
+ int offset = offsetof(typeof(trace), args);
+
+ ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < meta->nb_args; i++) {
+ ret = trace_define_field(call, meta->types[i],
+ meta->args[i], offset,
+ sizeof(unsigned long), 0,
+ FILTER_OTHER);
+ offset += sizeof(unsigned long);
+ }
- read_lock_irqsave(&tasklist_lock, flags);
+ return ret;
+}
- do_each_thread(g, t) {
- clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
- } while_each_thread(g, t);
+static int syscall_exit_define_fields(struct ftrace_event_call *call)
+{
+ struct syscall_trace_exit trace;
+ int ret;
- read_unlock_irqrestore(&tasklist_lock, flags);
+ ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+ if (ret)
+ return ret;
-unlock:
- mutex_unlock(&syscall_trace_lock);
+ ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
+ FILTER_OTHER);
+
+ return ret;
}
-void ftrace_syscall_enter(struct pt_regs *regs)
+void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
int size;
int syscall_nr;
syscall_nr = syscall_get_nr(current, regs);
+ if (syscall_nr < 0)
+ return;
+ if (!test_bit(syscall_nr, enabled_enter_syscalls))
+ return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
@@ -158,8 +315,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size,
- 0, 0);
+ event = trace_current_buffer_lock_reserve(&buffer,
+ sys_data->enter_event->event.type, size, 0, 0);
if (!event)
return;
@@ -167,25 +324,31 @@ void ftrace_syscall_enter(struct pt_regs *regs)
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
- trace_current_buffer_unlock_commit(event, 0, 0);
- trace_wake_up();
+ if (!filter_current_check_discard(buffer, sys_data->enter_event,
+ entry, event))
+ trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-void ftrace_syscall_exit(struct pt_regs *regs)
+void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
int syscall_nr;
syscall_nr = syscall_get_nr(current, regs);
+ if (syscall_nr < 0)
+ return;
+ if (!test_bit(syscall_nr, enabled_exit_syscalls))
+ return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
- event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT,
- sizeof(*entry), 0, 0);
+ event = trace_current_buffer_lock_reserve(&buffer,
+ sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
if (!event)
return;
@@ -193,58 +356,325 @@ void ftrace_syscall_exit(struct pt_regs *regs)
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
- trace_current_buffer_unlock_commit(event, 0, 0);
- trace_wake_up();
+ if (!filter_current_check_discard(buffer, sys_data->exit_event,
+ entry, event))
+ trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-static int init_syscall_tracer(struct trace_array *tr)
+int reg_event_syscall_enter(struct ftrace_event_call *call)
{
- start_ftrace_syscalls();
+ int ret = 0;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (num < 0 || num >= NR_syscalls)
+ return -ENOSYS;
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_refcount_enter)
+ ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
+ if (!ret) {
+ set_bit(num, enabled_enter_syscalls);
+ sys_refcount_enter++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+void unreg_event_syscall_enter(struct ftrace_event_call *call)
+{
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (num < 0 || num >= NR_syscalls)
+ return;
+ mutex_lock(&syscall_trace_lock);
+ sys_refcount_enter--;
+ clear_bit(num, enabled_enter_syscalls);
+ if (!sys_refcount_enter)
+ unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+int reg_event_syscall_exit(struct ftrace_event_call *call)
+{
+ int ret = 0;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (num < 0 || num >= NR_syscalls)
+ return -ENOSYS;
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_refcount_exit)
+ ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
+ if (!ret) {
+ set_bit(num, enabled_exit_syscalls);
+ sys_refcount_exit++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+void unreg_event_syscall_exit(struct ftrace_event_call *call)
+{
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (num < 0 || num >= NR_syscalls)
+ return;
+ mutex_lock(&syscall_trace_lock);
+ sys_refcount_exit--;
+ clear_bit(num, enabled_exit_syscalls);
+ if (!sys_refcount_exit)
+ unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+int init_syscall_trace(struct ftrace_event_call *call)
+{
+ int id;
+
+ if (set_syscall_print_fmt(call) < 0)
+ return -ENOMEM;
+
+ id = trace_event_raw_init(call);
+
+ if (id < 0) {
+ free_syscall_print_fmt(call);
+ return id;
+ }
+
+ return id;
+}
+
+unsigned long __init arch_syscall_addr(int nr)
+{
+ return (unsigned long)sys_call_table[nr];
+}
+
+int __init init_ftrace_syscalls(void)
+{
+ struct syscall_metadata *meta;
+ unsigned long addr;
+ int i;
+
+ syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
+ NR_syscalls, GFP_KERNEL);
+ if (!syscalls_metadata) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < NR_syscalls; i++) {
+ addr = arch_syscall_addr(i);
+ meta = find_syscall_meta(addr);
+ if (!meta)
+ continue;
+
+ meta->syscall_nr = i;
+ syscalls_metadata[i] = meta;
+ }
return 0;
}
+core_initcall(init_ftrace_syscalls);
+
+#ifdef CONFIG_PERF_EVENTS
+
+static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
+static int sys_perf_refcount_enter;
+static int sys_perf_refcount_exit;
-static void reset_syscall_tracer(struct trace_array *tr)
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
- stop_ftrace_syscalls();
- tracing_reset_online_cpus(tr);
+ struct syscall_metadata *sys_data;
+ struct syscall_trace_enter *rec;
+ struct hlist_head *head;
+ int syscall_nr;
+ int rctx;
+ int size;
+
+ syscall_nr = syscall_get_nr(current, regs);
+ if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ /* get the size after alignment with the u32 buffer size field */
+ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+ size = ALIGN(size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+ "perf buffer not large enough"))
+ return;
+
+ rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
+ sys_data->enter_event->event.type, regs, &rctx);
+ if (!rec)
+ return;
+
+ rec->nr = syscall_nr;
+ syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+ (unsigned long *)&rec->args);
+
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}
-static struct trace_event syscall_enter_event = {
- .type = TRACE_SYSCALL_ENTER,
- .trace = print_syscall_enter,
-};
+int perf_sysenter_enable(struct ftrace_event_call *call)
+{
+ int ret = 0;
+ int num;
-static struct trace_event syscall_exit_event = {
- .type = TRACE_SYSCALL_EXIT,
- .trace = print_syscall_exit,
-};
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
-static struct tracer syscall_tracer __read_mostly = {
- .name = "syscall",
- .init = init_syscall_tracer,
- .reset = reset_syscall_tracer,
- .flags = &syscalls_flags,
-};
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_perf_refcount_enter)
+ ret = register_trace_sys_enter(perf_syscall_enter, NULL);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall entry trace point");
+ } else {
+ set_bit(num, enabled_perf_enter_syscalls);
+ sys_perf_refcount_enter++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
-__init int register_ftrace_syscalls(void)
+void perf_sysenter_disable(struct ftrace_event_call *call)
{
- int ret;
+ int num;
- ret = register_ftrace_event(&syscall_enter_event);
- if (!ret) {
- printk(KERN_WARNING "event %d failed to register\n",
- syscall_enter_event.type);
- WARN_ON_ONCE(1);
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+ mutex_lock(&syscall_trace_lock);
+ sys_perf_refcount_enter--;
+ clear_bit(num, enabled_perf_enter_syscalls);
+ if (!sys_perf_refcount_enter)
+ unregister_trace_sys_enter(perf_syscall_enter, NULL);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+{
+ struct syscall_metadata *sys_data;
+ struct syscall_trace_exit *rec;
+ struct hlist_head *head;
+ int syscall_nr;
+ int rctx;
+ int size;
+
+ syscall_nr = syscall_get_nr(current, regs);
+ if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ /* We can probably do that at build time */
+ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ /*
+ * Impossible, but be paranoid with the future
+ * How to put this check outside runtime?
+ */
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+ "exit event has grown above perf buffer size"))
+ return;
+
+ rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
+ sys_data->exit_event->event.type, regs, &rctx);
+ if (!rec)
+ return;
+
+ rec->nr = syscall_nr;
+ rec->ret = syscall_get_return_value(current, regs);
+
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+}
+
+int perf_sysexit_enable(struct ftrace_event_call *call)
+{
+ int ret = 0;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_perf_refcount_exit)
+ ret = register_trace_sys_exit(perf_syscall_exit, NULL);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall exit trace point");
+ } else {
+ set_bit(num, enabled_perf_exit_syscalls);
+ sys_perf_refcount_exit++;
}
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
- ret = register_ftrace_event(&syscall_exit_event);
- if (!ret) {
- printk(KERN_WARNING "event %d failed to register\n",
- syscall_exit_event.type);
- WARN_ON_ONCE(1);
+void perf_sysexit_disable(struct ftrace_event_call *call)
+{
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+ mutex_lock(&syscall_trace_lock);
+ sys_perf_refcount_exit--;
+ clear_bit(num, enabled_perf_exit_syscalls);
+ if (!sys_perf_refcount_exit)
+ unregister_trace_sys_exit(perf_syscall_exit, NULL);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_enter(event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_enter(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysenter_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysenter_disable(event);
+ return 0;
+#endif
}
+ return 0;
+}
- return register_tracer(&syscall_tracer);
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_exit(event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_exit(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysexit_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysexit_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
}
-device_initcall(register_ftrace_syscalls);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
deleted file mode 100644
index f6693969287..00000000000
--- a/kernel/trace/trace_sysprof.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * trace stack traces
- *
- * Copyright (C) 2004-2008, Soeren Sandmann
- * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
- * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
- */
-#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
-#include <linux/hrtimer.h>
-#include <linux/uaccess.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/irq.h>
-#include <linux/fs.h>
-
-#include <asm/stacktrace.h>
-
-#include "trace.h"
-
-static struct trace_array *sysprof_trace;
-static int __read_mostly tracer_enabled;
-
-/*
- * 1 msec sample interval by default:
- */
-static unsigned long sample_period = 1000000;
-static const unsigned int sample_max_depth = 512;
-
-static DEFINE_MUTEX(sample_timer_lock);
-/*
- * Per CPU hrtimers that do the profiling:
- */
-static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
-
-struct stack_frame {
- const void __user *next_fp;
- unsigned long return_address;
-};
-
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
-{
- int ret;
-
- if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
- return 0;
-
- ret = 1;
- pagefault_disable();
- if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
- ret = 0;
- pagefault_enable();
-
- return ret;
-}
-
-struct backtrace_info {
- struct trace_array_cpu *data;
- struct trace_array *tr;
- int pos;
-};
-
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
- /* Ignore warnings */
-}
-
-static int backtrace_stack(void *data, char *name)
-{
- /* Don't bother with IRQ stacks for now */
- return -1;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
- struct backtrace_info *info = data;
-
- if (info->pos < sample_max_depth && reliable) {
- __trace_special(info->tr, info->data, 1, addr, 0);
-
- info->pos++;
- }
-}
-
-static const struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
- .stack = backtrace_stack,
- .address = backtrace_address,
-};
-
-static int
-trace_kernel(struct pt_regs *regs, struct trace_array *tr,
- struct trace_array_cpu *data)
-{
- struct backtrace_info info;
- unsigned long bp;
- char *stack;
-
- info.tr = tr;
- info.data = data;
- info.pos = 1;
-
- __trace_special(info.tr, info.data, 1, regs->ip, 0);
-
- stack = ((char *)regs + sizeof(struct pt_regs));
-#ifdef CONFIG_FRAME_POINTER
- bp = regs->bp;
-#else
- bp = 0;
-#endif
-
- dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
-
- return info.pos;
-}
-
-static void timer_notify(struct pt_regs *regs, int cpu)
-{
- struct trace_array_cpu *data;
- struct stack_frame frame;
- struct trace_array *tr;
- const void __user *fp;
- int is_user;
- int i;
-
- if (!regs)
- return;
-
- tr = sysprof_trace;
- data = tr->data[cpu];
- is_user = user_mode(regs);
-
- if (!current || current->pid == 0)
- return;
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- __trace_special(tr, data, 0, 0, current->pid);
-
- if (!is_user)
- i = trace_kernel(regs, tr, data);
- else
- i = 0;
-
- /*
- * Trace user stack if we are not a kernel thread
- */
- if (current->mm && i < sample_max_depth) {
- regs = (struct pt_regs *)current->thread.sp0 - 1;
-
- fp = (void __user *)regs->bp;
-
- __trace_special(tr, data, 2, regs->ip, 0);
-
- while (i < sample_max_depth) {
- frame.next_fp = NULL;
- frame.return_address = 0;
- if (!copy_stack_frame(fp, &frame))
- break;
- if ((unsigned long)fp < regs->sp)
- break;
-
- __trace_special(tr, data, 2, frame.return_address,
- (unsigned long)fp);
- fp = frame.next_fp;
-
- i++;
- }
-
- }
-
- /*
- * Special trace entry if we overflow the max depth:
- */
- if (i == sample_max_depth)
- __trace_special(tr, data, -1, -1, -1);
-
- __trace_special(tr, data, 3, current->pid, i);
-}
-
-static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
-{
- /* trace here */
- timer_notify(get_irq_regs(), smp_processor_id());
-
- hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
-
- return HRTIMER_RESTART;
-}
-
-static void start_stack_timer(void *unused)
-{
- struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
-
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = stack_trace_timer_fn;
-
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
-}
-
-static void start_stack_timers(void)
-{
- on_each_cpu(start_stack_timer, NULL, 1);
-}
-
-static void stop_stack_timer(int cpu)
-{
- struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
-
- hrtimer_cancel(hrtimer);
-}
-
-static void stop_stack_timers(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- stop_stack_timer(cpu);
-}
-
-static void stop_stack_trace(struct trace_array *tr)
-{
- mutex_lock(&sample_timer_lock);
- stop_stack_timers();
- tracer_enabled = 0;
- mutex_unlock(&sample_timer_lock);
-}
-
-static int stack_trace_init(struct trace_array *tr)
-{
- sysprof_trace = tr;
-
- tracing_start_cmdline_record();
-
- mutex_lock(&sample_timer_lock);
- start_stack_timers();
- tracer_enabled = 1;
- mutex_unlock(&sample_timer_lock);
- return 0;
-}
-
-static void stack_trace_reset(struct trace_array *tr)
-{
- tracing_stop_cmdline_record();
- stop_stack_trace(tr);
-}
-
-static struct tracer stack_trace __read_mostly =
-{
- .name = "sysprof",
- .init = stack_trace_init,
- .reset = stack_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_sysprof,
-#endif
-};
-
-__init static int init_stack_trace(void)
-{
- return register_tracer(&stack_trace);
-}
-device_initcall(init_stack_trace);
-
-#define MAX_LONG_DIGITS 22
-
-static ssize_t
-sysprof_sample_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[MAX_LONG_DIGITS];
- int r;
-
- r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static ssize_t
-sysprof_sample_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[MAX_LONG_DIGITS];
- unsigned long val;
-
- if (cnt > MAX_LONG_DIGITS-1)
- cnt = MAX_LONG_DIGITS-1;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- val = simple_strtoul(buf, NULL, 10);
- /*
- * Enforce a minimum sample period of 100 usecs:
- */
- if (val < 100)
- val = 100;
-
- mutex_lock(&sample_timer_lock);
- stop_stack_timers();
- sample_period = val * 1000;
- start_stack_timers();
- mutex_unlock(&sample_timer_lock);
-
- return cnt;
-}
-
-static const struct file_operations sysprof_sample_fops = {
- .read = sysprof_sample_read,
- .write = sysprof_sample_write,
-};
-
-void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
-{
-
- trace_create_file("sysprof_sample_period", 0644,
- d_tracer, NULL, &sysprof_sample_fops);
-}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce..209b379a472 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,8 @@
#include <trace/events/workqueue.h>
#include <linux/list.h>
#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/kref.h>
#include "trace_stat.h"
#include "trace.h"
@@ -16,6 +18,7 @@
/* A cpu workqueue thread */
struct cpu_workqueue_stats {
struct list_head list;
+ struct kref kref;
int cpu;
pid_t pid;
/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,9 +42,15 @@ struct workqueue_global_stats {
static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
+static void cpu_workqueue_stat_free(struct kref *kref)
+{
+ kfree(container_of(kref, struct cpu_workqueue_stats, kref));
+}
+
/* Insertion of a work */
static void
-probe_workqueue_insertion(struct task_struct *wq_thread,
+probe_workqueue_insertion(void *ignore,
+ struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -62,7 +71,8 @@ found:
/* Execution of a work */
static void
-probe_workqueue_execution(struct task_struct *wq_thread,
+probe_workqueue_execution(void *ignore,
+ struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -82,7 +92,8 @@ found:
}
/* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+static void probe_workqueue_creation(void *ignore,
+ struct task_struct *wq_thread, int cpu)
{
struct cpu_workqueue_stats *cws;
unsigned long flags;
@@ -96,8 +107,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
return;
}
INIT_LIST_HEAD(&cws->list);
+ kref_init(&cws->kref);
cws->cpu = cpu;
-
cws->pid = wq_thread->pid;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -106,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
}
/* Destruction of a cpu workqueue thread */
-static void probe_workqueue_destruction(struct task_struct *wq_thread)
+static void
+probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
{
/* Workqueue only execute on one cpu */
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -118,7 +130,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
list) {
if (node->pid == wq_thread->pid) {
list_del(&node->list);
- kfree(node);
+ kref_put(&node->kref, cpu_workqueue_stat_free);
goto found;
}
}
@@ -137,9 +149,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
- if (!list_empty(&workqueue_cpu_stat(cpu)->list))
+ if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
struct cpu_workqueue_stats, list);
+ kref_get(&ret->kref);
+ }
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -162,9 +176,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
static void *workqueue_stat_next(void *prev, int idx)
{
struct cpu_workqueue_stats *prev_cws = prev;
+ struct cpu_workqueue_stats *ret;
int cpu = prev_cws->cpu;
unsigned long flags;
- void *ret = NULL;
spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +189,14 @@ static void *workqueue_stat_next(void *prev, int idx)
return NULL;
} while (!(ret = workqueue_stat_start_cpu(cpu)));
return ret;
+ } else {
+ ret = list_entry(prev_cws->list.next,
+ struct cpu_workqueue_stats, list);
+ kref_get(&ret->kref);
}
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
- return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
- list);
+ return ret;
}
static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +220,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
return 0;
}
+static void workqueue_stat_release(void *stat)
+{
+ struct cpu_workqueue_stats *node = stat;
+
+ kref_put(&node->kref, cpu_workqueue_stat_free);
+}
+
static int workqueue_stat_headers(struct seq_file *s)
{
seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +239,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
.stat_start = workqueue_stat_start,
.stat_next = workqueue_stat_next,
.stat_show = workqueue_stat_show,
+ .stat_release = workqueue_stat_release,
.stat_headers = workqueue_stat_headers
};
@@ -238,35 +263,35 @@ int __init trace_workqueue_early_init(void)
{
int ret, cpu;
- ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+ for_each_possible_cpu(cpu) {
+ spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+ INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
+ }
+
+ ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
if (ret)
goto out;
- ret = register_trace_workqueue_execution(probe_workqueue_execution);
+ ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
if (ret)
goto no_insertion;
- ret = register_trace_workqueue_creation(probe_workqueue_creation);
+ ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
if (ret)
goto no_execution;
- ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+ ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
if (ret)
goto no_creation;
- for_each_possible_cpu(cpu) {
- spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
- INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
- }
-
return 0;
no_creation:
- unregister_trace_workqueue_creation(probe_workqueue_creation);
+ unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
no_execution:
- unregister_trace_workqueue_execution(probe_workqueue_execution);
+ unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
no_insertion:
- unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+ unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
out:
pr_warning("trace_workqueue: unable to trace workqueues\n");