aboutsummaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig64
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c275
-rw-r--r--kernel/trace/ring_buffer.c2014
-rw-r--r--kernel/trace/trace.c1845
-rw-r--r--kernel/trace/trace.h211
-rw-r--r--kernel/trace/trace_boot.c126
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_irqsoff.c19
-rw-r--r--kernel/trace/trace_mmiotrace.c116
-rw-r--r--kernel/trace/trace_nop.c64
-rw-r--r--kernel/trace/trace_sched_switch.c137
-rw-r--r--kernel/trace/trace_sched_wakeup.c148
-rw-r--r--kernel/trace/trace_selftest.c83
-rw-r--r--kernel/trace/trace_stack.c310
-rw-r--r--kernel/trace/trace_sysprof.c2
16 files changed, 4102 insertions, 1318 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 263e9e6bbd6..1cb3e1f616a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,23 +1,37 @@
#
# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
#
+
+config NOP_TRACER
+ bool
+
config HAVE_FTRACE
bool
+ select NOP_TRACER
config HAVE_DYNAMIC_FTRACE
bool
+config HAVE_FTRACE_MCOUNT_RECORD
+ bool
+
config TRACER_MAX_TRACE
bool
+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
+ select RING_BUFFER
select STACKTRACE
+ select TRACEPOINTS
config FTRACE
bool "Kernel Function Tracer"
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select FRAME_POINTER
select TRACING
select CONTEXT_SWITCH_TRACER
@@ -36,6 +50,7 @@ config IRQSOFF_TRACER
depends on TRACE_IRQFLAGS_SUPPORT
depends on GENERIC_TIME
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select TRACE_IRQFLAGS
select TRACING
select TRACER_MAX_TRACE
@@ -59,6 +74,7 @@ config PREEMPT_TRACER
depends on GENERIC_TIME
depends on PREEMPT
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select TRACING
select TRACER_MAX_TRACE
help
@@ -86,6 +102,7 @@ config SYSPROF_TRACER
config SCHED_TRACER
bool "Scheduling Latency Tracer"
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select TRACING
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
@@ -96,16 +113,56 @@ config SCHED_TRACER
config CONTEXT_SWITCH_TRACER
bool "Trace process context switches"
depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
select TRACING
select MARKERS
help
This tracer gets called from the context switch and records
all switching of tasks.
+config BOOT_TRACER
+ bool "Trace boot initcalls"
+ depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
+ select TRACING
+ help
+ This tracer helps developers to optimize boot times: it records
+ the timings of the initcalls and traces key events and the identity
+ of tasks that can cause boot delays, such as context-switches.
+
+ Its aim is to be parsed by the /scripts/bootgraph.pl tool to
+ produce pretty graphics about boot inefficiencies, giving a visual
+ representation of the delays during initcalls - but the raw
+ /debug/tracing/trace text output is readable too.
+
+ ( Note that tracing self tests can't be enabled if this tracer is
+ selected, because the self-tests are an initcall as well and that
+ would invalidate the boot trace. )
+
+config STACK_TRACER
+ bool "Trace max stack"
+ depends on HAVE_FTRACE
+ depends on DEBUG_KERNEL
+ select FTRACE
+ select STACKTRACE
+ help
+ This special tracer records the maximum stack footprint of the
+ kernel and displays it in debugfs/tracing/stack_trace.
+
+ This tracer works by hooking into every function call that the
+ kernel executes, and keeping a maximum stack depth value and
+ stack-trace saved. Because this logic has to execute in every
+ kernel function, all the time, this option can slow down the
+ kernel measurably and is generally intended for kernel
+ developers only.
+
+ Say N if unsure.
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FTRACE
depends on HAVE_DYNAMIC_FTRACE
+ depends on DEBUG_KERNEL
default y
help
This option will modify all the calls to ftrace dynamically
@@ -121,12 +178,17 @@ config DYNAMIC_FTRACE
were made. If so, it runs stop_machine (stops all CPUS)
and modifies the code to jump over the call to ftrace.
+config FTRACE_MCOUNT_RECORD
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_FTRACE_MCOUNT_RECORD
+
config FTRACE_SELFTEST
bool
config FTRACE_STARTUP_TEST
bool "Perform a startup test on ftrace"
- depends on TRACING
+ depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
select FTRACE_SELFTEST
help
This option performs a series of startup tests on ftrace. On bootup
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 71d17de1728..a85dfba88ba 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif
obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
@@ -19,6 +20,9 @@ obj-$(CONFIG_FTRACE) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_NOP_TRACER) += trace_nop.o
+obj-$(CONFIG_STACK_TRACER) += trace_stack.o
obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
+obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f6e3af31b40..4dda4f60a2a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -81,7 +81,7 @@ void clear_ftrace_function(void)
static int __register_ftrace_function(struct ftrace_ops *ops)
{
- /* Should never be called by interrupts */
+ /* should not be called from interrupt context */
spin_lock(&ftrace_lock);
ops->next = ftrace_list;
@@ -115,6 +115,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
struct ftrace_ops **p;
int ret = 0;
+ /* should not be called from interrupt context */
spin_lock(&ftrace_lock);
/*
@@ -153,6 +154,30 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
+#ifndef CONFIG_FTRACE_MCOUNT_RECORD
+/*
+ * The hash lock is only needed when the recording of the mcount
+ * callers are dynamic. That is, by the caller themselves and
+ * not recorded via the compilation.
+ */
+static DEFINE_SPINLOCK(ftrace_hash_lock);
+#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags)
+#define ftrace_hash_unlock(flags) \
+ spin_unlock_irqrestore(&ftrace_hash_lock, flags)
+#else
+/* This is protected via the ftrace_lock with MCOUNT_RECORD. */
+#define ftrace_hash_lock(flags) do { (void)(flags); } while (0)
+#define ftrace_hash_unlock(flags) do { } while(0)
+#endif
+
+/*
+ * Since MCOUNT_ADDR may point to mcount itself, we do not want
+ * to get it confused by reading a reference in the code as we
+ * are parsing on objcopy output of text. Use a variable for
+ * it instead.
+ */
+static unsigned long mcount_addr = MCOUNT_ADDR;
+
static struct task_struct *ftraced_task;
enum {
@@ -171,7 +196,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
-static DEFINE_SPINLOCK(ftrace_shutdown_lock);
static DEFINE_MUTEX(ftraced_lock);
static DEFINE_MUTEX(ftrace_regex_lock);
@@ -294,13 +318,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node)
static void ftrace_free_rec(struct dyn_ftrace *rec)
{
- /* no locking, only called from kstop_machine */
-
rec->ip = (unsigned long)ftrace_free_records;
ftrace_free_records = rec;
rec->flags |= FTRACE_FL_FREE;
}
+void ftrace_release(void *start, unsigned long size)
+{
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+ unsigned long s = (unsigned long)start;
+ unsigned long e = s + size;
+ int i;
+
+ if (ftrace_disabled || !start)
+ return;
+
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+
+ for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ for (i = 0; i < pg->index; i++) {
+ rec = &pg->records[i];
+
+ if ((rec->ip >= s) && (rec->ip < e))
+ ftrace_free_rec(rec);
+ }
+ }
+ spin_unlock(&ftrace_lock);
+
+}
+
static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
{
struct dyn_ftrace *rec;
@@ -338,7 +386,6 @@ ftrace_record_ip(unsigned long ip)
unsigned long flags;
unsigned long key;
int resched;
- int atomic;
int cpu;
if (!ftrace_enabled || ftrace_disabled)
@@ -368,9 +415,7 @@ ftrace_record_ip(unsigned long ip)
if (ftrace_ip_in_hash(ip, key))
goto out;
- atomic = irqs_disabled();
-
- spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+ ftrace_hash_lock(flags);
/* This ip may have hit the hash before the lock */
if (ftrace_ip_in_hash(ip, key))
@@ -387,7 +432,7 @@ ftrace_record_ip(unsigned long ip)
ftraced_trigger = 1;
out_unlock:
- spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ ftrace_hash_unlock(flags);
out:
per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
@@ -531,6 +576,16 @@ static void ftrace_shutdown_replenish(void)
ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
}
+static void print_ip_ins(const char *fmt, unsigned char *p)
+{
+ int i;
+
+ printk(KERN_CONT "%s", fmt);
+
+ for (i = 0; i < MCOUNT_INSN_SIZE; i++)
+ printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+}
+
static int
ftrace_code_disable(struct dyn_ftrace *rec)
{
@@ -541,10 +596,27 @@ ftrace_code_disable(struct dyn_ftrace *rec)
ip = rec->ip;
nop = ftrace_nop_replace();
- call = ftrace_call_replace(ip, MCOUNT_ADDR);
+ call = ftrace_call_replace(ip, mcount_addr);
failed = ftrace_modify_code(ip, call, nop);
if (failed) {
+ switch (failed) {
+ case 1:
+ WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on modifying ");
+ print_ip_sym(ip);
+ break;
+ case 2:
+ WARN_ON_ONCE(1);
+ pr_info("ftrace failed to modify ");
+ print_ip_sym(ip);
+ print_ip_ins(" expected: ", call);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
+ print_ip_ins(" replace: ", nop);
+ printk(KERN_CONT "\n");
+ break;
+ }
+
rec->flags |= FTRACE_FL_FAILED;
return 0;
}
@@ -792,47 +864,7 @@ static int ftrace_update_code(void)
return 1;
}
-static int ftraced(void *ignore)
-{
- unsigned long usecs;
-
- while (!kthread_should_stop()) {
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- /* check once a second */
- schedule_timeout(HZ);
-
- if (unlikely(ftrace_disabled))
- continue;
-
- mutex_lock(&ftrace_sysctl_lock);
- mutex_lock(&ftraced_lock);
- if (!ftraced_suspend && !ftraced_stop &&
- ftrace_update_code()) {
- usecs = nsecs_to_usecs(ftrace_update_time);
- if (ftrace_update_tot_cnt > 100000) {
- ftrace_update_tot_cnt = 0;
- pr_info("hm, dftrace overflow: %lu change%s"
- " (%lu total) in %lu usec%s\n",
- ftrace_update_cnt,
- ftrace_update_cnt != 1 ? "s" : "",
- ftrace_update_tot_cnt,
- usecs, usecs != 1 ? "s" : "");
- ftrace_disabled = 1;
- WARN_ON_ONCE(1);
- }
- }
- mutex_unlock(&ftraced_lock);
- mutex_unlock(&ftrace_sysctl_lock);
-
- ftrace_shutdown_replenish();
- }
- __set_current_state(TASK_RUNNING);
- return 0;
-}
-
-static int __init ftrace_dyn_table_alloc(void)
+static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
{
struct ftrace_page *pg;
int cnt;
@@ -859,7 +891,9 @@ static int __init ftrace_dyn_table_alloc(void)
pg = ftrace_pages = ftrace_pages_start;
- cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+ cnt = num_to_init / ENTRIES_PER_PAGE;
+ pr_info("ftrace: allocating %ld hash entries in %d pages\n",
+ num_to_init, cnt);
for (i = 0; i < cnt; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -901,6 +935,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
(*pos)++;
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
retry:
if (iter->idx >= iter->pg->index) {
if (iter->pg->next) {
@@ -910,15 +946,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
}
} else {
rec = &iter->pg->records[iter->idx++];
- if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
+ if ((rec->flags & FTRACE_FL_FREE) ||
+
+ (!(iter->flags & FTRACE_ITER_FAILURES) &&
(rec->flags & FTRACE_FL_FAILED)) ||
((iter->flags & FTRACE_ITER_FAILURES) &&
- (!(rec->flags & FTRACE_FL_FAILED) ||
- (rec->flags & FTRACE_FL_FREE))) ||
-
- ((iter->flags & FTRACE_ITER_FILTER) &&
- !(rec->flags & FTRACE_FL_FILTER)) ||
+ !(rec->flags & FTRACE_FL_FAILED)) ||
((iter->flags & FTRACE_ITER_NOTRACE) &&
!(rec->flags & FTRACE_FL_NOTRACE))) {
@@ -926,6 +960,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
goto retry;
}
}
+ spin_unlock(&ftrace_lock);
iter->pos = *pos;
@@ -1039,8 +1074,8 @@ static void ftrace_filter_reset(int enable)
unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
unsigned i;
- /* keep kstop machine from running */
- preempt_disable();
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
if (enable)
ftrace_filtered = 0;
pg = ftrace_pages_start;
@@ -1053,7 +1088,7 @@ static void ftrace_filter_reset(int enable)
}
pg = pg->next;
}
- preempt_enable();
+ spin_unlock(&ftrace_lock);
}
static int
@@ -1165,8 +1200,8 @@ ftrace_match(unsigned char *buff, int len, int enable)
}
}
- /* keep kstop machine from running */
- preempt_disable();
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
if (enable)
ftrace_filtered = 1;
pg = ftrace_pages_start;
@@ -1203,7 +1238,7 @@ ftrace_match(unsigned char *buff, int len, int enable)
}
pg = pg->next;
}
- preempt_enable();
+ spin_unlock(&ftrace_lock);
}
static ssize_t
@@ -1556,6 +1591,114 @@ static __init int ftrace_init_debugfs(void)
fs_initcall(ftrace_init_debugfs);
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+static int ftrace_convert_nops(unsigned long *start,
+ unsigned long *end)
+{
+ unsigned long *p;
+ unsigned long addr;
+ unsigned long flags;
+
+ p = start;
+ while (p < end) {
+ addr = ftrace_call_adjust(*p++);
+ /* should not be called from interrupt context */
+ spin_lock(&ftrace_lock);
+ ftrace_record_ip(addr);
+ spin_unlock(&ftrace_lock);
+ ftrace_shutdown_replenish();
+ }
+
+ /* p is ignored */
+ local_irq_save(flags);
+ __ftrace_update_code(p);
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+void ftrace_init_module(unsigned long *start, unsigned long *end)
+{
+ if (ftrace_disabled || start == end)
+ return;
+ ftrace_convert_nops(start, end);
+}
+
+extern unsigned long __start_mcount_loc[];
+extern unsigned long __stop_mcount_loc[];
+
+void __init ftrace_init(void)
+{
+ unsigned long count, addr, flags;
+ int ret;
+
+ /* Keep the ftrace pointer to the stub */
+ addr = (unsigned long)ftrace_stub;
+
+ local_irq_save(flags);
+ ftrace_dyn_arch_init(&addr);
+ local_irq_restore(flags);
+
+ /* ftrace_dyn_arch_init places the return code in addr */
+ if (addr)
+ goto failed;
+
+ count = __stop_mcount_loc - __start_mcount_loc;
+
+ ret = ftrace_dyn_table_alloc(count);
+ if (ret)
+ goto failed;
+
+ last_ftrace_enabled = ftrace_enabled = 1;
+
+ ret = ftrace_convert_nops(__start_mcount_loc,
+ __stop_mcount_loc);
+
+ return;
+ failed:
+ ftrace_disabled = 1;
+}
+#else /* CONFIG_FTRACE_MCOUNT_RECORD */
+static int ftraced(void *ignore)
+{
+ unsigned long usecs;
+
+ while (!kthread_should_stop()) {
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* check once a second */
+ schedule_timeout(HZ);
+
+ if (unlikely(ftrace_disabled))
+ continue;
+
+ mutex_lock(&ftrace_sysctl_lock);
+ mutex_lock(&ftraced_lock);
+ if (!ftraced_suspend && !ftraced_stop &&
+ ftrace_update_code()) {
+ usecs = nsecs_to_usecs(ftrace_update_time);
+ if (ftrace_update_tot_cnt > 100000) {
+ ftrace_update_tot_cnt = 0;
+ pr_info("hm, dftrace overflow: %lu change%s"
+ " (%lu total) in %lu usec%s\n",
+ ftrace_update_cnt,
+ ftrace_update_cnt != 1 ? "s" : "",
+ ftrace_update_tot_cnt,
+ usecs, usecs != 1 ? "s" : "");
+ ftrace_disabled = 1;
+ WARN_ON_ONCE(1);
+ }
+ }
+ mutex_unlock(&ftraced_lock);
+ mutex_unlock(&ftrace_sysctl_lock);
+
+ ftrace_shutdown_replenish();
+ }
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
static int __init ftrace_dynamic_init(void)
{
struct task_struct *p;
@@ -1572,7 +1715,7 @@ static int __init ftrace_dynamic_init(void)
goto failed;
}
- ret = ftrace_dyn_table_alloc();
+ ret = ftrace_dyn_table_alloc(NR_TO_INIT);
if (ret)
goto failed;
@@ -1593,6 +1736,8 @@ static int __init ftrace_dynamic_init(void)
}
core_initcall(ftrace_dynamic_init);
+#endif /* CONFIG_FTRACE_MCOUNT_RECORD */
+
#else
# define ftrace_startup() do { } while (0)
# define ftrace_shutdown() do { } while (0)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
new file mode 100644
index 00000000000..94af1fe56bb
--- /dev/null
+++ b/kernel/trace/ring_buffer.c
@@ -0,0 +1,2014 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h> /* used for sched_clock() (for now) */
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return sched_clock() << DEBUG_SHIFT;
+}
+
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+ /* Just stupid testing the normalize function and deltas */
+ *ts >>= DEBUG_SHIFT;
+}
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA 28
+
+enum {
+ RB_LEN_TIME_EXTEND = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/* inline for ring buffer fast paths */
+static inline unsigned
+rb_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ return RB_LEN_TIME_EXTEND;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RINGBUF_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+unsigned ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ return rb_event_length(event);
+}
+
+/* inline for ring buffer fast paths */
+static inline void *
+rb_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RINGBUF_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+void *ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ return rb_event_data(event);
+}
+
+#define for_each_buffer_cpu(buffer, cpu) \
+ for_each_cpu_mask(cpu, buffer->cpumask)
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * This hack stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ */
+struct buffer_page {
+ u64 time_stamp; /* page time stamp */
+ local_t write; /* index for next write */
+ local_t commit; /* write commited index */
+ unsigned read; /* index for next read */
+ struct list_head list; /* list of free pages */
+ void *page; /* Actual data page */
+};
+
+/*
+ * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
+ * this issue out.
+ */
+static inline void free_buffer_page(struct buffer_page *bpage)
+{
+ if (bpage->page)
+ __free_page(bpage->page);
+ kfree(bpage);
+}
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int test_time_stamp(u64 delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+#define BUF_PAGE_SIZE PAGE_SIZE
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ struct buffer_page *head_page; /* read from head */
+ struct buffer_page *tail_page; /* write to tail */
+ struct buffer_page *commit_page; /* commited pages */
+ struct buffer_page *reader_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
+};
+
+#define RB_WARN_ON(buffer, cond) \
+ do { \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+#define RB_WARN_ON_RET(buffer, cond) \
+ do { \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ } \
+ } while (0)
+
+#define RB_WARN_ON_ONCE(buffer, cond) \
+ do { \
+ static int once; \
+ if (unlikely(cond) && !once) { \
+ once++; \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+/**
+ * check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+
+ RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
+ RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
+
+ list_for_each_entry_safe(page, tmp, head, list) {
+ RB_WARN_ON_RET(cpu_buffer,
+ page->list.next->prev != &page->list);
+ RB_WARN_ON_RET(cpu_buffer,
+ page->list.prev->next != &page->list);
+ }
+
+ return 0;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+ unsigned long addr;
+ LIST_HEAD(pages);
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
+
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page->page = (void *)addr;
+ }
+
+ list_splice(&pages, head);
+
+ rb_check_pages(cpu_buffer);
+
+ return 0;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, list) {
+ list_del_init(&page->list);
+ free_buffer_page(page);
+ }
+ return -ENOMEM;
+}
+
+static struct ring_buffer_per_cpu *
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *page;
+ unsigned long addr;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ spin_lock_init(&cpu_buffer->lock);
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto fail_free_buffer;
+
+ cpu_buffer->reader_page = page;
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto fail_free_reader;
+ page->page = (void *)addr;
+
+ INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_reader;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ return cpu_buffer;
+
+ fail_free_reader:
+ free_buffer_page(cpu_buffer->reader_page);
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+
+ list_del_init(&cpu_buffer->reader_page->list);
+ free_buffer_page(cpu_buffer->reader_page);
+
+ list_for_each_entry_safe(page, tmp, head, list) {
+ list_del_init(&page->list);
+ free_buffer_page(page);
+ }
+ kfree(cpu_buffer);
+}
+
+/*
+ * Causes compile errors if the struct buffer_page gets bigger
+ * than the struct page.
+ */
+extern int ring_buffer_page_too_big(void);
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int bsize;
+ int cpu;
+
+ /* Paranoid! Optimizes out when all is well */
+ if (sizeof(struct buffer_page) > sizeof(struct page))
+ ring_buffer_page_too_big();
+
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ buffer->cpumask = cpu_possible_map;
+ buffer->cpus = nr_cpu_ids;
+
+ bsize = sizeof(void *) * nr_cpu_ids;
+ buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer->buffers)
+ goto fail_free_buffer;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_buffer_cpu(buffer, cpu) {
+ if (buffer->buffers[cpu])
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+ kfree(buffer->buffers);
+
+ fail_free_buffer:
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct buffer_page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct buffer_page, list);
+ list_del_init(&page->list);
+ free_buffer_page(page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
+
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct buffer_page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(pages));
+ p = pages->next;
+ page = list_entry(p, struct buffer_page, list);
+ list_del_init(&page->list);
+ list_add_tail(&page->list, &cpu_buffer->pages);
+ }
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns -1 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct buffer_page *page, *tmp;
+ unsigned long buffer_size;
+ unsigned long addr;
+ LIST_HEAD(pages);
+ int i, cpu;
+
+ size = DIV_ROUND_UP(size, BUF_P