diff options
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/Kconfig | 64 | ||||
-rw-r--r-- | kernel/trace/Makefile | 4 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 275 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 2014 | ||||
-rw-r--r-- | kernel/trace/trace.c | 1845 | ||||
-rw-r--r-- | kernel/trace/trace.h | 211 | ||||
-rw-r--r-- | kernel/trace/trace_boot.c | 126 | ||||
-rw-r--r-- | kernel/trace/trace_functions.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_irqsoff.c | 19 | ||||
-rw-r--r-- | kernel/trace/trace_mmiotrace.c | 116 | ||||
-rw-r--r-- | kernel/trace/trace_nop.c | 64 | ||||
-rw-r--r-- | kernel/trace/trace_sched_switch.c | 137 | ||||
-rw-r--r-- | kernel/trace/trace_sched_wakeup.c | 148 | ||||
-rw-r--r-- | kernel/trace/trace_selftest.c | 83 | ||||
-rw-r--r-- | kernel/trace/trace_stack.c | 310 | ||||
-rw-r--r-- | kernel/trace/trace_sysprof.c | 2 |
16 files changed, 4102 insertions, 1318 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 263e9e6bbd6..1cb3e1f616a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,23 +1,37 @@ # # Architectures that offer an FTRACE implementation should select HAVE_FTRACE: # + +config NOP_TRACER + bool + config HAVE_FTRACE bool + select NOP_TRACER config HAVE_DYNAMIC_FTRACE bool +config HAVE_FTRACE_MCOUNT_RECORD + bool + config TRACER_MAX_TRACE bool +config RING_BUFFER + bool + config TRACING bool select DEBUG_FS + select RING_BUFFER select STACKTRACE + select TRACEPOINTS config FTRACE bool "Kernel Function Tracer" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select FRAME_POINTER select TRACING select CONTEXT_SWITCH_TRACER @@ -36,6 +50,7 @@ config IRQSOFF_TRACER depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -59,6 +74,7 @@ config PREEMPT_TRACER depends on GENERIC_TIME depends on PREEMPT depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE help @@ -86,6 +102,7 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -96,16 +113,56 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select MARKERS help This tracer gets called from the context switch and records all switching of tasks. +config BOOT_TRACER + bool "Trace boot initcalls" + depends on HAVE_FTRACE + depends on DEBUG_KERNEL + select TRACING + help + This tracer helps developers to optimize boot times: it records + the timings of the initcalls and traces key events and the identity + of tasks that can cause boot delays, such as context-switches. + + Its aim is to be parsed by the /scripts/bootgraph.pl tool to + produce pretty graphics about boot inefficiencies, giving a visual + representation of the delays during initcalls - but the raw + /debug/tracing/trace text output is readable too. + + ( Note that tracing self tests can't be enabled if this tracer is + selected, because the self-tests are an initcall as well and that + would invalidate the boot trace. ) + +config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FTRACE + depends on DEBUG_KERNEL + select FTRACE + select STACKTRACE + help + This special tracer records the maximum stack footprint of the + kernel and displays it in debugfs/tracing/stack_trace. + + This tracer works by hooking into every function call that the + kernel executes, and keeping a maximum stack depth value and + stack-trace saved. Because this logic has to execute in every + kernel function, all the time, this option can slow down the + kernel measurably and is generally intended for kernel + developers only. + + Say N if unsure. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FTRACE depends on HAVE_DYNAMIC_FTRACE + depends on DEBUG_KERNEL default y help This option will modify all the calls to ftrace dynamically @@ -121,12 +178,17 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_FTRACE_MCOUNT_RECORD + config FTRACE_SELFTEST bool config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING + depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 71d17de1728..a85dfba88ba 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o endif obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o @@ -19,6 +20,9 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_NOP_TRACER) += trace_nop.o +obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_BOOT_TRACER) += trace_boot.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6e3af31b40..4dda4f60a2a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -81,7 +81,7 @@ void clear_ftrace_function(void) static int __register_ftrace_function(struct ftrace_ops *ops) { - /* Should never be called by interrupts */ + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); ops->next = ftrace_list; @@ -115,6 +115,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) struct ftrace_ops **p; int ret = 0; + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); /* @@ -153,6 +154,30 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +/* + * The hash lock is only needed when the recording of the mcount + * callers are dynamic. That is, by the caller themselves and + * not recorded via the compilation. + */ +static DEFINE_SPINLOCK(ftrace_hash_lock); +#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) +#define ftrace_hash_unlock(flags) \ + spin_unlock_irqrestore(&ftrace_hash_lock, flags) +#else +/* This is protected via the ftrace_lock with MCOUNT_RECORD. */ +#define ftrace_hash_lock(flags) do { (void)(flags); } while (0) +#define ftrace_hash_unlock(flags) do { } while(0) +#endif + +/* + * Since MCOUNT_ADDR may point to mcount itself, we do not want + * to get it confused by reading a reference in the code as we + * are parsing on objcopy output of text. Use a variable for + * it instead. + */ +static unsigned long mcount_addr = MCOUNT_ADDR; + static struct task_struct *ftraced_task; enum { @@ -171,7 +196,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); -static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); static DEFINE_MUTEX(ftrace_regex_lock); @@ -294,13 +318,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node) static void ftrace_free_rec(struct dyn_ftrace *rec) { - /* no locking, only called from kstop_machine */ - rec->ip = (unsigned long)ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } +void ftrace_release(void *start, unsigned long size) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = s + size; + int i; + + if (ftrace_disabled || !start) + return; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if ((rec->ip >= s) && (rec->ip < e)) + ftrace_free_rec(rec); + } + } + spin_unlock(&ftrace_lock); + +} + static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -338,7 +386,6 @@ ftrace_record_ip(unsigned long ip) unsigned long flags; unsigned long key; int resched; - int atomic; int cpu; if (!ftrace_enabled || ftrace_disabled) @@ -368,9 +415,7 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out; - atomic = irqs_disabled(); - - spin_lock_irqsave(&ftrace_shutdown_lock, flags); + ftrace_hash_lock(flags); /* This ip may have hit the hash before the lock */ if (ftrace_ip_in_hash(ip, key)) @@ -387,7 +432,7 @@ ftrace_record_ip(unsigned long ip) ftraced_trigger = 1; out_unlock: - spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + ftrace_hash_unlock(flags); out: per_cpu(ftrace_shutdown_disable_cpu, cpu)--; @@ -531,6 +576,16 @@ static void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + static int ftrace_code_disable(struct dyn_ftrace *rec) { @@ -541,10 +596,27 @@ ftrace_code_disable(struct dyn_ftrace *rec) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, MCOUNT_ADDR); + call = ftrace_call_replace(ip, mcount_addr); failed = ftrace_modify_code(ip, call, nop); if (failed) { + switch (failed) { + case 1: + WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case 2: + WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" expected: ", call); + print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" replace: ", nop); + printk(KERN_CONT "\n"); + break; + } + rec->flags |= FTRACE_FL_FAILED; return 0; } @@ -792,47 +864,7 @@ static int ftrace_update_code(void) return 1; } -static int ftraced(void *ignore) -{ - unsigned long usecs; - - while (!kthread_should_stop()) { - - set_current_state(TASK_INTERRUPTIBLE); - - /* check once a second */ - schedule_timeout(HZ); - - if (unlikely(ftrace_disabled)) - continue; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (!ftraced_suspend && !ftraced_stop && - ftrace_update_code()) { - usecs = nsecs_to_usecs(ftrace_update_time); - if (ftrace_update_tot_cnt > 100000) { - ftrace_update_tot_cnt = 0; - pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", - ftrace_update_cnt, - ftrace_update_cnt != 1 ? "s" : "", - ftrace_update_tot_cnt, - usecs, usecs != 1 ? "s" : ""); - ftrace_disabled = 1; - WARN_ON_ONCE(1); - } - } - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - ftrace_shutdown_replenish(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -static int __init ftrace_dyn_table_alloc(void) +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) { struct ftrace_page *pg; int cnt; @@ -859,7 +891,9 @@ static int __init ftrace_dyn_table_alloc(void) pg = ftrace_pages = ftrace_pages_start; - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld hash entries in %d pages\n", + num_to_init, cnt); for (i = 0; i < cnt; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); @@ -901,6 +935,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); retry: if (iter->idx >= iter->pg->index) { if (iter->pg->next) { @@ -910,15 +946,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((!(iter->flags & FTRACE_ITER_FAILURES) && + if ((rec->flags & FTRACE_FL_FREE) || + + (!(iter->flags & FTRACE_ITER_FAILURES) && (rec->flags & FTRACE_FL_FAILED)) || ((iter->flags & FTRACE_ITER_FAILURES) && - (!(rec->flags & FTRACE_FL_FAILED) || - (rec->flags & FTRACE_FL_FREE))) || - - ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER)) || + !(rec->flags & FTRACE_FL_FAILED)) || ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { @@ -926,6 +960,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) goto retry; } } + spin_unlock(&ftrace_lock); iter->pos = *pos; @@ -1039,8 +1074,8 @@ static void ftrace_filter_reset(int enable) unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i; - /* keep kstop machine from running */ - preempt_disable(); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 0; pg = ftrace_pages_start; @@ -1053,7 +1088,7 @@ static void ftrace_filter_reset(int enable) } pg = pg->next; } - preempt_enable(); + spin_unlock(&ftrace_lock); } static int @@ -1165,8 +1200,8 @@ ftrace_match(unsigned char *buff, int len, int enable) } } - /* keep kstop machine from running */ - preempt_disable(); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 1; pg = ftrace_pages_start; @@ -1203,7 +1238,7 @@ ftrace_match(unsigned char *buff, int len, int enable) } pg = pg->next; } - preempt_enable(); + spin_unlock(&ftrace_lock); } static ssize_t @@ -1556,6 +1591,114 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +static int ftrace_convert_nops(unsigned long *start, + unsigned long *end) +{ + unsigned long *p; + unsigned long addr; + unsigned long flags; + + p = start; + while (p < end) { + addr = ftrace_call_adjust(*p++); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + ftrace_record_ip(addr); + spin_unlock(&ftrace_lock); + ftrace_shutdown_replenish(); + } + + /* p is ignored */ + local_irq_save(flags); + __ftrace_update_code(p); + local_irq_restore(flags); + + return 0; +} + +void ftrace_init_module(unsigned long *start, unsigned long *end) +{ + if (ftrace_disabled || start == end) + return; + ftrace_convert_nops(start, end); +} + +extern unsigned long __start_mcount_loc[]; +extern unsigned long __stop_mcount_loc[]; + +void __init ftrace_init(void) +{ + unsigned long count, addr, flags; + int ret; + + /* Keep the ftrace pointer to the stub */ + addr = (unsigned long)ftrace_stub; + + local_irq_save(flags); + ftrace_dyn_arch_init(&addr); + local_irq_restore(flags); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + goto failed; + + count = __stop_mcount_loc - __start_mcount_loc; + + ret = ftrace_dyn_table_alloc(count); + if (ret) + goto failed; + + last_ftrace_enabled = ftrace_enabled = 1; + + ret = ftrace_convert_nops(__start_mcount_loc, + __stop_mcount_loc); + + return; + failed: + ftrace_disabled = 1; +} +#else /* CONFIG_FTRACE_MCOUNT_RECORD */ +static int ftraced(void *ignore) +{ + unsigned long usecs; + + while (!kthread_should_stop()) { + + set_current_state(TASK_INTERRUPTIBLE); + + /* check once a second */ + schedule_timeout(HZ); + + if (unlikely(ftrace_disabled)) + continue; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? "s" : ""); + ftrace_disabled = 1; + WARN_ON_ONCE(1); + } + } + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + ftrace_shutdown_replenish(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + static int __init ftrace_dynamic_init(void) { struct task_struct *p; @@ -1572,7 +1715,7 @@ static int __init ftrace_dynamic_init(void) goto failed; } - ret = ftrace_dyn_table_alloc(); + ret = ftrace_dyn_table_alloc(NR_TO_INIT); if (ret) goto failed; @@ -1593,6 +1736,8 @@ static int __init ftrace_dynamic_init(void) } core_initcall(ftrace_dynamic_init); +#endif /* CONFIG_FTRACE_MCOUNT_RECORD */ + #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c new file mode 100644 index 00000000000..94af1fe56bb --- /dev/null +++ b/kernel/trace/ring_buffer.c @@ -0,0 +1,2014 @@ +/* + * Generic ring buffer + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/spinlock.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/sched.h> /* used for sched_clock() (for now) */ +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/list.h> +#include <linux/fs.h> + +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +/* FIXME!!! */ +u64 ring_buffer_time_stamp(int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return sched_clock() << DEBUG_SHIFT; +} + +void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} + +#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +#define RB_ALIGNMENT_SHIFT 2 +#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) +#define RB_MAX_SMALL_DATA 28 + +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 16, +}; + +/* inline for ring buffer fast paths */ +static inline unsigned +rb_event_length(struct ring_buffer_event *event) +{ + unsigned length; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + /* undefined */ + return -1; + + case RINGBUF_TYPE_TIME_EXTEND: + return RB_LEN_TIME_EXTEND; + + case RINGBUF_TYPE_TIME_STAMP: + return RB_LEN_TIME_STAMP; + + case RINGBUF_TYPE_DATA: + if (event->len) + length = event->len << RB_ALIGNMENT_SHIFT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; + default: + BUG(); + } + /* not hit */ + return 0; +} + +/** + * ring_buffer_event_length - return the length of the event + * @event: the event to get the length of + */ +unsigned ring_buffer_event_length(struct ring_buffer_event *event) +{ + return rb_event_length(event); +} + +/* inline for ring buffer fast paths */ +static inline void * +rb_event_data(struct ring_buffer_event *event) +{ + BUG_ON(event->type != RINGBUF_TYPE_DATA); + /* If length is in len field, then array[0] has the data */ + if (event->len) + return (void *)&event->array[0]; + /* Otherwise length is in array[0] and array[1] has the data */ + return (void *)&event->array[1]; +} + +/** + * ring_buffer_event_data - return the data of the event + * @event: the event to get the data from + */ +void *ring_buffer_event_data(struct ring_buffer_event *event) +{ + return rb_event_data(event); +} + +#define for_each_buffer_cpu(buffer, cpu) \ + for_each_cpu_mask(cpu, buffer->cpumask) + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * This hack stolen from mm/slob.c. + * We can store per page timing information in the page frame of the page. + * Thanks to Peter Zijlstra for suggesting this idea. + */ +struct buffer_page { + u64 time_stamp; /* page time stamp */ + local_t write; /* index for next write */ + local_t commit; /* write commited index */ + unsigned read; /* index for next read */ + struct list_head list; /* list of free pages */ + void *page; /* Actual data page */ +}; + +/* + * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing + * this issue out. + */ +static inline void free_buffer_page(struct buffer_page *bpage) +{ + if (bpage->page) + __free_page(bpage->page); + kfree(bpage); +} + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline int test_time_stamp(u64 delta) +{ + if (delta & TS_DELTA_TEST) + return 1; + return 0; +} + +#define BUF_PAGE_SIZE PAGE_SIZE + +/* + * head_page == tail_page && head == tail then buffer is empty. + */ +struct ring_buffer_per_cpu { + int cpu; + struct ring_buffer *buffer; + spinlock_t lock; + struct lock_class_key lock_key; + struct list_head pages; + struct buffer_page *head_page; /* read from head */ + struct buffer_page *tail_page; /* write to tail */ + struct buffer_page *commit_page; /* commited pages */ + struct buffer_page *reader_page; + unsigned long overrun; + unsigned long entries; + u64 write_stamp; + u64 read_stamp; + atomic_t record_disabled; +}; + +struct ring_buffer { + unsigned long size; + unsigned pages; + unsigned flags; + int cpus; + cpumask_t cpumask; + atomic_t record_disabled; + + struct mutex mutex; + + struct ring_buffer_per_cpu **buffers; +}; + +struct ring_buffer_iter { + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long head; + struct buffer_page *head_page; + u64 read_stamp; +}; + +#define RB_WARN_ON(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + } \ + } while (0) + +#define RB_WARN_ON_RET(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return -1; \ + } \ + } while (0) + +#define RB_WARN_ON_ONCE(buffer, cond) \ + do { \ + static int once; \ + if (unlikely(cond) && !once) { \ + once++; \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + } \ + } while (0) + +/** + * check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * + * As a safty measure we check to make sure the data pages have not + * been corrupted. + */ +static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + + RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); + RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); + + list_for_each_entry_safe(page, tmp, head, list) { + RB_WARN_ON_RET(cpu_buffer, + page->list.next->prev != &page->list); + RB_WARN_ON_RET(cpu_buffer, + page->list.prev->next != &page->list); + } + + return 0; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + unsigned long addr; + LIST_HEAD(pages); + unsigned i; + + for (i = 0; i < nr_pages; i++) { + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); + if (!page) + goto free_pages; + list_add(&page->list, &pages); + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + page->page = (void *)addr; + } + + list_splice(&pages, head); + + rb_check_pages(cpu_buffer); + + return 0; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, list) { + list_del_init(&page->list); + free_buffer_page(page); + } + return -ENOMEM; +} + +static struct ring_buffer_per_cpu * +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *page; + unsigned long addr; + int ret; + + cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpu_buffer) + return NULL; + + cpu_buffer->cpu = cpu; + cpu_buffer->buffer = buffer; + spin_lock_init(&cpu_buffer->lock); + INIT_LIST_HEAD(&cpu_buffer->pages); + + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!page) + goto fail_free_buffer; + + cpu_buffer->reader_page = page; + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto fail_free_reader; + page->page = (void *)addr; + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + + ret = rb_allocate_pages(cpu_buffer, buffer->pages); + if (ret < 0) + goto fail_free_reader; + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + return cpu_buffer; + + fail_free_reader: + free_buffer_page(cpu_buffer->reader_page); + + fail_free_buffer: + kfree(cpu_buffer); + return NULL; +} + +static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + + list_del_init(&cpu_buffer->reader_page->list); + free_buffer_page(cpu_buffer->reader_page); + + list_for_each_entry_safe(page, tmp, head, list) { + list_del_init(&page->list); + free_buffer_page(page); + } + kfree(cpu_buffer); +} + +/* + * Causes compile errors if the struct buffer_page gets bigger + * than the struct page. + */ +extern int ring_buffer_page_too_big(void); + +/** + * ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes that is needed. + * @flags: attributes to set for the ring buffer. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +{ + struct ring_buffer *buffer; + int bsize; + int cpu; + + /* Paranoid! Optimizes out when all is well */ + if (sizeof(struct buffer_page) > sizeof(struct page)) + ring_buffer_page_too_big(); + + + /* keep it in its own cache line */ + buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), + GFP_KERNEL); + if (!buffer) + return NULL; + + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + buffer->flags = flags; + + /* need at least two pages */ + if (buffer->pages == 1) + buffer->pages++; + + buffer->cpumask = cpu_possible_map; + buffer->cpus = nr_cpu_ids; + + bsize = sizeof(void *) * nr_cpu_ids; + buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), + GFP_KERNEL); + if (!buffer->buffers) + goto fail_free_buffer; + + for_each_buffer_cpu(buffer, cpu) { + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; + } + + mutex_init(&buffer->mutex); + + return buffer; + + fail_free_buffers: + for_each_buffer_cpu(buffer, cpu) { + if (buffer->buffers[cpu]) + rb_free_cpu_buffer(buffer->buffers[cpu]); + } + kfree(buffer->buffers); + + fail_free_buffer: + kfree(buffer); + return NULL; +} + +/** + * ring_buffer_free - free a ring buffer. + * @buffer: the buffer to free. + */ +void +ring_buffer_free(struct ring_buffer *buffer) +{ + int cpu; + + for_each_buffer_cpu(buffer, cpu) + rb_free_cpu_buffer(buffer->buffers[cpu]); + + kfree(buffer); +} + +static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + +static void +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +{ + struct buffer_page *page; + struct list_head *p; + unsigned i; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + for (i = 0; i < nr_pages; i++) { + BUG_ON(list_empty(&cpu_buffer->pages)); + p = cpu_buffer->pages.next; + page = list_entry(p, struct buffer_page, list); + list_del_init(&page->list); + free_buffer_page(page); + } + BUG_ON(list_empty(&cpu_buffer->pages)); + + rb_reset_cpu(cpu_buffer); + + rb_check_pages(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + +} + +static void +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *pages, unsigned nr_pages) +{ + struct buffer_page *page; + struct list_head *p; + unsigned i; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + for (i = 0; i < nr_pages; i++) { + BUG_ON(list_empty(pages)); + p = pages->next; + page = list_entry(p, struct buffer_page, list); + list_del_init(&page->list); + list_add_tail(&page->list, &cpu_buffer->pages); + } + rb_reset_cpu(cpu_buffer); + + rb_check_pages(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_resize - resize the ring buffer + * @buffer: the buffer to resize. + * @size: the new size. + * + * The tracer is responsible for making sure that the buffer is + * not being used while changing the size. + * Note: We may be able to change the above requirement by using + * RCU synchronizations. + * + * Minimum size is 2 * BUF_PAGE_SIZE. + * + * Returns -1 on failure. + */ +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned nr_pages, rm_pages, new_pages; + struct buffer_page *page, *tmp; + unsigned long buffer_size; + unsigned long addr; + LIST_HEAD(pages); + int i, cpu; + + size = DIV_ROUND_UP(size, BUF_P |