diff options
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/blktrace.c | 16 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 52 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 24 | ||||
-rw-r--r-- | kernel/trace/trace.c | 38 | ||||
-rw-r--r-- | kernel/trace/trace.h | 41 | ||||
-rw-r--r-- | kernel/trace/trace_entries.h | 6 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 885 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 111 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 36 | ||||
-rw-r--r-- | kernel/trace/trace_sched_switch.c | 48 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 42 |
12 files changed, 1009 insertions, 292 deletions
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d95721f3370..cbafed7d4f3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1827,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i] = '\0'; } -void blk_fill_rwbs_rq(char *rwbs, struct request *rq) -{ - int rw = rq->cmd_flags & 0x03; - int bytes; - - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - - bytes = blk_rq_bytes(rq); - - blk_fill_rwbs(rwbs, rw, bytes); -} - #endif /* CONFIG_EVENT_TRACING */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae8388..888b611897d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) /* The cpu_boot init_task->ret_stack will never be freed */ for_each_online_cpu(cpu) { if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_task(idle_task(cpu)); + ftrace_graph_init_idle_task(idle_task(cpu), cpu); } do { @@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) mutex_unlock(&ftrace_lock); } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { @@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; + graph_init_task(t, ret_stack); } } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6ee56b4ad13..d9c8bcafb12 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5,7 +5,6 @@ */ #include <linux/ring_buffer.h> #include <linux/trace_clock.h> -#include <linux/ftrace_irq.h> #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/uaccess.h> @@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { + int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + local_clock_stable = sched_clock_stable; +#endif WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + local_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb8058..9541c27c1cf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,8 +41,6 @@ #include "trace.h" #include "trace_output.h" -#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -425,6 +423,7 @@ static const char *trace_options[] = { "sleep-time", "graph-time", "record-cmd", + "overwrite", NULL }; @@ -780,6 +779,11 @@ __acquires(kernel_lock) tracing_reset_online_cpus(tr); current_trace = type; + + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, trace_buf_size); + /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); @@ -792,6 +796,10 @@ __acquires(kernel_lock) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); + /* Shrink the max buffer again */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, 1); + printk(KERN_CONT "PASSED\n"); } #endif @@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1749,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| /_--=> lock-depth \n"); - seq_puts(m, "# |||||/ delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -2529,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) + ring_buffer_change_overwrite(global_trace.buffer, enabled); } static ssize_t @@ -2710,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (tracer_enabled ^ val) { + + /* Only need to warn if this is used to change the state */ + WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); + if (val) { tracer_enabled = 1; if (current_trace->start) @@ -4551,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) __init static int tracer_alloc_buffers(void) { int ring_buf_size; + enum ring_buffer_flags rb_flags; int i; int ret = -ENOMEM; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -4566,12 +4581,13 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); @@ -4581,7 +4597,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, rb_flags); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c..5e9dfc6286d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,8 +272,8 @@ struct tracer { /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; - int print_max; struct tracer_flags *flags; + int print_max; int use_max_tr; }; @@ -606,6 +606,7 @@ enum trace_iterator_flags { TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, + TRACE_ITER_OVERWRITE = 0x200000, }; /* @@ -661,8 +662,10 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; - struct filter_pred **preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ + struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -674,11 +677,23 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) + +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. + */ +#define MAX_FILTER_PRED 16384 + struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); @@ -700,11 +715,23 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf223764be..1516cb3ec54 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, */ #define FTRACE_CTX_FIELDS \ __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ __field( unsigned char, prev_prio ) \ __field( unsigned char, prev_state ) \ - __field( unsigned int, next_pid ) \ __field( unsigned char, next_prio ) \ - __field( unsigned char, next_state ) \ - __field( unsigned int, next_cpu ) + __field( unsigned char, next_state ) FTRACE_ENTRY(context_switch, ctx_switch_entry, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e0438a..e88f74fe1d4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, lock_depth); return ret; } @@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) { return __ftrace_set_clr_event(NULL, system, event, set); } +EXPORT_SYMBOL_GPL(trace_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17..3249b4f77ef 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,9 +123,13 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. */ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + +/* + * A series of AND or ORs where found together. Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int type; + int match; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; + struct filter_pred *preds; struct filter_pred *pred; - int i; + struct filter_pred *root; + int n_preds; + int done = 0; + + /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + + if (!n_preds) + return 1; + + /* + * n_preds, root and filter->preds are protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; + + pred = root; - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); - stack[top++] = match; + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else { + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + /* All leafs allow folding the parent ops. */ + dest->index |= FILTER_PRED_FOLD; + } + + return __push_pred_stack(stack, dest); } -static void filter_disable_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; - call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void filter_disable(struct ftrace_event_call *call) { - int i; + call->flags &= ~TRACE_EVENT_FL_FILTERED; +} +static void __free_filter(struct event_filter *filter) +{ if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. + */ void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ struct filter_pred *pred; int i; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); + if (filter->preds) + __free_preds(filter); - filter->n_preds = 0; + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); if (!filter->preds) - goto oom; + return -ENOMEM; - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + filter->a_preds = n_preds; + filter->n_preds = 0; + + for (i = 0; i < n_preds; i++) { + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } - return filter; - -oom: - __free_preds(filter); - return ERR_PTR(-ENOMEM); -} - -static int init_preds(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - return 0; } -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; - int err; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); - if (err) - return err; + filter_disable(call); + remove_filter_string(call->filter); } - - return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system) { struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - - filter_disable_preds(call); - remove_filter_string(call->filter); + __free_filter(call->filter); + call->filter = NULL; } } @@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; - fn = filter_pred_and; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; - fn = filter_pred_or; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed t |