diff options
author | Ingo Molnar <mingo@elte.hu> | 2011-07-21 09:32:40 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-07-21 09:32:40 +0200 |
commit | 40bcea7bbe8fe452a2d272e2ffd3dea281eec9ff (patch) | |
tree | aedb6d02e53e3cf84cc32fd81db84032cee205e1 | |
parent | 492f73a303b488ffd67097b2351d54aa6e6c7c73 (diff) | |
parent | 14a8fd7ceea6915c613746203d6e9a2bf273f16c (diff) |
Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/core
26 files changed, 1663 insertions, 1003 deletions
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index c83bd6b4e6e..d0d0bb9e3e2 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -22,14 +22,15 @@ current_tracer. Instead of that, add probe points via Synopsis of kprobe_events ------------------------- - p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe + p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe + r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe -:[GRP/]EVENT : Clear a probe GRP : Group name. If omitted, use "kprobes" for it. EVENT : Event name. If omitted, the event name is generated - based on SYMBOL+offs or MEMADDR. - SYMBOL[+offs] : Symbol+offset where the probe is inserted. + based on SYM+offs or MEMADDR. + MOD : Module name which has given SYM. + SYM[+offs] : Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. FETCHARGS : Arguments. Each probe can have up to 128 args. diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 56fd9e3abbd..4d86c86178f 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -102,6 +102,14 @@ #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) /* + * If an event has alias it should be marked + * with a special bit. (Don't forget to check + * P4_PEBS_CONFIG_MASK and related bits on + * modification.) + */ +#define P4_CONFIG_ALIASABLE (1 << 9) + +/* * The bits we allow to pass for RAW events */ #define P4_CONFIG_MASK_ESCR \ @@ -123,6 +131,31 @@ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) +/* + * In case of event aliasing we need to preserve some + * caller bits otherwise the mapping won't be complete. + */ +#define P4_CONFIG_EVENT_ALIAS_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ + p4_config_pack_cccr(P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE)) + +#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \ + ((P4_CONFIG_HT) | \ + p4_config_pack_escr(P4_ESCR_T0_OS | \ + P4_ESCR_T0_USR | \ + P4_ESCR_T1_OS | \ + P4_ESCR_T1_USR) | \ + p4_config_pack_cccr(P4_CCCR_OVF | \ + P4_CCCR_CASCADE | \ + P4_CCCR_FORCE_OVF | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_OVF_PMI_T0 | \ + P4_CCCR_OVF_PMI_T1 | \ + P4_CONFIG_ALIASABLE)) + static inline bool p4_is_event_cascaded(u64 config) { u32 cccr = p4_config_unpack_cccr(config); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c53d433c3dd..b7a010fce8c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -274,7 +274,6 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); - void (*hw_watchdog_set_attr)(struct perf_event_attr *attr); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -360,12 +359,6 @@ static u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -void hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - if (x86_pmu.hw_watchdog_set_attr) - x86_pmu.hw_watchdog_set_attr(wd_attr); -} - /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index fb901c5080f..8b7a0c30678 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -570,11 +570,92 @@ static __initconst const u64 p4_hw_cache_event_ids }, }; +/* + * Because of Netburst being quite restricted in now + * many same events can run simultaneously, we use + * event aliases, ie different events which have the + * same functionallity but use non-intersected resources + * (ESCR/CCCR/couter registers). This allow us to run + * two or more semi-same events together. It is done + * transparently to a user space. + * + * Never set any cusom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up-to-dated automatically either not appliable + * at all. + * + * And be really carefull choosing aliases! + */ +struct p4_event_alias { + u64 orig; + u64 alter; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with + * non-sleeping cycles (see Intel SDM Vol3b for + * details). + */ + .orig = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alter = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Probably we're lucky and don't have to do + * matching over all config bits. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + /* + * If an event was previously swapped to the alter config + * we should swap it back otherwise contnention on registers + * will return back. + */ + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].orig) { + config_match = p4_event_aliases[i].alter; + break; + } else if (config_match == p4_event_aliases[i].alter) { + config_match = p4_event_aliases[i].orig; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | + (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + static u64 p4_general_events[PERF_COUNT_HW_MAX] = { /* non-halted CPU clocks */ [PERF_COUNT_HW_CPU_CYCLES] = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, /* * retired instructions @@ -719,31 +800,6 @@ static int p4_validate_raw_event(struct perf_event *event) return 0; } -static void p4_hw_watchdog_set_attr(struct perf_event_attr *wd_attr) -{ - /* - * Watchdog ticks are special on Netburst, we use - * that named "non-sleeping" ticks as recommended - * by Intel SDM Vol3b. - */ - WARN_ON_ONCE(wd_attr->type != PERF_TYPE_HARDWARE || - wd_attr->config != PERF_COUNT_HW_CPU_CYCLES); - - wd_attr->type = PERF_TYPE_RAW; - wd_attr->config = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3)) | - p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | - P4_CCCR_COMPARE); -} - static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); @@ -1159,6 +1215,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign struct p4_event_bind *bind; unsigned int i, thread, num; int cntr_idx, escr_idx; + u64 config_alias; + int pass; bitmap_zero(used_mask, X86_PMC_IDX_MAX); bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); @@ -1167,6 +1225,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign hwc = &cpuc->event_list[i]->hw; thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * Aliases are swappable so we may hit circular + * lock if both original config and alias need + * resources (MSR registers) which already busy. + */ + if (pass > 2) + goto done; + bind = p4_config_get_bind(hwc->config); escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); if (unlikely(escr_idx == -1)) @@ -1180,8 +1249,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign } cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) - goto done; + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Probably an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } p4_pmu_swap_config_ts(hwc, cpu); if (assign) @@ -1218,7 +1296,6 @@ static __initconst const struct x86_pmu p4_pmu = { .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, - .hw_watchdog_set_attr = p4_hw_watchdog_set_attr, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index b1e69eefc20..96efa6794ea 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -76,6 +76,7 @@ struct trace_iterator { struct trace_entry *ent; unsigned long lost_events; int leftover; + int ent_size; int cpu; u64 ts; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 77981813a1e..b30fd54eb98 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr) /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. + * This returns encoded errors if it fails to look up symbol or invalid + * combination of parameters. */ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) { kprobe_opcode_t *addr = p->addr; + + if ((p->symbol_name && p->addr) || + (!p->symbol_name && !p->addr)) + goto invalid; + if (p->symbol_name) { - if (addr) - return NULL; kprobe_lookup_name(p->symbol_name, addr); + if (!addr) + return ERR_PTR(-ENOENT); } - if (!addr) - return NULL; - return (kprobe_opcode_t *)(((char *)addr) + p->offset); + addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); + if (addr) + return addr; + +invalid: + return ERR_PTR(-EINVAL); } /* Check passed kprobe is valid and return kprobe in kprobe_table. */ @@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p) kprobe_opcode_t *addr; addr = kprobe_addr(p); - if (!addr) - return -EINVAL; + if (IS_ERR(addr)) + return PTR_ERR(addr); p->addr = addr; ret = check_kprobe_rereg(p); @@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p) */ probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { + /* Return -ENOENT if fail. */ + ret = -ENOENT; /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. @@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p) module_put(probed_mod); goto fail_with_jump_label; } + /* ret will be updated by following code */ } preempt_enable(); jump_label_unlock(); @@ -1399,7 +1412,7 @@ out: fail_with_jump_label: preempt_enable(); jump_label_unlock(); - return -EINVAL; + return ret; } EXPORT_SYMBOL_GPL(register_kprobe); @@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); - if (!addr) - return -EINVAL; + if (IS_ERR(addr)) + return PTR_ERR(addr); for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { if (kretprobe_blacklist[i].addr == addr) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a0e246e2cee..c3e4575e782 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -88,6 +88,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -146,9 +147,11 @@ void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; __ftrace_trace_function = ftrace_stub; + __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } +#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST /* * For those archs that do not test ftrace_trace_stop in their @@ -208,7 +211,12 @@ static void update_ftrace_function(void) #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; #else +#ifdef CONFIG_DYNAMIC_FTRACE + /* do not update till all functions have been modified */ + __ftrace_trace_function_delay = func; +#else __ftrace_trace_function = func; +#endif ftrace_trace_function = ftrace_test_stop_func; #endif } @@ -1170,8 +1178,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; } +static void +ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); + static int -ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tp, *tn; @@ -1181,9 +1195,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) unsigned long key; int size = src->count; int bits = 0; + int ret; int i; /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(ops, enable); + + /* * If the new source is empty, just free dst and assign it * the empty_hash. */ @@ -1203,9 +1224,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; + ret = -ENOMEM; new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + goto out; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1224,7 +1246,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - return 0; + ret = 0; + out: + /* + * Enable regardless of ret: + * On success, we enable the new hash. + * On failure, we re-enable the original hash. + */ + ftrace_hash_rec_enable(ops, enable); + + return ret; } /* @@ -1584,6 +1615,12 @@ static int __ftrace_modify_code(void *data) { int *command = data; + /* + * Do not call function tracer while we update the code. + * We are in stop machine, no worrying about races. + */ + function_trace_stop++; + if (*command & FTRACE_ENABLE_CALLS) ftrace_replace_code(1); else if (*command & FTRACE_DISABLE_CALLS) @@ -1597,6 +1634,18 @@ static int __ftrace_modify_code(void *data) else if (*command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + /* + * For archs that call ftrace_test_stop_func(), we must + * wait till after we update all the function callers + * before we update the callback. This keeps different + * ops that record different functions from corrupting + * each other. + */ + __ftrace_trace_function = __ftrace_trace_function_delay; +#endif + function_trace_stop--; + return 0; } @@ -2865,7 +2914,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ftrace_match_records(hash, buf, len); mutex_lock(&ftrace_lock); - ret = ftrace_hash_move(orig_hash, hash); + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_regex_lock); @@ -3048,18 +3101,12 @@ ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->notrace_hash; mutex_lock(&ftrace_lock); - /* - * Remove the current set, update the hash and add - * them back. - */ - ftrace_hash_rec_disable(iter->ops, filter_hash); - ret = ftrace_hash_move(orig_hash, iter->hash); - if (!ret) { - ftrace_hash_rec_enable(iter->ops, filter_hash); - if (iter->ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - } + ret = ftrace_hash_move(iter->ops, filter_hash, + orig_hash, iter->hash); + if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); } free_ftrace_hash(iter->hash); @@ -3338,7 +3385,7 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; - unsigned long flags; + unsigned long flags = 0; /* Shut up gcc */ mutex_lock(&ftrace_lock); p = start; @@ -3356,12 +3403,18 @@ static int ftrace_process_locs(struct module *mod, } /* - * Disable interrupts to prevent interrupts from executing - * code that is being modified. + * We only need to disable interrupts on start up + * because we are modifying code that an interrupt + * may execute, and the modification is not atomic. + * But for modules, nothing runs the code we modify + * until we are finished with it, and there's no + * reason to cause large interrupt latencies while we do it. */ - local_irq_save(flags); + if (!mod) + local_irq_save(flags); ftrace_update_code(mod); - local_irq_restore(flags); + if (!mod) + local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9c16123f6e..e5df02c69b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1248,6 +1248,15 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, } #ifdef CONFIG_STACKTRACE + +#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +struct ftrace_stack { + unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; +}; + +static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(int, ftrace_stack_reserve); + static void __ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc, struct pt_regs *regs) @@ -1256,25 +1265,77 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; + int use_stack; + int size = FTRACE_STACK_ENTRIES; + + trace.nr_entries = 0; + trace.skip = skip; + + /* + * Since events can happen in NMIs there's no safe way to + * use the per cpu ftrace_stacks. We reserve it and if an interrupt + * or NMI comes in, it will just have to use the default + * FTRACE_STACK_SIZE. + */ + preempt_disable_notrace(); + + use_stack = ++__get_cpu_var(ftrace_stack_reserve); + /* + * We don't need any atomic variables, just a barrier. + * If an interrupt comes in, we don't care, because it would + * have exited and put the counter back to what we want. + * We just need a barrier to keep gcc from moving things + * around. + */ + barrier(); + if (use_stack == 1) { + trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.max_entries = FTRACE_STACK_MAX_ENTRIES; + + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + + if (trace.nr_entries > size) + size = trace.nr_entries; + } else + /* From now on, use_stack is a boolean */ + use_stack = 0; + + size *= sizeof(unsigned long); event = trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry), flags, pc); + sizeof(*entry) + size, flags, pc); if (!event) - return; - entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, sizeof(entry->caller)); + goto out; + entry = ring_buffer_event_data(event); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = skip; - trace.entries = entry->caller; + memset(&entry->caller, 0, size); + + if (use_stack) + memcpy(&entry->caller, trace.entries, + trace.nr_entries * sizeof(unsigned long)); + else { + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.entries = entry->caller; + if (regs) + save_stack_trace_regs(regs, &trace); + else + save_stack_trace(&trace); + } + + entry->size = trace.nr_entries; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + out: + /* Again, don't let gcc optimize things here */ + barrier(); + __get_cpu_var(ftrace_stack_reserve)--; + preempt_enable_notrace(); + } void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, @@ -1562,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, ftrace_enable_cpu(); - return event ? ring_buffer_event_data(event) : NULL; + if (event) { + iter->ent_size = ring_buffer_event_length(event); + return ring_buffer_event_data(event); + } + iter->ent_size = 0; + return NULL; } static struct trace_entry * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 30a94c26dcb..3f381d0b20a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -278,6 +278,29 @@ struct tracer { }; +/* Only current can touch trace_recursion */ +#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) +#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) + +/* Ring buffer has the 10 LSB bits to count */ +#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) + +/* for function tracing recursion */ +#define TRACE_INTERNAL_BIT (1<<11) +#define TRACE_GLOBAL_BIT (1<<12) +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ +#define TRACE_IRQ_BIT (1<<13) + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + #define TRACE_PIPE_ALL_CPU -1 int tracer_init(struct tracer *t, struct trace_array *tr); @@ -516,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) + if (addr == ftrace_graph_funcs[i]) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. + */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); return 1; + } } return 0; @@ -795,19 +828,4 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" -/* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) - -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) - -/* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) - #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e32744c84d9..93365907f21 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, F_STRUCT( - __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + __field( int, size ) + __dynamic_array(unsigned long, caller ) ), F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e8d6bb55d71..a7d2a4c653d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -227,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr, static inline int ftrace_graph_ignore_irqs(void) { - if (!ftrace_graph_skip_irqs) + if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; return in_irq(); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7db7b68c6c3..5fb3697bf0e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) DEFINE_FETCH_deref(string_size) +static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) +{ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) @@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string_size NULL static __kprobes void +update_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + update_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + update_symbol_cache(data->orig.data); +} + +static __kprobes void free_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* @@ -389,6 +410,7 @@ free_bi |