From bfdb4d9f0f611687d71cf6a460efc9e755f4a462 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Tue, 23 Jun 2009 10:00:58 +0530 Subject: timers: Fix timer_migration interface which accepts any number as input Poornima Nayek reported: | Timer migration interface /proc/sys/kernel/timer_migration in | 2.6.30-git9 accepts any numerical value as input. | | Steps to reproduce: | 1. echo -6666666 > /proc/sys/kernel/timer_migration | 2. cat /proc/sys/kernel/timer_migration | -6666666 | | 1. echo 44444444444444444444444444444444444444444444444444444444444 > /proc/sys/kernel/timer_migration | 2. cat /proc/sys/kernel/timer_migration | -1357789412 | | Expected behavior: Should 'echo: write error: Invalid argument' while | setting any value other then 0 & 1 Restrict valid values to 0 and 1. Reported-by: Poornima Nayak Tested-by: Poornima Nayak Signed-off-by: Arun R Bharadwaj Cc: poornima nayak Cc: Arun Bharadwaj LKML-Reference: <20090623043058.GA3249@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62e4ff9968b..c428ba161db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -335,7 +335,10 @@ static struct ctl_table kern_table[] = { .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, }, #endif { -- cgit v1.2.3-18-g5258 From f29ac756a40d0f1bb07d682ea521e7b666ff06d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jun 2009 18:27:26 +0200 Subject: perf_counter: Optimize perf_swcounter_event() Similar to tracepoints, use an enable variable to reduce overhead when unused. Only look for a counter of a particular event type when we know there is at least one in the system. Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea..7515c769542 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3317,8 +3317,8 @@ out: put_cpu_var(perf_cpu_context); } -void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +void __perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { .regs = regs, @@ -3509,9 +3509,19 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) } #endif +atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_counter_destroy(struct perf_counter *counter) +{ + u64 event = counter->attr.config; + + atomic_dec(&perf_swcounter_enabled[event]); +} + static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { const struct pmu *pmu = NULL; + u64 event = counter->attr.config; /* * Software counters (currently) can't in general distinguish @@ -3520,7 +3530,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->attr.config) { + switch (event) { case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3541,6 +3551,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: + atomic_inc(&perf_swcounter_enabled[event]); + counter->destroy = sw_perf_counter_destroy; pmu = &perf_ops_generic; break; } -- cgit v1.2.3-18-g5258 From b84fbc9fb1d943e2c5f4efe52ed0e3c93a4bdb6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 13:57:40 +0200 Subject: perf_counter: Push inherit into perf_counter_alloc() Teach perf_counter_alloc() about inheritance so that we can optimize the inherit path in the next patch. Remove the child_counter->atrr.inherit = 1 line because the only way to get there is if parent_counter->attr.inherit == 1 and we copy the attrs. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7515c769542..0a45490f402 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3568,6 +3568,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, + struct perf_counter *parent_counter, gfp_t gfpflags) { const struct pmu *pmu; @@ -3603,6 +3604,8 @@ perf_counter_alloc(struct perf_counter_attr *attr, counter->ctx = ctx; counter->oncpu = -1; + counter->parent = parent_counter; + counter->ns = get_pid_ns(current->nsproxy->pid_ns); counter->id = atomic64_inc_return(&perf_counter_id); @@ -3827,7 +3830,7 @@ SYSCALL_DEFINE5(perf_counter_open, } counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - GFP_KERNEL); + NULL, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) goto err_put_context; @@ -3893,7 +3896,8 @@ inherit_counter(struct perf_counter *parent_counter, child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, - group_leader, GFP_KERNEL); + group_leader, parent_counter, + GFP_KERNEL); if (IS_ERR(child_counter)) return child_counter; get_ctx(child_ctx); @@ -3916,12 +3920,6 @@ inherit_counter(struct perf_counter *parent_counter, */ add_counter_to_ctx(child_counter, child_ctx); - child_counter->parent = parent_counter; - /* - * inherit into child's child as well: - */ - child_counter->attr.inherit = 1; - /* * Get a reference to the parent filp - we will fput it * when the child counter exits. This is safe to do because -- cgit v1.2.3-18-g5258 From f344011ccb85469445369153c3d27c4ee4bc2ac8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 13:58:35 +0200 Subject: perf_counter: Optimize perf_counter_alloc()'s inherit case We don't need to add usage counts for swcounter and attr usage models for inherited counters since the parent counter will always have one, which suffices to generate the needed output. This avoids up to 3 global atomic increments per inherited counter. LKML-Reference: Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0a45490f402..c2b19c11171 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1508,11 +1508,13 @@ static void free_counter(struct perf_counter *counter) { perf_pending_sync(counter); - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); + if (!counter->parent) { + atomic_dec(&nr_counters); + if (counter->attr.mmap) + atomic_dec(&nr_mmap_counters); + if (counter->attr.comm) + atomic_dec(&nr_comm_counters); + } if (counter->destroy) counter->destroy(counter); @@ -3515,6 +3517,8 @@ static void sw_perf_counter_destroy(struct perf_counter *counter) { u64 event = counter->attr.config; + WARN_ON(counter->parent); + atomic_dec(&perf_swcounter_enabled[event]); } @@ -3551,8 +3555,10 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: - atomic_inc(&perf_swcounter_enabled[event]); - counter->destroy = sw_perf_counter_destroy; + if (!counter->parent) { + atomic_inc(&perf_swcounter_enabled[event]); + counter->destroy = sw_perf_counter_destroy; + } pmu = &perf_ops_generic; break; } @@ -3663,11 +3669,13 @@ done: counter->pmu = pmu; - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); + if (!counter->parent) { + atomic_inc(&nr_counters); + if (counter->attr.mmap) + atomic_inc(&nr_mmap_counters); + if (counter->attr.comm) + atomic_inc(&nr_comm_counters); + } return counter; } -- cgit v1.2.3-18-g5258 From e1c7e2a6e67fe9db19dd15e71614526a31b5fdb1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:52:29 +0800 Subject: tracing/events: Don't increment @pos in s_start() While testing syscall tracepoints posted by Jason, I found 3 entries were missing when reading available_events. The output size of available_events is < 4 pages, which means we lost 1 entry per page. The cause is, it's wrong to increment @pos in s_start(). Actually there's another bug here -- reading avaiable_events/set_events can race with module unload: # cat available_events | s_start() | s_stop() | | # rmmod foo.ko s_start() | call = list_entry(m->private) | @call might be freed and accessing it will lead to crash. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4186DD.6090405@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..53c8fd376a8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -300,10 +300,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call = NULL; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return t_next(m, NULL, pos); + + m->private = ftrace_events.next; + for (l = 0; l <= *pos; ) { + call = t_next(m, NULL, &l); + if (!call) + break; + } + return call; } static void * @@ -332,10 +340,18 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { + struct ftrace_event_call *call = NULL; + loff_t l; + mutex_lock(&event_mutex); - if (*pos == 0) - m->private = ftrace_events.next; - return s_next(m, NULL, pos); + + m->private = ftrace_events.next; + for (l = 0; l <= *pos; ) { + call = s_next(m, NULL, &l); + if (!call) + break; + } + return call; } static int t_show(struct seq_file *m, void *v) -- cgit v1.2.3-18-g5258 From c8961ec6da22ea010bf4470a8e0fb3fdad0f11c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:52:58 +0800 Subject: tracing_bprintk: Don't increment @pos in t_start() It's wrong to increment @pos in t_start(), otherwise we'll lose some entries when reading printk_formats, if the output is larger than PAGE_SIZE. Reported-by: Lai Jiangshan Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4186FA.1020106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_printk.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 9bece9687b6..7b627811082 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -155,25 +155,19 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) EXPORT_SYMBOL_GPL(__ftrace_vprintk); static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +t_start(struct seq_file *m, loff_t *pos) { - const char **fmt = m->private; - const char **next = fmt; - - (*pos)++; + const char **fmt = __start___trace_bprintk_fmt + *pos; if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) return NULL; - - next = fmt; - m->private = ++next; - return fmt; } -static void *t_start(struct seq_file *m, loff_t *pos) +static void *t_next(struct seq_file *m, void * v, loff_t *pos) { - return t_next(m, NULL, pos); + (*pos)++; + return t_start(m, pos); } static int t_show(struct seq_file *m, void *v) @@ -224,15 +218,7 @@ static const struct seq_operations show_format_seq_ops = { static int ftrace_formats_open(struct inode *inode, struct file *file) { - int ret; - - ret = seq_open(file, &show_format_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = __start___trace_bprintk_fmt; - } - return ret; + return seq_open(file, &show_format_seq_ops); } static const struct file_operations ftrace_formats_fops = { -- cgit v1.2.3-18-g5258 From 2961bf345fd1b736c3db46cad0f69855f67fbe9c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:53:26 +0800 Subject: trace_stat: Don't increment @pos in seq start() It's wrong to increment @pos in stat_seq_start(). It causes some stat entries lost when reading stat file, if the output of the file is larger than PAGE_SIZE. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418716.90209@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index c00643733f4..e66f5e49334 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -199,17 +199,13 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) { - (*pos)++; + if (!*pos && session->ts->stat_headers) return SEQ_START_TOKEN; - } node = rb_first(&session->stat_root); for (i = 0; node && i < *pos; i++) node = rb_next(node); - (*pos)++; - return node; } -- cgit v1.2.3-18-g5258 From f129e965bef40c6153e4fe505f1e408286213424 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:53:44 +0800 Subject: tracing: Reset iterator in t_start() The iterator is m->private, but it's not reset to trace_types in t_start(). If the output is larger than PAGE_SIZE and t_start() is called the 2nd time, things will go wrong. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418728.5020506@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 076fa6f0ee4..3bb31006b5c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2053,25 +2053,23 @@ static int tracing_open(struct inode *inode, struct file *file) static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t = v; (*pos)++; if (t) t = t->next; - m->private = t; - return t; } static void *t_start(struct seq_file *m, loff_t *pos) { - struct tracer *t = m->private; + struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (; t && l < *pos; t = t_next(m, t, &l)) + for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) ; return t; @@ -2107,18 +2105,10 @@ static struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { - int ret; - if (tracing_disabled) return -ENODEV; - ret = seq_open(file, &show_traces_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = trace_types; - } - - return ret; + return seq_open(file, &show_traces_seq_ops); } static ssize_t -- cgit v1.2.3-18-g5258 From 85951842a1020669f0a9eb0f0d1853b41341f097 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:00 +0800 Subject: ftrace: Don't increment @pos in g_start() It's wrong to increment @pos in g_start(). It causes some entries lost when reading set_graph_function, if the output of the file is larger than PAGE_SIZE. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A418738.7090401@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3718d55fb4c..cde74b9973b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2500,32 +2500,31 @@ int ftrace_graph_count; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * -g_next(struct seq_file *m, void *v, loff_t *pos) +__g_next(struct seq_file *m, loff_t *pos) { unsigned long *array = m->private; - int index = *pos; - - (*pos)++; - if (index >= ftrace_graph_count) + if (*pos >= ftrace_graph_count) return NULL; + return &array[*pos]; +} - return &array[index]; +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __g_next(m, pos); } static void *g_start(struct seq_file *m, loff_t *pos) { - void *p = NULL; - mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ if (!ftrace_graph_count && !*pos) return (void *)1; - p = g_next(m, p, pos); - - return p; + return __g_next(m, pos); } static void g_stop(struct seq_file *m, void *p) -- cgit v1.2.3-18-g5258 From 694ce0a544fba37a60025a6803ee6265be8a2a22 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:19 +0800 Subject: ftrace: Don't manipulate @pos in t_start() It's rather confusing that in t_start(), in some cases @pos is incremented, and in some cases it's decremented and then incremented. This patch rewrites t_start() in a much more general way. Thus we fix a bug that if ftrace_filtered == 1, functions have tracer hooks won't be printed, because the branch is always unreachable: static void *t_start(...) { ... if (!p) return t_hash_start(m, pos); return p; } Before: # echo 'sys_open' > /mnt/tracing/set_ftrace_filter # echo 'sys_write:traceon:4' >> /mnt/tracing/set_ftrace_filter sys_open After: # echo 'sys_open' > /mnt/tracing/set_ftrace_filter # echo 'sys_write:traceon:4' >> /mnt/tracing/set_ftrace_filter sys_open sys_write:traceon:count=4 Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41874B.4090507@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cde74b9973b..dc810208edd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1467,8 +1467,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) iter->pg = iter->pg->next; iter->idx = 0; goto retry; - } else { - iter->idx = -1; } } else { rec = &iter->pg->records[iter->idx++]; @@ -1497,6 +1495,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; mutex_lock(&ftrace_lock); /* @@ -1508,23 +1507,21 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; - (*pos)++; return iter; } if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); - if (*pos > 0) { - if (iter->idx < 0) - return p; - (*pos)--; - iter->idx--; + iter->pg = ftrace_pages_start; + iter->idx = 0; + for (l = 0; l <= *pos; ) { + p = t_next(m, p, &l); + if (!p) + break; } - p = t_next(m, p, pos); - - if (!p) + if (!p && iter->flags & FTRACE_ITER_FILTER) return t_hash_start(m, pos); return p; -- cgit v1.2.3-18-g5258 From d82d62444f87e5993af2fa82ed636b2206e052ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 09:54:54 +0800 Subject: ftrace: Fix t_hash_start() When the output of set_ftrace_filter is larger than PAGE_SIZE, t_hash_start() will be called the 2nd time, and then we start from the head of a hlist, which is wrong and causes some entries to be outputed twice. The worse is, if the hlist is large enough, reading set_ftrace_filter won't stop but in a dead loop. Reviewed-by: Liming Wang Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41876E.2060407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dc810208edd..71a52c17214 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1417,10 +1417,20 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; + loff_t l; + + if (!(iter->flags & FTRACE_ITER_HASH)) + *pos = 0; iter->flags |= FTRACE_ITER_HASH; - return t_hash_next(m, p, pos); + iter->hidx = 0; + for (l = 0; l <= *pos; ) { + p = t_hash_next(m, p, &l); + if (!p) + break; + } + return p; } static int t_hash_show(struct seq_file *m, void *v) -- cgit v1.2.3-18-g5258 From 507e123151149e578c9aae33eb876c49824da5f8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Jun 2009 17:38:15 +0200 Subject: timer stats: Optimize by adding quick check to avoid function calls When the kernel is configured with CONFIG_TIMER_STATS but timer stats are runtime disabled we still get calls to __timer_stats_timer_set_start_info which initializes some fields in the corresponding struct timer_list. So add some quick checks in the the timer stats setup functions to avoid function calls to __timer_stats_timer_set_start_info when timer stats are disabled. In an artificial workload that does nothing but playing ping pong with a single tcp packet via loopback this decreases cpu consumption by 1 - 1.5%. This is part of a modified function trace output on SLES11: perl-2497 [00] 28630647177732388 [+ 125]: sk_reset_timer <-tcp_v4_rcv perl-2497 [00] 28630647177732513 [+ 125]: mod_timer <-sk_reset_timer perl-2497 [00] 28630647177732638 [+ 125]: __timer_stats_timer_set_start_info <-mod_timer perl-2497 [00] 28630647177732763 [+ 125]: __mod_timer <-mod_timer perl-2497 [00] 28630647177732888 [+ 125]: __timer_stats_timer_set_start_info <-__mod_timer perl-2497 [00] 28630647177733013 [+ 93]: lock_timer_base <-__mod_timer Signed-off-by: Heiko Carstens Cc: Andrew Morton Cc: Martin Schwidefsky Cc: Mustafa Mesanovic Cc: Arjan van de Ven LKML-Reference: <20090623153811.GA4641@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/time/timer_stats.c | 16 ++++++++-------- kernel/timer.c | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c994530d166..4cde8b9c716 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex); /* * Collection status, active/inactive: */ -static int __read_mostly active; +int __read_mostly timer_stats_active; /* * Beginning/end timestamps of measurement: @@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, struct entry *entry, input; unsigned long flags; - if (likely(!active)) + if (likely(!timer_stats_active)) return; lock = &per_cpu(lookup_lock, raw_smp_processor_id()); @@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.timer_flag = timer_flag; spin_lock_irqsave(lock, flags); - if (!active) + if (!timer_stats_active) goto out_unlock; entry = tstat_lookup(&input, comm); @@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v) /* * If still active then calculate up to now: */ - if (active) + if (timer_stats_active) time_stop = ktime_get(); time = ktime_sub(time_stop, time_start); @@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf, mutex_lock(&show_mutex); switch (ctl[0]) { case '0': - if (active) { - active = 0; + if (timer_stats_active) { + timer_stats_active = 0; time_stop = ktime_get(); sync_access(); } break; case '1': - if (!active) { + if (!timer_stats_active) { reset_entries(); time_start = ktime_get(); smp_mb(); - active = 1; + timer_stats_active = 1; } break; default: diff --git a/kernel/timer.c b/kernel/timer.c index 54d3912f8ca..0b36b9e5cc8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -380,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer) { unsigned int flag = 0; + if (likely(!timer->start_site)) + return; if (unlikely(tbase_get_deferrable(timer->base))) flag |= TIMER_STATS_FLAG_DEFERRABLE; -- cgit v1.2.3-18-g5258 From 9d612beff5089b89a295a2331883a8ce3fff08c1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Jun 2009 17:33:15 +0800 Subject: tracing: Fix trace_buf_size boot option We should be able to specify [KMG] when setting trace_buf_size boot option, as documented in kernel-parameters.txt Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A41F2DB.4020102@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3bb31006b5c..3aa0a0dfdfa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -284,13 +284,12 @@ void trace_wake_up(void) static int __init set_buf_size(char *str) { unsigned long buf_size; - int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &buf_size); + buf_size = memparse(str, &str); /* nr_entries can not be zero */ - if (ret < 0 || buf_size == 0) + if (buf_size == 0) return 0; trace_buf_size = buf_size; return 1; -- cgit v1.2.3-18-g5258 From f6faac71d502be1c29c81b2f45657662c3b84470 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2009 17:24:40 -0700 Subject: rcu: Mark Hierarchical RCU no longer experimental Removes the warnings about Hierarchical RCU being experimental, given that it has gone through almost six months of being the default RCU in mainline for the x86 with very little trouble. This makes hierarchical-RCU bootup look less scary. Signed-off-by: Paul E. McKenney Cc: akpm@linux-foundation.org Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: dipankar@in.ibm.com Cc: dhowells@redhat.com Cc: lethal@linux-sh.org Cc: kernel@wantstofly.org Cc: cl@linux-foundation.org Cc: schamp@sgi.com Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0dccfbba6d2..7717b95c202 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1533,7 +1533,7 @@ void __init __rcu_init(void) int j; struct rcu_node *rnp; - printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); + printk(KERN_INFO "Hierarchical RCU implementation.\n"); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ @@ -1546,7 +1546,6 @@ void __init __rcu_init(void) rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); /* Register notifier for non-boot CPUs */ register_cpu_notifier(&rcu_nb); - printk(KERN_WARNING "Experimental hierarchical RCU init done.\n"); } module_param(blimit, int, 0); -- cgit v1.2.3-18-g5258 From 00e54d087afb3867b0b461aef6c1ff433d0df564 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 25 Jun 2009 14:05:27 +0800 Subject: ftrace: Remove duplicate newline Before: # echo 'sys_open:traceon:' > set_ftrace_filter # echo 'sys_close:traceoff:5' > set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:traceon:unlimited sys_close:traceoff:count=0 After: # cat set_ftrace_filter #### all functions enabled #### sys_open:traceon:unlimited sys_close:traceoff:count=0 Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A4313A7.7030105@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 90f13476483..7402144bff2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -302,8 +302,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, if (count == -1) seq_printf(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld", count); - seq_putc(m, '\n'); + seq_printf(m, ":count=%ld\n", count); return 0; } -- cgit v1.2.3-18-g5258 From 1155de47cd66d0c496d5a6fb2223e980ef1285b2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 25 Jun 2009 14:30:12 +0900 Subject: ring-buffer: Make it generally available In hunting down the cause for the hwlat_detector ring buffer spew in my failed -next builds it became obvious that folks are now treating ring_buffer as something that is generic independent of tracing and thus, suitable for public driver consumption. Given that there are only a few minor areas in ring_buffer that have any reliance on CONFIG_TRACING or CONFIG_FUNCTION_TRACER, provide stubs for those and make it generally available. Signed-off-by: Paul Mundt Cc: Jon Masters Cc: Steven Rostedt LKML-Reference: <20090625053012.GB19944@linux-sh.org> Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/trace/ring_buffer.c | 11 +++++++++++ kernel/trace/trace.h | 7 +++++++ 3 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0a32cb21ec9..0630e293cd4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04dac263825..bf27bb7a63e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1563,6 +1563,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifdef CONFIG_TRACING + #define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) @@ -1593,6 +1595,13 @@ static void trace_recursive_unlock(void) current->trace_recursion--; } +#else + +#define trace_recursive_lock() (0) +#define trace_recursive_unlock() do { } while (0) + +#endif + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -3104,6 +3113,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +#ifdef CONFIG_TRACING static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3171,6 +3181,7 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); +#endif #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6e735d4771f..3548ae5cc78 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -597,6 +597,7 @@ print_graph_function(struct trace_iterator *iter) extern struct pid *ftrace_pid_trace; +#ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { if (!ftrace_pid_trace) @@ -604,6 +605,12 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +#else +static inline int ftrace_trace_task(struct task_struct *task) +{ + return 1; +} +#endif /* * trace_iterator_flags is an enumeration that defines bit -- cgit v1.2.3-18-g5258 From aa715284b4d28cabde6c25c568d769a6be712bc8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Jun 2009 14:27:58 +0200 Subject: futex: request only one page from get_user_pages() Yanmin noticed that fault_in_user_writeable() requests 4 pages instead of one. That's the result of blindly trusting Linus' proposal :) I even looked up the prototype to verify the correctness: the argument in question is confusingly enough named "len" while in reality it means number of pages. Pointed-out-by: Yanmin Zhang Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1c337112335..794c862125f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -299,7 +299,7 @@ void put_futex_key(int fshared, union futex_key *key) static int fault_in_user_writeable(u32 __user *uaddr) { int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, - sizeof(*uaddr), 1, 0, NULL, NULL); + 1, 1, 0, NULL, NULL); return ret < 0 ? ret : 0; } -- cgit v1.2.3-18-g5258 From 7f8b4e4e0988dadfd22330fd147ad2453e19f510 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 14:34:35 +0200 Subject: perf_counter: Add scale information to the mmap control page Add the needed time scale to the self-profile mmap information. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c2b19c11171..23614adab47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1782,6 +1782,12 @@ void perf_counter_update_userpage(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); + userpg->time_enabled = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + + userpg->time_running = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + barrier(); ++userpg->lock; preempt_enable(); -- cgit v1.2.3-18-g5258 From 194002b274e9169a04beb1b23dcc132159bb566c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2009 16:35:24 +0200 Subject: perf_counter, x86: Add mmap counter read support Update the mmap control page with the needed information to use the userspace RDPMC instruction for self monitoring. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 23614adab47..02994a719e2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1753,6 +1753,14 @@ int perf_counter_task_disable(void) return 0; } +static int perf_counter_index(struct perf_counter *counter) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -1777,7 +1785,7 @@ void perf_counter_update_userpage(struct perf_counter *counter) preempt_disable(); ++userpg->lock; barrier(); - userpg->index = counter->hw.idx; + userpg->index = perf_counter_index(counter); userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); -- cgit v1.2.3-18-g5258 From 38b200d67636a30cb8dc1508137908e7a649b5c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Jun 2009 20:13:11 +0200 Subject: perf_counter: Add PERF_EVENT_READ Provide a read() like event which can be used to log the counter value at specific sites such as child->parent folding on exit. In order to be useful, we log the counter parent ID, not the actual counter ID, since userspace can only relate parent IDs to perf_counter_attr constructs. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 02994a719e2..a72c20e9195 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2623,6 +2623,66 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, perf_output_end(&handle); } +/* + * read event + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 value; + u64 format[3]; +}; + +static void +perf_counter_read_event(struct perf_counter *counter, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event event = { + .header = { + .type = PERF_EVENT_READ, + .misc = 0, + .size = sizeof(event) - sizeof(event.format), + }, + .pid = perf_counter_pid(counter, task), + .tid = perf_counter_tid(counter, task), + .value = atomic64_read(&counter->count), + }; + int ret, i = 0; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + event.header.size += sizeof(u64); + event.format[i++] = counter->total_time_enabled; + } + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + event.header.size += sizeof(u64); + event.format[i++] = counter->total_time_running; + } + + if (counter->attr.read_format & PERF_FORMAT_ID) { + u64 id; + + event.header.size += sizeof(u64); + if (counter->parent) + id = counter->parent->id; + else + id = counter->id; + + event.format[i++] = id; + } + + ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); + if (ret) + return; + + perf_output_copy(&handle, &event, event.header.size); + perf_output_end(&handle); +} + /* * fork tracking */ @@ -3985,10 +4045,13 @@ static int inherit_group(struct perf_counter *parent_counter, } static void sync_child_counter(struct perf_counter *child_counter, - struct perf_counter *parent_counter) + struct task_struct *child) { + struct perf_counter *parent_counter = child_counter->parent; u64 child_val; + perf_counter_read_event(child_counter, child); + child_val = atomic64_read(&child_counter->count); /* @@ -4017,7 +4080,8 @@ static void sync_child_counter(struct perf_counter *child_counter, static void __perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx) + struct perf_counter_context *child_ctx, + struct task_struct *child) { struct perf_counter *parent_counter; @@ -4031,7 +4095,7 @@ __perf_counter_exit_task(struct perf_counter *child_counter, * counters need to be zapped - but otherwise linger. */ if (parent_counter) { - sync_child_counter(child_counter, parent_counter); + sync_child_counter(child_counter, child); free_counter(child_counter); } } @@ -4093,7 +4157,7 @@ void perf_counter_exit_task(struct task_struct *child) again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) - __perf_counter_exit_task(child_counter, child_ctx); + __perf_counter_exit_task(child_counter, child_ctx, child); /* * If the last counter was a group counter, it will have appended all -- cgit v1.2.3-18-g5258 From bfbd3381e63aa2a14c6706afb50ce4630aa0d9a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Jun 2009 21:11:59 +0200 Subject: perf_counter: Implement more accurate per task statistics With the introduction of PERF_EVENT_READ we have the possibility to provide accurate counter values for individual tasks in a task hierarchy. However, due to the lazy context switching used for similar counter contexts our current per task counts are way off. In order to maintain some of the lazy switch benefits we don't disable it out-right, but simply iterate the active counters and flip the values between the contexts. This only reads the counters but does not need to reprogram the full PMU. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a72c20e9195..385ca51c6e6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -236,6 +236,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; + if (counter->attr.inherit_stat) + ctx->nr_stat++; } /* @@ -250,6 +252,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; + if (counter->attr.inherit_stat) + ctx->nr_stat--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -1006,6 +1010,76 @@ static int context_equiv(struct perf_counter_context *ctx1, && !ctx1->pin_count && !ctx2->pin_count; } +static void __perf_counter_read(void *counter); + +static void __perf_counter_sync_stat(struct perf_counter *counter, + struct perf_counter *next_counter) +{ + u64 value; + + if (!counter->attr.inherit_stat) + return; + + /* + * Update the counter value, we cannot use perf_counter_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the counter must be on the current CPU, therefore we + * don't need to use it. + */ + switch (counter->state) { + case PERF_COUNTER_STATE_ACTIVE: + __perf_counter_read(counter); + break; + + case PERF_COUNTER_STATE_INACTIVE: + update_counter_times(counter); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the counter + * values when we flip the contexts. + */ + value = atomic64_read(&next_counter->count); + value = atomic64_xchg(&counter->count, value); + atomic64_set(&next_counter->count, value); + + /* + * XXX also sync time_enabled and time_running ? + */ +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_counter_sync_stat(struct perf_counter_context *ctx, + struct perf_counter_context *next_ctx) +{ + struct perf_counter *counter, *next_counter; + + if (!ctx->nr_stat) + return; + + counter = list_first_entry(&ctx->event_list, + struct perf_counter, event_entry); + + next_counter = list_first_entry(&next_ctx->event_list, + struct perf_counter, event_entry); + + while (&counter->event_entry != &ctx->event_list && + &next_counter->event_entry != &next_ctx->event_list) { + + __perf_counter_sync_stat(counter, next_counter); + + counter = list_next_entry(counter, event_entry); + next_counter = list_next_entry(counter, event_entry); + } +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -1061,6 +1135,8 @@ void perf_counter_task_sched_out(struct task_struct *task, ctx->task = next; next_ctx->task = task; do_switch = 0; + + perf_counter_sync_stat(ctx, next_ctx); } spin_unlock(&next_ctx->lock); spin_unlock(&ctx->lock); @@ -1350,7 +1426,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Cross CPU call to read the hardware counter */ -static void __read(void *info) +static void __perf_counter_read(void *info) { struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; @@ -1372,7 +1448,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __read, counter, 1); + __perf_counter_read, counter, 1); } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); } @@ -4050,7 +4126,8 @@ static void sync_child_counter(struct perf_counter *child_counter, struct perf_counter *parent_counter = child_counter->parent; u64 child_val; - perf_counter_read_event(child_counter, child); + if (child_counter->attr.inherit_stat) + perf_counter_read_event(child_counter, child); child_val = atomic64_read(&child_counter->count); -- cgit v1.2.3-18-g5258 From e6e18ec79b023d5fe84226cef533cf0e3770ce93 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jun 2009 11:27:12 +0200 Subject: perf_counter: Rework the sample ABI The PERF_EVENT_READ implementation made me realize we don't actually need the sample_type int the output sample, since we already have that in the perf_counter_attr information. Therefore, remove the PERF_EVENT_MISC_OVERFLOW bit and the event->type overloading, and imply put counter overflow samples in a PERF_EVENT_SAMPLE type. This also fixes the issue that event->type was only 32-bit and sample_type had 64 usable bits. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 385ca51c6e6..f2f23269658 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2575,15 +2575,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u32 cpu, reserved; } cpu_entry; - header.type = 0; + header.type = PERF_EVENT_SAMPLE; header.size = sizeof(header); - header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc = 0; header.misc |= perf_misc_flags(data->regs); if (sample_type & PERF_SAMPLE_IP) { ip = perf_instruction_pointer(data->regs); - header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } @@ -2592,7 +2591,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, tid_entry.pid = perf_counter_pid(counter, current); tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_SAMPLE_TID; header.size += sizeof(tid_entry); } @@ -2602,34 +2600,25 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, */ time = sched_clock(); - header.type |= PERF_SAMPLE_TIME; header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_ADDR) { - header.type |= PERF_SAMPLE_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) header.size += sizeof(u64); - } - if (sample_type & PERF_SAMPLE_ID) { - header.type |= PERF_SAMPLE_ID; + if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); - } if (sample_type & PERF_SAMPLE_CPU) { - header.type |= PERF_SAMPLE_CPU; header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); } - if (sample_type & PERF_SAMPLE_PERIOD) { - header.type |= PERF_SAMPLE_PERIOD; + if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - } if (sample_type & PERF_SAMPLE_GROUP) { - header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } @@ -2639,10 +2628,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - - header.type |= PERF_SAMPLE_CALLCHAIN; header.size += callchain_size; - } + } else + header.size += sizeof(u64); } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2693,8 +2681,14 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + else { + u64 nr = 0; + perf_output_put(&handle, nr); + } + } perf_output_end(&handle); } -- cgit v1.2.3-18-g5258 From 5211a242d0cbdded372aee59da18f80552b0a80a Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Wed, 24 Jun 2009 14:32:11 -0700 Subject: x86: Add sysctl to allow panic on IOCK NMI error This patch introduces a new sysctl: /proc/sys/kernel/panic_on_io_nmi which defaults to 0 (off). When enabled, the kernel panics when the kernel receives an NMI caused by an IO error. The IO error triggered NMI indicates a serious system condition, which could result in IO data corruption. Rather than contiuing, panicing and dumping might be a better choice, so one can figure out what's causing the IO error. This could be especially important to companies running IO intensive applications where corruption must be avoided, e.g. a bank's databases. [ SuSE has been shipping it for a while, it was done at the request of a large database vendor, for their users. ] Signed-off-by: Kurt Garloff Signed-off-by: Roberto Angelino Signed-off-by: Greg Kroah-Hartman Cc: "Eric W. Biederman" LKML-Reference: <20090624213211.GA11291@kroah.com> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62e4ff9968b..fba42eda8de 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", -- cgit v1.2.3-18-g5258 From 0296e4254f3318e0dcad9706fa1daf8e5addc1e9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 26 Jun 2009 11:15:37 +0800 Subject: ftrace: Fix the output of profile The first entry of the ftrace profile was always skipped when reading trace_stat/functionX. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A443D59.4080307@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 71a52c17214..f3716bf04df 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -291,7 +291,9 @@ function_stat_next(void *v, int idx) pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); again: - rec++; + if (idx != 0) + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { pg = pg->next; if (!pg) -- cgit v1.2.3-18-g5258 From 19d2e755436054dfc2be640bffc32e427c37ac3d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Jun 2009 13:10:23 +0200 Subject: perf_counter: Complete counter swap Complete the counter swap by indeed switching the times too and updating the userpage after modifying the counter values. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1246014623.31755.195.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f2f23269658..66ab1e9d129 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1048,9 +1048,14 @@ static void __perf_counter_sync_stat(struct perf_counter *counter, value = atomic64_xchg(&counter->count, value); atomic64_set(&next_counter->count, value); + swap(counter->total_time_enabled, next_counter->total_time_enabled); + swap(counter->total_time_running, next_counter->total_time_running); + /* - * XXX also sync time_enabled and time_running ? + * Since we swizzled the values, update the user visible data too. */ + perf_counter_update_userpage(counter); + perf_counter_update_userpage(next_counter); } #define list_next_entry(pos, member) \ -- cgit v1.2.3-18-g5258 From a32c7765e2796395aec49f699bd25c407155e9c5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 26 Jun 2009 16:55:51 +0800 Subject: tracing: Fix stack tracer sysctl handling This made my machine completely frozen: # echo 1 > /proc/sys/kernel/stack_tracer_enabled # echo 2 > /proc/sys/kernel/stack_tracer_enabled The cause is register_ftrace_function() was called twice. Also fix ftrace_enabled sysctl, though seems nothing bad happened as I tested it. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A448D17.9010305@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace_stack.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3716bf04df..bce9e01a29c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3160,10 +3160,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ret = proc_dointvec(table, write, file, buffer, lenp, ppos); - if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = ftrace_enabled; + last_ftrace_enabled = !!ftrace_enabled; if (ftrace_enabled) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2d7aebd71db..e644af91012 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -326,10 +326,10 @@ stack_trace_sysctl(struct ctl_table *table, int write, ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || - (last_stack_tracer_enabled == stack_tracer_enabled)) + (last_stack_tracer_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = stack_tracer_enabled; + last_stack_tracer_enabled = !!stack_tracer_enabled; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); -- cgit v1.2.3-18-g5258 From 82d5308127c3e3404ffbf41e503853c68660b18b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 26 Jun 2009 17:07:02 +0800 Subject: trace_export: Repair missed fields Some fields for struct ftrace_graph_ret are missed when they are exported to user. Signed-off-by: Lai Jiangshan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A448FB6.5000302@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5e32e375134..6db005e1248 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -26,6 +26,9 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, ftrace_graph_ret_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ret.func, func) + TRACE_FIELD(unsigned long long, ret.calltime, calltime) + TRACE_FIELD(unsigned long long, ret.rettime, rettime) + TRACE_FIELD(unsigned long, ret.overrun, overrun) TRACE_FIELD(int, ret.depth, depth) ), TP_RAW_FMT("<-- %lx (%d)") -- cgit v1.2.3-18-g5258 From 238a24f626628cb16a3015f332d649f08246ca89 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 29 Jun 2009 15:55:10 +0800 Subject: tracing/fastboot: Document the need of initcall_debug To use boot tracer, one should pass initcall_debug as well as ftrace=initcall to the command line. Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <4A48735E.9050002@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1551f47e766..019f380fd76 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -226,13 +226,13 @@ config BOOT_TRACER the timings of the initcalls and traces key events and the identity of tasks that can cause boot delays, such as context-switches. - Its aim is to be parsed by the /scripts/bootgraph.pl tool to + Its aim is to be parsed by the scripts/bootgraph.pl tool to produce pretty graphics about boot inefficiencies, giving a visual representation of the delays during initcalls - but the raw /debug/tracing/trace text output is readable too. - You must pass in ftrace=initcall to the kernel command line - to enable this on bootup. + You must pass in initcall_debug and ftrace=initcall to the kernel + command line to enable this on bootup. config TRACE_BRANCH_PROFILING bool -- cgit v1.2.3-18-g5258 From 12de38b186c2af97bf0b4a1f907f766df46b1def Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 29 Jun 2009 17:13:55 +0100 Subject: kmemleak: Inform kmemleak about pid_hash Kmemleak does not track alloc_bootmem calls but the pid_hash allocated in pidhash_init() would need to be scanned as it contains pointers to struct pid objects. Signed-off-by: Catalin Marinas --- kernel/pid.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5d3f5..5fa1db48d8b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -512,6 +513,12 @@ void __init pidhash_init(void) pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); if (!pid_hash) panic("Could not alloc pidhash!\n"); + /* + * pid_hash contains references to allocated struct pid objects and it + * must be scanned by kmemleak to avoid false positives. + */ + kmemleak_alloc(pid_hash, pidhash_size * sizeof(*(pid_hash)), 0, + GFP_KERNEL); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } -- cgit v1.2.3-18-g5258 From 57e7986ed142417498155ebcd5eaf617ac37136d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 30 Jun 2009 16:07:19 +1000 Subject: perf_counter: Provide a way to enable counters on exec This provides a way to mark a counter to be enabled on the next exec. This is useful for measuring the total activity of a program without including overhead from the process that launches it. This also changes the perf stat command to use this new facility. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <19017.43927.838745.689203@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 66ab1e9d129..d55a50da234 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1428,6 +1428,53 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } +/* + * Enable all of a task's counters that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_counter_enable_on_exec(struct task_struct *task) +{ + struct perf_counter_context *ctx; + struct perf_counter *counter; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_counter_ctxp; + if (!ctx || !ctx->nr_counters) + goto out; + + __perf_counter_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (!counter->attr.enable_on_exec) + continue; + counter->attr.enable_on_exec = 0; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + continue; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; + enabled = 1; + } + + /* + * Unclone this context if we enabled any counter. + */ + if (enabled && ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + /* * Cross CPU call to read the hardware counter */ @@ -2949,6 +2996,9 @@ void perf_counter_comm(struct task_struct *task) { struct perf_comm_event comm_event; + if (task->perf_counter_ctxp) + perf_counter_enable_on_exec(task); + if (!atomic_read(&nr_comm_counters)) return; -- cgit v1.2.3-18-g5258 From 8bc1ad7dd301b7ca7454013519fa92e8c53655ff Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 30 Jun 2009 11:41:31 -0700 Subject: kernel/resource.c: fix sign extension in reserve_setup() When the 32-bit signed quantities get assigned to the u64 resource_size_t, they are incorrectly sign-extended. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13253 Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9905 Signed-off-by: Zhang Rui Reported-by: Leann Ogasawara Cc: Pierre Ossman Reported-by: Tested-by: Cc: Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds