From 459ec28ab404d7afcd512ce9b855959ad301605a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Sep 2009 17:33:44 +0200 Subject: perf_counter: Allow mmap if paranoid checks are turned off Before: $ perf sched record -f sleep 1 Error: failed to mmap with 1 (Operation not permitted) After: $ perf sched record -f sleep 1 [ perf record: Captured and wrote 0.095 MB perf.data (~4161 samples) ] Note, this is only allowed if perfcounter_paranoid is set to the most permissive (non-default) value of -1. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e0d91fdf0c3..667ab25ad3d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2315,7 +2315,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) lock_limit >>= PAGE_SHIFT; locked = vma->vm_mm->locked_vm + extra; - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; } -- cgit v1.2.3-70-g09d2 From f977bb4937857994312fff4f9c2cad336a36a932 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Sep 2009 18:15:54 +0200 Subject: perf_counter, sched: Add sched_stat_runtime tracepoint This allows more precise tracking of how the scheduler accounts (and acts upon) a task having spent N nanoseconds of CPU time. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 33 +++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 1 + 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index b48f1ad7c94..4069c43f418 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -379,6 +379,39 @@ TRACE_EVENT(sched_stat_wait, (unsigned long long)__entry->delay) ); +/* + * Tracepoint for accounting runtime (time the task is executing + * on a CPU). + */ +TRACE_EVENT(sched_stat_runtime, + + TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime), + + TP_ARGS(tsk, runtime, vruntime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, runtime ) + __field( u64, vruntime ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->runtime = runtime; + __entry->vruntime = vruntime; + ) + TP_perf_assign( + __perf_count(runtime); + ), + + TP_printk("task: %s:%d runtime: %Lu [ns], vruntime: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->runtime, + (unsigned long long)__entry->vruntime) +); + /* * Tracepoint for accounting sleep time (time the task is not runnable, * including iowait, see below). diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aa7f8412101..a097e909e80 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } -- cgit v1.2.3-70-g09d2 From 850bc73ffcc99cddfb52bc23217c60810c508853 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Sep 2009 18:47:11 +0200 Subject: perf_counter: Do not throttle single swcounter events We can have swcounter events that contribute more than a single count per event, when used with a non-zero period, those can generate multiple events, which is when we need throttling. However, swcounter that contribute only a single count per event can only come as fast as we can run code, hence don't throttle them. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 667ab25ad3d..fe0d1adde80 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3494,14 +3494,15 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) * Generic counter overflow handling, sampling. */ -int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) +static int __perf_counter_overflow(struct perf_counter *counter, int nmi, + int throttle, struct perf_sample_data *data) { int events = atomic_read(&counter->event_limit); - int throttle = counter->pmu->unthrottle != NULL; struct hw_perf_counter *hwc = &counter->hw; int ret = 0; + throttle = (throttle && counter->pmu->unthrottle != NULL); + if (!throttle) { hwc->interrupts++; } else { @@ -3554,6 +3555,12 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, return ret; } +int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) +{ + return __perf_counter_overflow(counter, nmi, 1, data); +} + /* * Generic software counter infrastructure */ @@ -3592,6 +3599,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { struct hw_perf_counter *hwc = &counter->hw; + int throttle = 0; u64 overflow; data->period = counter->hw.last_period; @@ -3601,13 +3609,14 @@ static void perf_swcounter_overflow(struct perf_counter *counter, return; for (; overflow; overflow--) { - if (perf_counter_overflow(counter, nmi, data)) { + if (__perf_counter_overflow(counter, nmi, throttle, data)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. */ break; } + throttle = 0; } } -- cgit v1.2.3-70-g09d2 From 2667de81f3256c944b06abdf2c56c2f192fcb724 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Sep 2009 19:01:10 +0200 Subject: perf_counter: Allow for a wakeup watermark Currently we wake the mmap() consumer once every PAGE_SIZE of data and/or once event wakeup_events when specified. For high speed sampling this results in too many wakeups wrt. the buffer size, hence change this. We move the default wakeup limit to 1/4-th the buffer size, and provide for means to manually specify this limit. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++++-- kernel/perf_counter.c | 32 +++++++++++++++++++------------- 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 972f90d7a32..6c1ef72ea50 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -199,10 +199,14 @@ struct perf_counter_attr { inherit_stat : 1, /* per task counts */ enable_on_exec : 1, /* next exec enables */ task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ - __reserved_1 : 50; + __reserved_1 : 49; - __u32 wakeup_events; /* wakeup every n events */ + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; __u32 __reserved_2; __u64 __reserved_3; @@ -521,6 +525,8 @@ struct perf_mmap_data { atomic_t wakeup; /* needs a wakeup */ atomic_t lost; /* nr records lost */ + long watermark; /* wakeup watermark */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fe0d1adde80..29b73b6e814 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2176,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) data->nr_pages = nr_pages; atomic_set(&data->lock, -1); + if (counter->attr.watermark) { + data->watermark = min_t(long, PAGE_SIZE * nr_pages, + counter->attr.wakeup_watermark); + } + if (!data->watermark) + data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); + rcu_assign_pointer(counter->data, data); return 0; @@ -2517,23 +2524,15 @@ struct perf_output_handle { unsigned long flags; }; -static bool perf_output_space(struct perf_mmap_data *data, - unsigned int offset, unsigned int head) +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, + unsigned long offset, unsigned long head) { - unsigned long tail; unsigned long mask; if (!data->writable) return true; mask = (data->nr_pages << PAGE_SHIFT) - 1; - /* - * Userspace could choose to issue a mb() before updating the tail - * pointer. So that all reads will be completed before the write is - * issued. - */ - tail = ACCESS_ONCE(data->user_page->data_tail); - smp_rmb(); offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2679,7 +2678,7 @@ static int perf_output_begin(struct perf_output_handle *handle, { struct perf_counter *output_counter; struct perf_mmap_data *data; - unsigned int offset, head; + unsigned long tail, offset, head; int have_lost; struct { struct perf_event_header header; @@ -2717,16 +2716,23 @@ static int perf_output_begin(struct perf_output_handle *handle, perf_output_lock(handle); do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); offset = head = atomic_long_read(&data->head); head += size; - if (unlikely(!perf_output_space(data, offset, head))) + if (unlikely(!perf_output_space(data, tail, offset, head))) goto fail; } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; handle->head = head; - if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) + if (head - tail > data->watermark) atomic_set(&data->wakeup, 1); if (have_lost) { -- cgit v1.2.3-70-g09d2 From 5622f295b53fb60dbf9bed3e2c89d182490a8b7f Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 15 Sep 2009 13:00:23 +0200 Subject: x86, perf_counter, bts: Optimize BTS overflow handling Draining the BTS buffer on a buffer overflow interrupt takes too long resulting in a kernel lockup when tracing the kernel. Restructure perf_counter sampling into sample creation and sample output. Prepare a single reference sample for BTS sampling and update the from and to address fields when draining the BTS buffer. Drain the entire BTS buffer between a single perf_output_begin() / perf_output_end() pair. Signed-off-by: Markus Metzger Signed-off-by: Peter Zijlstra LKML-Reference: <20090915130023.A16204@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 60 ++++--- include/linux/perf_counter.h | 68 +++++++- kernel/perf_counter.c | 312 ++++++++++++++++++++----------------- 3 files changed, 266 insertions(+), 174 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f9cd0849bd4..6a0e71b3812 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -36,10 +36,10 @@ static u64 perf_counter_mask __read_mostly; #define BTS_RECORD_SIZE 24 /* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) /* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) /* @@ -1488,8 +1488,7 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, - struct perf_sample_data *data) +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) { struct debug_store *ds = cpuc->ds; struct bts_record { @@ -1498,8 +1497,11 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, u64 flags; }; struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - unsigned long orig_ip = data->regs->ip; struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; if (!counter) return; @@ -1510,19 +1512,38 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; top = (struct bts_record *)(unsigned long)ds->bts_index; + if (top <= at) + return; + ds->bts_index = ds->bts_buffer_base; + + data.period = counter->hw.last_period; + data.addr = 0; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, counter, ®s); + + if (perf_output_begin(&handle, counter, + header.size * (top - at), 1, 1)) + return; + for (; at < top; at++) { - data->regs->ip = at->from; - data->addr = at->to; + data.ip = at->from; + data.addr = at->to; - perf_counter_output(counter, 1, data); + perf_output_sample(&handle, &header, &data, counter); } - data->regs->ip = orig_ip; - data->addr = 0; + perf_output_end(&handle); /* There's new data available. */ + counter->hw.interrupts++; counter->pending_kill = POLL_IN; } @@ -1552,13 +1573,9 @@ static void x86_pmu_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - struct perf_sample_data data; - struct pt_regs regs; + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) + intel_pmu_drain_bts_buffer(cpuc); - data.regs = ®s; - intel_pmu_drain_bts_buffer(cpuc, &data); - } cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); @@ -1619,7 +1636,6 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1644,7 +1660,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) + if (perf_counter_overflow(counter, 1, &data, regs)) p6_pmu_disable_counter(hwc, idx); } @@ -1665,13 +1681,12 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 ack, status; - data.regs = regs; data.addr = 0; cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); - intel_pmu_drain_bts_buffer(cpuc, &data); + intel_pmu_drain_bts_buffer(cpuc); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1702,7 +1717,7 @@ again: data.period = counter->hw.last_period; - if (perf_counter_overflow(counter, 1, &data)) + if (perf_counter_overflow(counter, 1, &data, regs)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -1729,7 +1744,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1754,7 +1768,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) + if (perf_counter_overflow(counter, 1, &data, regs)) amd_pmu_disable_counter(hwc, idx); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6c1ef72ea50..c7375f97aa1 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -691,6 +691,17 @@ struct perf_cpu_context { int recursion[4]; }; +struct perf_output_handle { + struct perf_counter *counter; + struct perf_mmap_data *data; + unsigned long head; + unsigned long offset; + int nmi; + int sample; + int locked; + unsigned long flags; +}; + #ifdef CONFIG_PERF_COUNTERS /* @@ -722,16 +733,38 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; + u64 type; + + u64 ip; + struct { + u32 pid; + u32 tid; + } tid_entry; + u64 time; u64 addr; + u64 id; + u64 stream_id; + struct { + u32 cpu; + u32 reserved; + } cpu_entry; u64 period; + struct perf_callchain_entry *callchain; struct perf_raw_record *raw; }; +extern void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter); +extern void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter, + struct pt_regs *regs); + extern int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data); -extern void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data); + struct perf_sample_data *data, + struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter @@ -781,6 +814,12 @@ extern void perf_tpcounter_event(int event_id, u64 addr, u64 count, #define perf_instruction_pointer(regs) instruction_pointer(regs) #endif +extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_counter *counter, unsigned int size, + int nmi, int sample); +extern void perf_output_end(struct perf_output_handle *handle); +extern void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len); #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -807,7 +846,28 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma) { } static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } + +static inline int +perf_output_begin(struct perf_output_handle *handle, struct perf_counter *c, + unsigned int size, int nmi, int sample) { } +static inline void perf_output_end(struct perf_output_handle *handle) { } +static inline void +perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) { } +static inline void +perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter) { } +static inline void +perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter, + struct pt_regs *regs) { } #endif +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + #endif /* __KERNEL__ */ #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 29b73b6e814..215845243a6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2512,18 +2512,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) /* * Output */ - -struct perf_output_handle { - struct perf_counter *counter; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; -}; - static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, unsigned long offset, unsigned long head) { @@ -2633,8 +2621,8 @@ out: local_irq_restore(handle->flags); } -static void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) { unsigned int pages_mask; unsigned int offset; @@ -2669,12 +2657,9 @@ static void perf_output_copy(struct perf_output_handle *handle, WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - -static int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample) +int perf_output_begin(struct perf_output_handle *handle, + struct perf_counter *counter, unsigned int size, + int nmi, int sample) { struct perf_counter *output_counter; struct perf_mmap_data *data; @@ -2756,7 +2741,7 @@ out: return -ENOSPC; } -static void perf_output_end(struct perf_output_handle *handle) +void perf_output_end(struct perf_output_handle *handle) { struct perf_counter *counter = handle->counter; struct perf_mmap_data *data = handle->data; @@ -2870,82 +2855,151 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, counter); } -void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, counter); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter, + struct pt_regs *regs) { - int ret; u64 sample_type = counter->attr.sample_type; - struct perf_output_handle handle; - struct perf_event_header header; - u64 ip; - struct { - u32 pid, tid; - } tid_entry; - struct perf_callchain_entry *callchain = NULL; - int callchain_size = 0; - u64 time; - struct { - u32 cpu, reserved; - } cpu_entry; - header.type = PERF_EVENT_SAMPLE; - header.size = sizeof(header); + data->type = sample_type; - header.misc = 0; - header.misc |= perf_misc_flags(data->regs); + header->type = PERF_EVENT_SAMPLE; + header->size = sizeof(*header); + + header->misc = 0; + header->misc |= perf_misc_flags(regs); if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(data->regs); - header.size += sizeof(ip); + data->ip = perf_instruction_pointer(regs); + + header->size += sizeof(data->ip); } if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ - tid_entry.pid = perf_counter_pid(counter, current); - tid_entry.tid = perf_counter_tid(counter, current); + data->tid_entry.pid = perf_counter_pid(counter, current); + data->tid_entry.tid = perf_counter_tid(counter, current); - header.size += sizeof(tid_entry); + header->size += sizeof(data->tid_entry); } if (sample_type & PERF_SAMPLE_TIME) { /* * Maybe do better on x86 and provide cpu_clock_nmi() */ - time = sched_clock(); + data->time = sched_clock(); - header.size += sizeof(u64); + header->size += sizeof(data->time); } if (sample_type & PERF_SAMPLE_ADDR) - header.size += sizeof(u64); + header->size += sizeof(data->addr); - if (sample_type & PERF_SAMPLE_ID) - header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_ID) { + data->id = primary_counter_id(counter); - if (sample_type & PERF_SAMPLE_STREAM_ID) - header.size += sizeof(u64); + header->size += sizeof(data->id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = counter->id; + + header->size += sizeof(data->stream_id); + } if (sample_type & PERF_SAMPLE_CPU) { - header.size += sizeof(cpu_entry); + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; - cpu_entry.cpu = raw_smp_processor_id(); - cpu_entry.reserved = 0; + header->size += sizeof(data->cpu_entry); } if (sample_type & PERF_SAMPLE_PERIOD) - header.size += sizeof(u64); + header->size += sizeof(data->period); if (sample_type & PERF_SAMPLE_READ) - header.size += perf_counter_read_size(counter); + header->size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(data->regs); + int size = 1; - if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); - header.size += callchain_size; - } else - header.size += sizeof(u64); + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); } if (sample_type & PERF_SAMPLE_RAW) { @@ -2957,69 +3011,23 @@ void perf_counter_output(struct perf_counter *counter, int nmi, size += sizeof(u32); WARN_ON_ONCE(size & (sizeof(u64)-1)); - header.size += size; - } - - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); - if (ret) - return; - - perf_output_put(&handle, header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(&handle, ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(&handle, tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(&handle, time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - u64 id = primary_counter_id(counter); - - perf_output_put(&handle, id); + header->size += size; } +} - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(&handle, counter->id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(&handle, cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, data->period); +static void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(&handle, counter); + perf_prepare_sample(&header, data, counter, regs); - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); - else { - u64 nr = 0; - perf_output_put(&handle, nr); - } - } + if (perf_output_begin(&handle, counter, header.size, nmi, 1)) + return; - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(&handle, data->raw->size); - perf_output_copy(&handle, data->raw->data, data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(&handle, raw); - } - } + perf_output_sample(&handle, &header, data, counter); perf_output_end(&handle); } @@ -3501,7 +3509,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) */ static int __perf_counter_overflow(struct perf_counter *counter, int nmi, - int throttle, struct perf_sample_data *data) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&counter->event_limit); struct hw_perf_counter *hwc = &counter->hw; @@ -3557,14 +3566,15 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, data); + perf_counter_output(counter, nmi, data, regs); return ret; } int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) + struct perf_sample_data *data, + struct pt_regs *regs) { - return __perf_counter_overflow(counter, nmi, 1, data); + return __perf_counter_overflow(counter, nmi, 1, data, regs); } /* @@ -3602,7 +3612,8 @@ again: } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) { struct hw_perf_counter *hwc = &counter->hw; int throttle = 0; @@ -3615,7 +3626,8 @@ static void perf_swcounter_overflow(struct perf_counter *counter, return; for (; overflow; overflow--) { - if (__perf_counter_overflow(counter, nmi, throttle, data)) { + if (__perf_counter_overflow(counter, nmi, throttle, + data, regs)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. @@ -3634,7 +3646,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter) } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) { struct hw_perf_counter *hwc = &counter->hw; @@ -3643,11 +3656,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, if (!hwc->sample_period) return; - if (!data->regs) + if (!regs) return; if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data); + perf_swcounter_overflow(counter, nmi, data, regs); } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3706,7 +3719,8 @@ static int perf_swcounter_match(struct perf_counter *counter, static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, u32 event, u64 nr, int nmi, - struct perf_sample_data *data) + struct perf_sample_data *data, + struct pt_regs *regs) { struct perf_counter *counter; @@ -3715,8 +3729,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, data->regs)) - perf_swcounter_add(counter, nr, nmi, data); + if (perf_swcounter_match(counter, type, event, regs)) + perf_swcounter_add(counter, nr, nmi, data, regs); } rcu_read_unlock(); } @@ -3737,7 +3751,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) static void do_perf_swcounter_event(enum perf_type_id type, u32 event, u64 nr, int nmi, - struct perf_sample_data *data) + struct perf_sample_data *data, + struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -3750,7 +3765,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event, barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, data); + nr, nmi, data, regs); rcu_read_lock(); /* * doesn't really matter which of the child contexts the @@ -3758,7 +3773,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event, */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); + perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs); rcu_read_unlock(); barrier(); @@ -3772,11 +3787,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { - .regs = regs, .addr = addr, }; - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); + do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, + &data, regs); } static void perf_swcounter_read(struct perf_counter *counter) @@ -3813,6 +3828,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; + struct pt_regs *regs; struct perf_counter *counter; u64 period; @@ -3820,17 +3836,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) counter->pmu->read(counter); data.addr = 0; - data.regs = get_irq_regs(); + regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. */ - if ((counter->attr.exclude_kernel || !data.regs) && + if ((counter->attr.exclude_kernel || !regs) && !counter->attr.exclude_user) - data.regs = task_pt_regs(current); + regs = task_pt_regs(current); - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) + if (regs) { + if (perf_counter_overflow(counter, 0, &data, regs)) ret = HRTIMER_NORESTART; } @@ -3966,15 +3982,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, }; struct perf_sample_data data = { - .regs = get_irq_regs(), .addr = addr, .raw = &raw, }; - if (!data.regs) - data.regs = task_pt_regs(current); + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.2.3-70-g09d2 From cf450a7355a116af793998c118a6bcf7f5a8367e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Sep 2009 12:18:14 +0200 Subject: perf_counter: Fix up swcounter throttling /me dons the brown paper bag. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 215845243a6..6944bd55ec4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3634,7 +3634,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter, */ break; } - throttle = 0; + throttle = 1; } } -- cgit v1.2.3-70-g09d2 From def0a9b2573e00ab0b486cb5382625203ab4c4a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Sep 2009 20:14:01 +0200 Subject: sched_clock: Make it NMI safe Arjan complained about the suckyness of TSC on modern machines, and asked if we could do something about that for PERF_SAMPLE_TIME. Make cpu_clock() NMI safe by removing the spinlock and using cmpxchg. This also makes it smaller and more robust. Affects architectures that use HAVE_UNSTABLE_SCHED_CLOCK, i.e. IA64 and x86. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 9 ++-- kernel/sched_clock.c | 122 ++++++++++++++++++++++---------------------------- 2 files changed, 56 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 6944bd55ec4..06d233a06da 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2955,10 +2955,7 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_TIME) { - /* - * Maybe do better on x86 and provide cpu_clock_nmi() - */ - data->time = sched_clock(); + data->time = perf_clock(); header->size += sizeof(data->time); } @@ -3488,7 +3485,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), + .time = perf_clock(), .id = primary_counter_id(counter), .stream_id = counter->id, }; @@ -3540,7 +3537,7 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi, } if (counter->attr.freq) { - u64 now = sched_clock(); + u64 now = perf_clock(); s64 delta = now - hwc->freq_stamp; hwc->freq_stamp = now; diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e1d16c9a768..ac2e1dc708b 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running; __read_mostly int sched_clock_stable; struct sched_clock_data { - /* - * Raw spinlock - this is a special case: this might be called - * from within instrumentation code so we dont want to do any - * instrumentation ourselves. - */ - raw_spinlock_t lock; - u64 tick_raw; u64 tick_gtod; u64 clock; @@ -80,7 +73,6 @@ void sched_clock_init(void) for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); - scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; @@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) +static u64 sched_clock_local(struct sched_clock_data *scd) { - s64 delta = now - scd->tick_raw; - u64 clock, min_clock, max_clock; + u64 now, clock, old_clock, min_clock, max_clock; + s64 delta; +again: + now = sched_clock(); + delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; + old_clock = scd->clock; + /* * scd->clock = clamp(scd->tick_gtod + delta, * max(scd->tick_gtod, scd->clock), @@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) */ clock = scd->tick_gtod + delta; - min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); + min_clock = wrap_max(scd->tick_gtod, old_clock); + max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - scd->clock = clock; + if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) + goto again; - return scd->clock; + return clock; } -static void lock_double_clock(struct sched_clock_data *data1, - struct sched_clock_data *data2) +static u64 sched_clock_remote(struct sched_clock_data *scd) { - if (data1 < data2) { - __raw_spin_lock(&data1->lock); - __raw_spin_lock(&data2->lock); + struct sched_clock_data *my_scd = this_scd(); + u64 this_clock, remote_clock; + u64 *ptr, old_val, val; + + sched_clock_local(my_scd); +again: + this_clock = my_scd->clock; + remote_clock = scd->clock; + + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely((s64)(remote_clock - this_clock) < 0)) { + ptr = &scd->clock; + old_val = remote_clock; + val = this_clock; } else { - __raw_spin_lock(&data2->lock); - __raw_spin_lock(&data1->lock); + /* + * Should be rare, but possible: + */ + ptr = &my_scd->clock; + old_val = this_clock; + val = remote_clock; } + + if (cmpxchg(ptr, old_val, val) != old_val) + goto again; + + return val; } u64 sched_clock_cpu(int cpu) { - u64 now, clock, this_clock, remote_clock; struct sched_clock_data *scd; + u64 clock; + + WARN_ON_ONCE(!irqs_disabled()); if (sched_clock_stable) return sched_clock(); - scd = cpu_sdc(cpu); - - /* - * Normally this is not called in NMI context - but if it is, - * trying to do any locking here is totally lethal. - */ - if (unlikely(in_nmi())) - return scd->clock; - if (unlikely(!sched_clock_running)) return 0ull; - WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); - - if (cpu != raw_smp_processor_id()) { - struct sched_clock_data *my_scd = this_scd(); - - lock_double_clock(scd, my_scd); - - this_clock = __update_sched_clock(my_scd, now); - remote_clock = scd->clock; - - /* - * Use the opportunity that we have both locks - * taken to couple the two clocks: we take the - * larger time as the latest time for both - * runqueues. (this creates monotonic movement) - */ - if (likely((s64)(remote_clock - this_clock) < 0)) { - clock = this_clock; - scd->clock = clock; - } else { - /* - * Should be rare, but possible: - */ - clock = remote_clock; - my_scd->clock = remote_clock; - } - - __raw_spin_unlock(&my_scd->lock); - } else { - __raw_spin_lock(&scd->lock); - clock = __update_sched_clock(scd, now); - } + scd = cpu_sdc(cpu); - __raw_spin_unlock(&scd->lock); + if (cpu != smp_processor_id()) + clock = sched_clock_remote(scd); + else + clock = sched_clock_local(scd); return clock; } @@ -223,11 +209,9 @@ void sched_clock_tick(void) now_gtod = ktime_to_ns(ktime_get()); now = sched_clock(); - __raw_spin_lock(&scd->lock); scd->tick_raw = now; scd->tick_gtod = now_gtod; - __update_sched_clock(scd, now); - __raw_spin_unlock(&scd->lock); + sched_clock_local(scd); } /* -- cgit v1.2.3-70-g09d2 From 393b2ad8c757ba3ccd2a99ca5bbcd6db4d3fa53d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 12 Sep 2009 07:52:47 +0200 Subject: perf: Add a timestamp to fork events perf timechart needs to know when a process forked, in order to be able to visualize properly when tasks start. This patch adds a time field to the event structure, and fills it in appropriately. Signed-off-by: Arjan van de Ven Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: <20090912130341.51ad2de2@infradead.org> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 11 +++++++++-- tools/perf/util/event.h | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c7375f97aa1..bd341007c4f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -336,6 +336,7 @@ enum perf_event_type { * struct perf_event_header header; * u32 pid, ppid; * u32 tid, ptid; + * u64 time; * }; */ PERF_EVENT_EXIT = 4, @@ -356,6 +357,7 @@ enum perf_event_type { * struct perf_event_header header; * u32 pid, ppid; * u32 tid, ptid; + * { u64 time; } && PERF_SAMPLE_TIME * }; */ PERF_EVENT_FORK = 7, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d013f4e89e9..d5899b62b27 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3083,6 +3083,7 @@ struct perf_task_event { u32 ppid; u32 tid; u32 ptid; + u64 time; } event; }; @@ -3090,9 +3091,12 @@ static void perf_counter_task_output(struct perf_counter *counter, struct perf_task_event *task_event) { struct perf_output_handle handle; - int size = task_event->event.header.size; + int size; struct task_struct *task = task_event->task; - int ret = perf_output_begin(&handle, counter, size, 0, 0); + int ret; + + size = task_event->event.header.size; + ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; @@ -3103,7 +3107,10 @@ static void perf_counter_task_output(struct perf_counter *counter, task_event->event.tid = perf_counter_tid(counter, task); task_event->event.ptid = perf_counter_tid(counter, current); + task_event->event.time = perf_clock(); + perf_output_put(&handle, task_event->event); + perf_output_end(&handle); } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 2495529cae7..28a579f8fa8 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -39,6 +39,7 @@ struct fork_event { struct perf_event_header header; u32 pid, ppid; u32 tid, ptid; + u64 time; }; struct lost_event { -- cgit v1.2.3-70-g09d2 From 6161352142d5fed4cd753b32e5ccde66e705b14e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 17 Sep 2009 16:11:28 +0200 Subject: tracing, perf: Convert the power tracer into an event tracer This patch converts the existing power tracer into an event tracer, so that power events (C states and frequency changes) can be tracked via "perf". This also removes the perl script that was used to demo the tracer; its functionality is being replaced entirely with timechart. Signed-off-by: Arjan van de Ven Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: <20090912130542.6d314860@infradead.org> Signed-off-by: Ingo Molnar --- Documentation/trace/power.txt | 17 --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 7 +- arch/x86/kernel/process.c | 28 ++-- include/trace/events/power.h | 81 +++++++++++ kernel/trace/Makefile | 2 +- kernel/trace/power-traces.c | 20 +++ kernel/trace/trace.h | 3 - kernel/trace/trace_entries.h | 17 --- kernel/trace/trace_power.c | 218 ----------------------------- scripts/tracing/power.pl | 108 -------------- 10 files changed, 113 insertions(+), 388 deletions(-) delete mode 100644 Documentation/trace/power.txt create mode 100644 include/trace/events/power.h create mode 100644 kernel/trace/power-traces.c delete mode 100644 kernel/trace/trace_power.c delete mode 100644 scripts/tracing/power.pl (limited to 'kernel') diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt deleted file mode 100644 index cd805e16dc2..00000000000 --- a/Documentation/trace/power.txt +++ /dev/null @@ -1,17 +0,0 @@ -The power tracer collects detailed information about C-state and P-state -transitions, instead of just looking at the high-level "average" -information. - -There is a helper script found in scrips/tracing/power.pl in the kernel -sources which can be used to parse this information and create a -Scalable Vector Graphics (SVG) picture from the trace data. - -To use this tracer: - - echo 0 > /sys/kernel/debug/tracing/tracing_enabled - echo power > /sys/kernel/debug/tracing/current_tracer - echo 1 > /sys/kernel/debug/tracing/tracing_enabled - sleep 1 - echo 0 > /sys/kernel/debug/tracing/tracing_enabled - cat /sys/kernel/debug/tracing/trace | \ - perl scripts/tracing/power.pl > out.sv diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4109679863c..479cc8c418c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include @@ -72,8 +72,6 @@ static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); static DEFINE_PER_CPU(struct aperfmperf, old_perf); -DEFINE_TRACE(power_mark); - /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -332,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; - struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -364,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_mark(&it, POWER_PSTATE, next_perf_state); + trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 071166a4ba8..7b60e390688 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -299,9 +296,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -314,7 +309,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(&it); + trace_power_end(0); } else { local_irq_enable(); /* loop is done by the caller */ @@ -372,9 +367,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -384,15 +377,14 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) if (!need_resched()) __mwait(ax, cx); } - trace_power_end(&it); + trace_power_end(0); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - struct power_trace it; if (!need_resched()) { - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -402,7 +394,7 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(&it); + trace_power_end(0); } else local_irq_enable(); } @@ -414,13 +406,11 @@ static void mwait_idle(void) */ static void poll_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(&it); + trace_power_end(0); } /* diff --git a/include/trace/events/power.h b/include/trace/events/power.h new file mode 100644 index 00000000000..ea6d579261a --- /dev/null +++ b/include/trace/events/power.h @@ -0,0 +1,81 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM power + +#if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_POWER_H + +#include +#include + +#ifndef _TRACE_POWER_ENUM_ +#define _TRACE_POWER_ENUM_ +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; +#endif + + + +TRACE_EVENT(power_start, + + TP_PROTO(unsigned int type, unsigned int state), + + TP_ARGS(type, state), + + TP_STRUCT__entry( + __field( u64, type ) + __field( u64, state ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->state = state; + ), + + TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long)__entry->state) +); + +TRACE_EVENT(power_end, + + TP_PROTO(int dummy), + + TP_ARGS(dummy), + + TP_STRUCT__entry( + __field( u64, dummy ) + ), + + TP_fast_assign( + __entry->dummy = 0xffff; + ), + + TP_printk("dummy=%lu", (unsigned long)__entry->dummy) + +); + + +TRACE_EVENT(power_frequency, + + TP_PROTO(unsigned int type, unsigned int state), + + TP_ARGS(type, state), + + TP_STRUCT__entry( + __field( u64, type ) + __field( u64, state ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->state = state; + ), + + TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long) __entry->state) +); + +#endif /* _TRACE_POWER_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164dca90..26f03ac07c2 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o -obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o @@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 00000000000..e06c6e3d56a --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven + */ + +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); + diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86bcff94791..405cb850b75 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -37,7 +36,6 @@ enum trace_type { TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, - TRACE_POWER, TRACE_BLK, __TRACE_LAST_TYPE, @@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index a431748ddd6..ead3d724599 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry, F_printk("from: %llx to: %llx", __entry->from, __entry->to) ); -FTRACE_ENTRY(power, trace_power, - - TRACE_POWER, - - F_STRUCT( - __field_struct( struct power_trace, state_data ) - __field_desc( s64, state_data, stamp ) - __field_desc( s64, state_data, end ) - __field_desc( int, state_data, type ) - __field_desc( int, state_data, state ) - ), - - F_printk("%llx->%llx type:%u state:%u", - __entry->stamp, __entry->end, - __entry->type, __entry->state) -); - FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, TRACE_KMEM_ALLOC, diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c deleted file mode 100644 index fe1a00f1445..00000000000 --- a/kernel/trace/trace_power.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * ring buffer based C-state tracer - * - * Arjan van de Ven - * Copyright (C) 2008 Intel Corporation - * - * Much is borrowed from trace_boot.c which is - * Copyright (C) 2008 Frederic Weisbecker - * - */ - -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *power_trace; -static int __read_mostly trace_power_enabled; - -static void probe_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} - - -static void probe_power_end(struct power_trace *it) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static void probe_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static int tracing_power_register(void) -{ - int ret; - - ret = register_trace_power_start(probe_power_start); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_start\n"); - return ret; - } - ret = register_trace_power_end(probe_power_end); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_end\n"); - goto fail_start; - } - ret = register_trace_power_mark(probe_power_mark); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_mark\n"); - goto fail_end; - } - return ret; -fail_end: - unregister_trace_power_end(probe_power_end); -fail_start: - unregister_trace_power_start(probe_power_start); - return ret; -} - -static void start_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 1; -} - -static void stop_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 0; -} - -static void power_trace_reset(struct trace_array *tr) -{ - trace_power_enabled = 0; - unregister_trace_power_start(probe_power_start); - unregister_trace_power_end(probe_power_end); - unregister_trace_power_mark(probe_power_mark); -} - - -static int power_trace_init(struct trace_array *tr) -{ - power_trace = tr; - - trace_power_enabled = 1; - tracing_power_register(); - - tracing_reset_online_cpus(tr); - return 0; -} - -static enum print_line_t power_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_power *field ; - struct power_trace *it; - struct trace_seq *s = &iter->seq; - struct timespec stamp; - struct timespec duration; - - trace_assign_type(field, entry); - it = &field->state_data; - stamp = ktime_to_timespec(it->stamp); - duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); - - if (entry->type == TRACE_POWER) { - if (it->type == POWER_CSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu, - duration.tv_sec, - duration.tv_nsec); - if (it->type == POWER_PSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static void power_print_header(struct seq_file *s) -{ - seq_puts(s, "# TIMESTAMP STATE EVENT\n"); - seq_puts(s, "# | | |\n"); -} - -static struct tracer power_tracer __read_mostly = -{ - .name = "power", - .init = power_trace_init, - .start = start_power_trace, - .stop = stop_power_trace, - .reset = power_trace_reset, - .print_line = power_print_line, - .print_header = power_print_header, -}; - -static int init_power_trace(void) -{ - return register_tracer(&power_tracer); -} -device_initcall(init_power_trace); diff --git a/scripts/tracing/power.pl b/scripts/tracing/power.pl deleted file mode 100644 index 4f729b3501e..00000000000 --- a/scripts/tracing/power.pl +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/perl - -# Copyright 2008, Intel Corporation -# -# This file is part of the Linux kernel -# -# This program file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation; version 2 of the License. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -# for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program in a file named COPYING; if not, write to the -# Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, -# Boston, MA 02110-1301 USA -# -# Authors: -# Arjan van de Ven - - -# -# This script turns a cstate ftrace output into a SVG graphic that shows -# historic C-state information -# -# -# cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg -# - -my @styles; -my $base = 0; - -my @pstate_last; -my @pstate_level; - -$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; - - -print " \n"; -print "\n"; - -my $scale = 30000.0; -while (<>) { - my $line = $_; - if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) { - if ($base == 0) { - $base = $1; - } - my $time = $1 - $base; - $time = $time * $scale; - my $C = $2; - my $cpu = $3; - my $y = 400 * $cpu; - my $duration = $4 * $scale; - my $msec = int($4 * 100000)/100.0; - my $height = $C * 20; - $style = $styles[$C]; - - $y = $y + 140 - $height; - - $x2 = $time + 4; - $y2 = $y + 4; - - - print "\n"; - print "C$C $msec\n"; - } - if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) { - my $time = $1 - $base; - my $state = $2; - my $cpu = $3; - - if (defined($pstate_last[$cpu])) { - my $from = $pstate_last[$cpu]; - my $oldstate = $pstate_state[$cpu]; - my $duration = ($time-$from) * $scale; - - $from = $from * $scale; - my $to = $from + $duration; - my $height = 140 - ($oldstate * (140/8)); - - my $y = 400 * $cpu + 200 + $height; - my $y2 = $y+4; - my $style = $styles[8]; - - print "\n"; - print "P$oldstate (cpu $cpu)\n"; - }; - - $pstate_last[$cpu] = $time; - $pstate_state[$cpu] = $state; - } -} - - -print "\n"; -- cgit v1.2.3-70-g09d2 From cdf8073d6b2c6c5a3cd6ce0e6c1297157f7f99ba Mon Sep 17 00:00:00 2001 From: Ian Schram Date: Fri, 18 Sep 2009 21:26:26 +0200 Subject: perf_counter: Fix perf_copy_attr() pointer arithmetic There is still some weird code in per_copy_attr(). Which supposedly checks that all bytes trailing a struct are zero. It doesn't seem to get pointer arithmetic right. Since it increments an iterating pointer by sizeof(unsigned long) rather than 1. Signed-off-by: Ian Schram [ v2: clean up the messy PTR_ALIGN logic as well. ] Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: # for v2.6.31.x LKML-Reference: <4AB3DEE2.3030600@telenet.be> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d5899b62b27..cc768ab81ac 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4208,8 +4208,8 @@ done: static int perf_copy_attr(struct perf_counter_attr __user *uattr, struct perf_counter_attr *attr) { - int ret; u32 size; + int ret; if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) return -EFAULT; @@ -4234,19 +4234,19 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr, /* * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0. + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. */ if (size > sizeof(*attr)) { - unsigned long val; - unsigned long __user *addr; - unsigned long __user *end; + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; - addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr), - sizeof(unsigned long)); - end = PTR_ALIGN((void __user *)uattr + size, - sizeof(unsigned long)); + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; - for (; addr < end; addr += sizeof(unsigned long)) { + for (; addr < end; addr++) { ret = get_user(val, addr); if (ret) return ret; -- cgit v1.2.3-70-g09d2