aboutsummaryrefslogtreecommitdiff
path: root/kernel/perf_event.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-09-21 12:02:48 +0200
committerIngo Molnar <mingo@elte.hu>2009-09-21 14:28:04 +0200
commitcdd6c482c9ff9c55475ee7392ec8f672eddb7be6 (patch)
tree81f98a3ab46c589792057fe2392c1e10f8ad7893 /kernel/perf_event.c
parentdfc65094d0313cc48969fa60bcf33d693aeb05a7 (diff)
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\<event\>/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian <eranian@google.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Paul Mackerras <paulus@samba.org> Reviewed-by: Arjan van de Ven <arjan@linux.intel.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: <linux-arch@vger.kernel.org> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--kernel/perf_event.c5000
1 files changed, 5000 insertions, 0 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 00000000000..6e8b99a04e1
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5000 @@
+/*
+ * Performance event core code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_event.h>
+
+#include <asm/irq_regs.h>
+
+/*
+ * Each CPU has a list of per CPU events:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_events __read_mostly = 1;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+static atomic_t nr_events __read_mostly;
+static atomic_t nr_mmap_events __read_mostly;
+static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_task_events __read_mostly;
+
+/*
+ * perf event paranoia level:
+ * -1 - not paranoid at all
+ * 0 - disallow raw tracepoint access for unpriv
+ * 1 - disallow cpu events for unpriv
+ * 2 - disallow kernel profiling for unpriv
+ */
+int sysctl_perf_event_paranoid __read_mostly = 1;
+
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+ return sysctl_perf_event_paranoid > -1;
+}
+
+static inline bool perf_paranoid_cpu(void)
+{
+ return sysctl_perf_event_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+ return sysctl_perf_event_paranoid > 1;
+}
+
+int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+
+/*
+ * max perf event sample rate
+ */
+int sysctl_perf_event_sample_rate __read_mostly = 100000;
+
+static atomic64_t perf_event_id;
+
+/*
+ * Lock for (sysadmin-configurable) event reservations:
+ */
+static DEFINE_SPINLOCK(perf_resource_lock);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+ return NULL;
+}
+
+void __weak hw_perf_disable(void) { barrier(); }
+void __weak hw_perf_enable(void) { barrier(); }
+
+void __weak hw_perf_event_setup(int cpu) { barrier(); }
+void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
+
+int __weak
+hw_perf_group_sched_in(struct perf_event *group_leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, int cpu)
+{
+ return 0;
+}
+
+void __weak perf_event_print_debug(void) { }
+
+static DEFINE_PER_CPU(int, perf_disable_count);
+
+void __perf_disable(void)
+{
+ __get_cpu_var(perf_disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+ return !--__get_cpu_var(perf_disable_count);
+}
+
+void perf_disable(void)
+{
+ __perf_disable();
+ hw_perf_disable();
+}
+
+void perf_enable(void)
+{
+ if (__perf_enable())
+ hw_perf_enable();
+}
+
+static void get_ctx(struct perf_event_context *ctx)
+{
+ WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_event_context *ctx;
+
+ ctx = container_of(head, struct perf_event_context, rcu_head);
+ kfree(ctx);
+}
+
+static void put_ctx(struct perf_event_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ if (ctx->parent_ctx)
+ put_ctx(ctx->parent_ctx);
+ if (ctx->task)
+ put_task_struct(ctx->task);
+ call_rcu(&ctx->rcu_head, free_ctx);
+ }
+}
+
+static void unclone_ctx(struct perf_event_context *ctx)
+{
+ if (ctx->parent_ctx) {
+ put_ctx(ctx->parent_ctx);
+ ctx->parent_ctx = NULL;
+ }
+}
+
+/*
+ * If we inherit events we want to return the parent event id
+ * to userspace.
+ */
+static u64 primary_event_id(struct perf_event *event)
+{
+ u64 id = event->id;
+
+ if (event->parent)
+ id = event->parent->id;
+
+ return id;
+}
+
+/*
+ * Get the perf_event_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_event_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+{
+ struct perf_event_context *ctx;
+
+ rcu_read_lock();
+ retry:
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_event_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more.
+ */
+ spin_lock_irqsave(&ctx->lock, *flags);
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ goto retry;
+ }
+
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ ctx = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task. This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+ unsigned long flags;
+
+ ctx = perf_lock_task_context(task, &flags);
+ if (ctx) {
+ ++ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+ return ctx;
+}
+
+static void perf_unpin_context(struct perf_event_context *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ --ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ put_ctx(ctx);
+}
+
+/*
+ * Add a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event *group_leader = event->group_leader;
+
+ /*
+ * Depending on whether it is a standalone or sibling event,
+ * add it straight to the context's event list, or to the group
+ * leader's sibling list:
+ */
+ if (group_leader == event)
+ list_add_tail(&event->group_entry, &ctx->group_list);
+ else {
+ list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+ }
+
+ list_add_rcu(&event->event_entry, &ctx->event_list);
+ ctx->nr_events++;
+ if (event->attr.inherit_stat)
+ ctx->nr_stat++;
+}
+
+/*
+ * Remove a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event *sibling, *tmp;
+
+ if (list_empty(&event->group_entry))
+ return;
+ ctx->nr_events--;
+ if (event->attr.inherit_stat)
+ ctx->nr_stat--;
+
+ list_del_init(&event->group_entry);
+ list_del_rcu(&event->event_entry);
+
+ if (event->group_leader != event)
+ event->group_leader->nr_siblings--;
+
+ /*
+ * If this was a group event with sibling events then
+ * upgrade the siblings to singleton events by adding them
+ * to the context list directly:
+ */
+ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+
+ list_move_tail(&sibling->group_entry, &ctx->group_list);
+ sibling->group_leader = sibling;
+ }
+}
+
+static void
+event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+ event->tstamp_stopped = ctx->time;
+ event->pmu->disable(event);
+ event->oncpu = -1;
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu--;
+ ctx->nr_active--;
+ if (event->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *event;
+
+ if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ event_sched_out(group_event, cpuctx, ctx);
+
+ /*
+ * Schedule out siblings (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry)
+ event_sched_out(event, cpuctx, ctx);
+
+ if (group_event->attr.exclusive)
+ cpuctx->exclusive = 0;
+}
+
+/*
+ * Cross CPU call to remove a performance event
+ *
+ * We disable the event on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_event_remove_from_context(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ spin_lock(&ctx->lock);
+ /*
+ * Protect the list operation against NMI by disabling the
+ * events on a global level.
+ */
+ perf_disable();
+
+ event_sched_out(event, cpuctx, ctx);
+
+ list_del_event(event, ctx);
+
+ if (!ctx->task) {
+ /*
+ * Allow more per task events with respect to the
+ * reservation:
+ */
+ cpuctx->max_pertask =
+ min(perf_max_events - ctx->nr_events,
+ perf_max_events - perf_reserved_percpu);
+ }
+
+ perf_enable();
+ spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the event from a task's (or a CPU's) list of events.
+ *
+ * Must be called with ctx->mutex held.
+ *
+ * CPU events are removed with a smp call. For task events we only
+ * call when the task is on a CPU.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_event_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_event_remove_from_context(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Per cpu events are removed via an smp call and
+ * the removal is always sucessful.
+ */
+ smp_call_function_single(event->cpu,
+ __perf_event_remove_from_context,
+ event, 1);
+ return;
+ }
+
+retry:
+ task_oncpu_function_call(task, __perf_event_remove_from_context,
+ event);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the context is active we need to retry the smp call.
+ */
+ if (ctx->nr_active && !list_empty(&event->group_entry)) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * The lock prevents that this context is scheduled in so we
+ * can remove the event safely, if the call above did not
+ * succeed.
+ */
+ if (!list_empty(&event->group_entry)) {
+ list_del_event(event, ctx);
+ }
+ spin_unlock_irq(&ctx->lock);
+}
+
+static inline u64 perf_clock(void)
+{
+ return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+ u64 now = perf_clock();
+
+ ctx->time += now - ctx->timestamp;
+ ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a event.
+ */
+static void update_event_times(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ u64 run_end;
+
+ if (event->state < PERF_EVENT_STATE_INACTIVE ||
+ event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+ return;
+
+ event->total_time_enabled = ctx->time - event->tstamp_enabled;
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ run_end = event->tstamp_stopped;
+ else
+ run_end = ctx->time;
+
+ event->total_time_running = run_end - event->tstamp_running;
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+ struct perf_event *event;
+
+ update_event_times(leader);
+ list_for_each_entry(event, &leader->sibling_list, group_entry)
+ update_event_times(event);
+}
+
+/*
+ * Cross CPU call to disable a performance event
+ */
+static void __perf_event_disable(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event_context *ctx = event->ctx;
+
+ /*
+ * If this is a per-task event, need to check whether this
+ * event's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ spin_lock(&ctx->lock);
+
+ /*
+ * If the event is on, turn it off.
+ * If it is in error state, leave it in error state.
+ */
+ if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ update_context_time(ctx);
+ update_group_times(event);
+ if (event == event->group_leader)
+ group_sched_out(event, cpuctx, ctx);
+ else
+ event_sched_out(event, cpuctx, ctx);
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Disable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This condition is satisifed when called through
+ * perf_event_for_each_child or perf_event_for_each because they
+ * hold the top-level event's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_event.
+ * When called from perf_pending_event it's OK because event->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_event_task_sched_out for this context.
+ */
+static void perf_event_disable(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Disable the event on the cpu that it's on
+ */
+ smp_call_function_single(event->cpu, __perf_event_disable,
+ event, 1);
+ return;
+ }
+
+ retry:
+ task_oncpu_function_call(task, __perf_event_disable, event);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the event is still active, we need to retry the cross-call.
+ */
+ if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+
+ spin_unlock_irq(&ctx->lock);
+}
+
+static int
+event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ int cpu)
+{
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ event->state = PERF_EVENT_STATE_ACTIVE;
+ event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
+ /*
+ * The new state must be visible before we turn it on in the hardware:
+ */
+ smp_wmb();
+
+ if (event->pmu->enable(event)) {
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->oncpu = -1;
+ return -EAGAIN;
+ }
+
+ event->tstamp_running += ctx->time - event->tstamp_stopped;
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu++;
+ ctx->nr_active++;
+
+ if (event->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ return 0;
+}
+
+static int
+group_sched_in(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ int cpu)
+{
+ struct perf_event *event, *partial_group;
+ int ret;
+
+ if (group_event->state == PERF_EVENT_STATE_OFF)
+ return 0;
+
+ ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ if (event_sched_in(group_event, cpuctx, ctx, cpu))
+ return -EAGAIN;
+
+ /*
+ * Schedule in siblings as one group (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ if (event_sched_in(event, cpuctx, ctx, cpu)) {
+ partial_group = event;
+ goto group_error;
+ }
+ }
+
+ return 0;
+
+group_error:
+ /*
+ * Groups can be scheduled in as one unit only, so undo any
+ * partial group before returning:
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ if (event == partial_group)
+ break;
+ event_sched_out(event, cpuctx, ctx);
+ }
+ event_sched_out(group_event, cpuctx, ctx);
+
+ return -EAGAIN;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software events,
+ * 0 if the group contains any hardware events.
+ */
+static int is_software_only_group(struct perf_event *leader)
+{
+ struct perf_event *event;
+
+ if (!is_software_event(leader))
+ return 0;
+
+ list_for_each_entry(event, &leader->sibling_list, group_entry)
+ if (!is_software_event(event))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Work out whether we can put this event group on the CPU now.
+ */
+static int group_can_go_on(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ int can_add_hw)
+{
+ /*
+ * Groups consisting entirely of software events can always go on.
+ */
+ if (is_software_only_group(event))
+ return 1;
+ /*
+ * If an exclusive group is already on, no other hardware
+ * events can go on.
+ */
+ if (cpuctx->exclusive)
+ return 0;
+ /*
+ * If this group is exclusive and there are already
+ * events on the CPU, it can't go on.
+ */
+ if (event->attr.exclusive && cpuctx->active_oncpu)
+ return 0;
+ /*
+ * Otherwise, try to add it if all previous groups were able
+ * to go on.
+ */
+ return can_add_hw;
+}
+
+static void add_event_to_ctx(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ list_add_event(event, ctx);
+ event->tstamp_enabled = ctx->time;
+ event->tstamp_running = ctx->time;
+ event->tstamp_stopped = ctx->time;
+}
+
+/*
+ * Cross CPU call to install and enable a performance event
+ *
+ * Must be called with ctx->mutex held
+ */
+static void __perf_install_in_context(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *leader = event->group_leader;
+ int cpu = smp_processor_id();
+ int err;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived.
+ * Or possibly this is the right context but it isn't
+ * on this cpu because it had no events.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx) {
+ if (cpuctx->task_ctx || ctx->task != current)
+ return;
+ cpuctx->task_ctx = ctx;
+ }
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ update_context_time(ctx);
+
+ /*
+ * Protect the list operation against NMI by disabling the
+ * events on a global level. NOP for non NMI based events.
+ */
+ perf_disable();
+
+ add_event_to_ctx(event, ctx);
+
+ /*
+ * Don't put the event on if it is disabled or if
+ * it is in a group and the group isn't on.
+ */
+ if (event->state != PERF_EVENT_STATE_INACTIVE ||
+ (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
+ goto unlock;
+
+ /*
+ * An exclusive event can't go on if there are already active
+ * hardware events, and no hardware event can go on if there
+ * is already an exclusive event on.
+ */
+ if (!group_can_go_on(event, cpuctx, 1))
+ err = -EEXIST;
+ else
+ err = event_sched_in(event, cpuctx, ctx, cpu);
+
+ if (err) {
+ /*
+ * This event couldn't go on. If it is in a group
+ * then we have to pull the whole group off.
+ * If the event group is pinned then put it in error state.
+ */
+ if (leader != event)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_EVENT_STATE_ERROR;
+ }
+ }
+
+ if (!err && !ctx->task && cpuctx->max_pertask)
+ cpuctx->max_pertask--;
+
+ unlock:
+ perf_enable();
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance event to a context
+ *
+ * First we add the event to the list with the hardware enable bit
+ * in event->hw_config cleared.
+ *
+ * If the event is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
+ */
+static void
+perf_install_in_context(struct perf_event_context *ctx,
+ struct perf_event *event,
+ int cpu)
+{
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Per cpu events are installed via an smp call and
+ * the install is always sucessful.
+ */
+ smp_call_function_single(cpu, __perf_install_in_context,
+ event, 1);
+ return;
+ }
+
+retry:
+ task_oncpu_function_call(task, __perf_install_in_context,
+ event);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * we need to retry the smp call.
+ */
+ if (ctx->is_active && list_empty(&event->group_entry)) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * The lock prevents that this context is scheduled in so we
+ * can add the event safely, if it the call above did not
+ * succeed.
+ */
+ if (list_empty(&event->group_entry))
+ add_event_to_ctx(event, ctx);
+ spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Put a event into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_event_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *sub;
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->tstamp_enabled = ctx->time - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry)
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ sub->tstamp_enabled =
+ ctx->time - sub->total_time_enabled;
+}
+
+/*
+ * Cross CPU call to enable a performance event
+ */
+static void __perf_event_enable(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *leader = event->group_leader;
+ int err;
+
+ /*
+ * If this is a per-task event, need to check whether this
+ * event's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx) {
+ if (cpuctx->task_ctx || ctx->task != current)
+ return;
+ cpuctx->task_ctx = ctx;
+ }
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ update_context_time(ctx);
+
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ goto unlock;
+ __perf_event_mark_enabled(event, ctx);
+
+ /*
+ * If the event is in a group and isn't the group leader,
+ * then don't put it on unless the group is on.
+ */
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+ goto unlock;
+
+ if (!group_can_go_on(event, cpuctx, 1)) {
+ err = -EEXIST;
+ } else {
+ perf_disable();
+ if (event == leader)
+ err = group_sched_in(event, cpuctx, ctx,
+ smp_processor_id());
+ else
+ err = event_sched_in(event, cpuctx, ctx,
+ smp_processor_id());
+ perf_enable();
+ }
+
+ if (err) {
+ /*
+ * If this event can't go on and it's part of a
+ * group, then the whole group has to come off.
+ */
+ if (leader != event)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_EVENT_STATE_ERROR;
+ }
+ }
+
+ unlock:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Enable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_event_for_each_child or perf_event_for_each as described
+ * for perf_event_disable.
+ */
+static void perf_event_enable(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Enable the event on the cpu that it's on
+ */
+ smp_call_function_single(event->cpu, __perf_event_enable,
+ event, 1);
+ return;
+ }
+
+ spin_lock_irq(&ctx->lock);
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ goto out;
+
+ /*
+ * If the event is in error state, clear that first.
+ * That way, if we see the event in error state below, we
+ * know that it has gone back into error state, as distinct
+ * from the task having been scheduled away before the
+ * cross-call arrived.
+ */
+ if (event->state == PERF_EVENT_STATE_ERROR)
+ event->state = PERF_EVENT_STATE_OFF;
+
+ retry:
+ spin_unlock_irq(&ctx->lock);
+ task_oncpu_function_call(task, __perf_event_enable, event);
+
+ spin_lock_irq(&ctx->lock);
+
+ /*
+ * If the context is active and the event is still off,
+ * we need to retry the cross-call.
+ */
+ if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+ goto retry;
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (event->state == PERF_EVENT_STATE_OFF)
+ __perf_event_mark_enabled(event, ctx);
+
+ out:
+ spin_unlock_irq(&ctx->lock);
+}
+
+static int perf_event_refresh(struct perf_event *event, int refresh)
+{
+ /*
+ * not supported on inherited events
+ */
+ if (event->attr.inherit)
+ return -EINVAL;
+
+ atomic_add(refresh, &event->event_limit);
+ perf_event_enable(event);
+
+ return 0;
+}
+
+void __perf_event_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_event *event;
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 0;
+ if (likely(!ctx->nr_events))
+ goto out;
+ update_context_time(ctx);
+
+ perf_disable();
+ if (ctx->nr_active) {
+ list_for_each_entry(event, &ctx->group_list, group_entry) {
+ if (event != event->group_leader)
+ event_sched_out(event, cpuctx, ctx);
+ else
+ group_sched_out(event, cpuctx, ctx);
+ }
+ }
+ perf_enable();
+ out:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled events.
+ * If the number of enabled events is the same, then the set
+ * of enabled events should be the same, because these are both
+ * inherited contexts, therefore we can't access individual events
+ * in them directly with an fd; we can only enable/disable all
+ * events via prctl, or enable/disable all events in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_event_context *ctx1,
+ struct perf_event_context *ctx2)
+{
+ return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+ && ctx1->parent_gen == ctx2->parent_gen
+ && !ctx1->pin_count && !ctx2->pin_count;
+}
+
+static void __perf_event_read(void *event);
+
+static void __perf_event_sync_stat(struct perf_event *event,
+ struct perf_event *next_event)
+{
+ u64 value;
+
+ if (!event->attr.inherit_stat)
+ return;
+
+ /*
+ * Update the event value, we cannot use perf_event_read()
+ * because we're in the middle of a context switch and have IRQs
+ * disabled, which upsets smp_call_function_single(), however
+ * we know the event must be on the current CPU, therefore we
+ * don't need to use it.
+ */
+ switch (event->state) {
+ case PERF_EVENT_STATE_ACTIVE:
+ __perf_event_read(event);
+ break;
+
+ case PERF_EVENT_STATE_INACTIVE:
+ update_event_times(event);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * In order to keep per-task stats reliable we need to flip the event
+ * values when we flip the contexts.
+ */
+ value = atomic64_read(&next_event->count);
+ value = atomic64_xchg(&event->count, value);
+ atomic64_set(&next_event->count, value);
+
+ swap(event->total_time_enabled, next_event->total_time_enabled);
+ swap(event->total_time_running, next_event->total_time_running);
+
+ /*
+ * Since we swizzled the values, update the user visible data too.
+ */
+ perf_event_update_userpage(event);
+ perf_event_update_userpage(next_event);
+}
+
+#define list_next_entry(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
+static void perf_event_sync_stat(struct perf_event_context *ctx,
+ struct perf_event_context *next_ctx)
+{
+ struct perf_event *event, *next_event;
+
+ if (!ctx->nr_stat)
+ return;
+
+ event = list_first_entry(&ctx->event_list,
+ struct perf_event, event_entry);
+
+ next_event = list_first_entry(&next_ctx->event_list,
+ struct perf_event,