aboutsummaryrefslogtreecommitdiff
path: root/kernel/perf_event.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2009-09-24 15:13:11 -0700
committerDavid S. Miller <davem@davemloft.net>2009-09-24 15:13:11 -0700
commit8b3f6af86378d0a10ca2f1ded1da124aef13b62c (patch)
treede6ca90295730343c495be8d98be8efa322140ef /kernel/perf_event.c
parent139d6065c83071d5f66cd013a274a43699f8e2c1 (diff)
parent94e0fb086fc5663c38bbc0fe86d698be8314f82f (diff)
Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
Conflicts: drivers/staging/Kconfig drivers/staging/Makefile drivers/staging/cpc-usb/TODO drivers/staging/cpc-usb/cpc-usb_drv.c drivers/staging/cpc-usb/cpc.h drivers/staging/cpc-usb/cpc_int.h drivers/staging/cpc-usb/cpcusb.h
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--kernel/perf_event.c5000
1 files changed, 5000 insertions, 0 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 00000000000..76ac4db405e
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5000 @@
+/*
+ * Performance events core code:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_event.h>
+
+#include <asm/irq_regs.h>
+
+/*
+ * Each CPU has a list of per CPU events:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_events __read_mostly = 1;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+static atomic_t nr_events __read_mostly;
+static atomic_t nr_mmap_events __read_mostly;
+static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_task_events __read_mostly;
+
+/*
+ * perf event paranoia level:
+ * -1 - not paranoid at all
+ * 0 - disallow raw tracepoint access for unpriv
+ * 1 - disallow cpu events for unpriv
+ * 2 - disallow kernel profiling for unpriv
+ */
+int sysctl_perf_event_paranoid __read_mostly = 1;
+
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+ return sysctl_perf_event_paranoid > -1;
+}
+
+static inline bool perf_paranoid_cpu(void)
+{
+ return sysctl_perf_event_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+ return sysctl_perf_event_paranoid > 1;
+}
+
+int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+
+/*
+ * max perf event sample rate
+ */
+int sysctl_perf_event_sample_rate __read_mostly = 100000;
+
+static atomic64_t perf_event_id;
+
+/*
+ * Lock for (sysadmin-configurable) event reservations:
+ */
+static DEFINE_SPINLOCK(perf_resource_lock);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+{
+ return NULL;
+}
+
+void __weak hw_perf_disable(void) { barrier(); }
+void __weak hw_perf_enable(void) { barrier(); }
+
+void __weak hw_perf_event_setup(int cpu) { barrier(); }
+void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
+
+int __weak
+hw_perf_group_sched_in(struct perf_event *group_leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, int cpu)
+{
+ return 0;
+}
+
+void __weak perf_event_print_debug(void) { }
+
+static DEFINE_PER_CPU(int, perf_disable_count);
+
+void __perf_disable(void)
+{
+ __get_cpu_var(perf_disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+ return !--__get_cpu_var(perf_disable_count);
+}
+
+void perf_disable(void)
+{
+ __perf_disable();
+ hw_perf_disable();
+}
+
+void perf_enable(void)
+{
+ if (__perf_enable())
+ hw_perf_enable();
+}
+
+static void get_ctx(struct perf_event_context *ctx)
+{
+ WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_event_context *ctx;
+
+ ctx = container_of(head, struct perf_event_context, rcu_head);
+ kfree(ctx);
+}
+
+static void put_ctx(struct perf_event_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ if (ctx->parent_ctx)
+ put_ctx(ctx->parent_ctx);
+ if (ctx->task)
+ put_task_struct(ctx->task);
+ call_rcu(&ctx->rcu_head, free_ctx);
+ }
+}
+
+static void unclone_ctx(struct perf_event_context *ctx)
+{
+ if (ctx->parent_ctx) {
+ put_ctx(ctx->parent_ctx);
+ ctx->parent_ctx = NULL;
+ }
+}
+
+/*
+ * If we inherit events we want to return the parent event id
+ * to userspace.
+ */
+static u64 primary_event_id(struct perf_event *event)
+{
+ u64 id = event->id;
+
+ if (event->parent)
+ id = event->parent->id;
+
+ return id;
+}
+
+/*
+ * Get the perf_event_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_event_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+{
+ struct perf_event_context *ctx;
+
+ rcu_read_lock();
+ retry:
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_event_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more.
+ */
+ spin_lock_irqsave(&ctx->lock, *flags);
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ goto retry;
+ }
+
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ ctx = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task. This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+ unsigned long flags;
+
+ ctx = perf_lock_task_context(task, &flags);
+ if (ctx) {
+ ++ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+ return ctx;
+}
+
+static void perf_unpin_context(struct perf_event_context *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ --ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ put_ctx(ctx);
+}
+
+/*
+ * Add a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event *group_leader = event->group_leader;
+
+ /*
+ * Depending on whether it is a standalone or sibling event,
+ * add it straight to the context's event list, or to the group
+ * leader's sibling list:
+ */
+ if (group_leader == event)
+ list_add_tail(&event->group_entry, &ctx->group_list);
+ else {
+ list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+ }
+
+ list_add_rcu(&event->event_entry, &ctx->event_list);
+ ctx->nr_events++;
+ if (event->attr.inherit_stat)
+ ctx->nr_stat++;
+}
+
+/*
+ * Remove a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event *sibling, *tmp;
+
+ if (list_empty(&event->group_entry))
+ return;
+ ctx->nr_events--;
+ if (event->attr.inherit_stat)
+ ctx->nr_stat--;
+
+ list_del_init(&event->group_entry);
+ list_del_rcu(&event->event_entry);
+
+ if (event->group_leader != event)
+ event->group_leader->nr_siblings--;
+
+ /*
+ * If this was a group event with sibling events then
+ * upgrade the siblings to singleton events by adding them
+ * to the context list directly:
+ */
+ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+
+ list_move_tail(&sibling->group_entry, &ctx->group_list);
+ sibling->group_leader = sibling;
+ }
+}
+
+static void
+event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+ event->tstamp_stopped = ctx->time;
+ event->pmu->disable(event);
+ event->oncpu = -1;
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu--;
+ ctx->nr_active--;
+ if (event->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *event;
+
+ if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ event_sched_out(group_event, cpuctx, ctx);
+
+ /*
+ * Schedule out siblings (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry)
+ event_sched_out(event, cpuctx, ctx);
+
+ if (group_event->attr.exclusive)
+ cpuctx->exclusive = 0;
+}
+
+/*
+ * Cross CPU call to remove a performance event
+ *
+ * We disable the event on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_event_remove_from_context(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ spin_lock(&ctx->lock);
+ /*
+ * Protect the list operation against NMI by disabling the
+ * events on a global level.
+ */
+ perf_disable();
+
+ event_sched_out(event, cpuctx, ctx);
+
+ list_del_event(event, ctx);
+
+ if (!ctx->task) {
+ /*
+ * Allow more per task events with respect to the
+ * reservation:
+ */
+ cpuctx->max_pertask =
+ min(perf_max_events - ctx->nr_events,
+ perf_max_events - perf_reserved_percpu);
+ }
+
+ perf_enable();
+ spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the event from a task's (or a CPU's) list of events.
+ *
+ * Must be called with ctx->mutex held.
+ *
+ * CPU events are removed with a smp call. For task events we only
+ * call when the task is on a CPU.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_event_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_event_remove_from_context(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Per cpu events are removed via an smp call and
+ * the removal is always sucessful.
+ */
+ smp_call_function_single(event->cpu,
+ __perf_event_remove_from_context,
+ event, 1);
+ return;
+ }
+
+retry:
+ task_oncpu_function_call(task, __perf_event_remove_from_context,
+ event);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the context is active we need to retry the smp call.
+ */
+ if (ctx->nr_active && !list_empty(&event->group_entry)) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * The lock prevents that this context is scheduled in so we
+ * can remove the event safely, if the call above did not
+ * succeed.
+ */
+ if (!list_empty(&event->group_entry)) {
+ list_del_event(event, ctx);
+ }
+ spin_unlock_irq(&ctx->lock);
+}
+
+static inline u64 perf_clock(void)
+{
+ return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+ u64 now = perf_clock();
+
+ ctx->time += now - ctx->timestamp;
+ ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a event.
+ */
+static void update_event_times(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ u64 run_end;
+
+ if (event->state < PERF_EVENT_STATE_INACTIVE ||
+ event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+ return;
+
+ event->total_time_enabled = ctx->time - event->tstamp_enabled;
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ run_end = event->tstamp_stopped;
+ else
+ run_end = ctx->time;
+
+ event->total_time_running = run_end - event->tstamp_running;
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+ struct perf_event *event;
+
+ update_event_times(leader);
+ list_for_each_entry(event, &leader->sibling_list, group_entry)
+ update_event_times(event);
+}
+
+/*
+ * Cross CPU call to disable a performance event
+ */
+static void __perf_event_disable(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event_context *ctx = event->ctx;
+
+ /*
+ * If this is a per-task event, need to check whether this
+ * event's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ spin_lock(&ctx->lock);
+
+ /*
+ * If the event is on, turn it off.
+ * If it is in error state, leave it in error state.
+ */
+ if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ update_context_time(ctx);
+ update_group_times(event);
+ if (event == event->group_leader)
+ group_sched_out(event, cpuctx, ctx);
+ else
+ event_sched_out(event, cpuctx, ctx);
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Disable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This condition is satisifed when called through
+ * perf_event_for_each_child or perf_event_for_each because they
+ * hold the top-level event's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_event.
+ * When called from perf_pending_event it's OK because event->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_event_task_sched_out for this context.
+ */
+static void perf_event_disable(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Disable the event on the cpu that it's on
+ */
+ smp_call_function_single(event->cpu, __perf_event_disable,
+ event, 1);
+ return;
+ }
+
+ retry:
+ task_oncpu_function_call(task, __perf_event_disable, event);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the event is still active, we need to retry the cross-call.
+ */
+ if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+
+ spin_unlock_irq(&ctx->lock);
+}
+
+static int
+event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ int cpu)
+{
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ event->state = PERF_EVENT_STATE_ACTIVE;
+ event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
+ /*
+ * The new state must be visible before we turn it on in the hardware:
+ */
+ smp_wmb();
+
+ if (event->pmu->enable(event)) {
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->oncpu = -1;
+ return -EAGAIN;
+ }
+
+ event->tstamp_running += ctx->time - event->tstamp_stopped;
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu++;
+ ctx->nr_active++;
+
+ if (event->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ return 0;
+}
+
+static int
+group_sched_in(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ int cpu)
+{
+ struct perf_event *event, *partial_group;
+ int ret;
+
+ if (group_event->state == PERF_EVENT_STATE_OFF)
+ return 0;
+
+ ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ if (event_sched_in(group_event, cpuctx, ctx, cpu))
+ return -EAGAIN;
+
+ /*
+ * Schedule in siblings as one group (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ if (event_sched_in(event, cpuctx, ctx, cpu)) {
+ partial_group = event;
+ goto group_error;
+ }
+ }
+
+ return 0;
+
+group_error:
+ /*
+ * Groups can be scheduled in as one unit only, so undo any
+ * partial group before returning:
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ if (event == partial_group)
+ break;
+ event_sched_out(event, cpuctx, ctx);
+ }
+ event_sched_out(group_event, cpuctx, ctx);
+
+ return -EAGAIN;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software events,
+ * 0 if the group contains any hardware events.
+ */
+static int is_software_only_group(struct perf_event *leader)
+{
+ struct perf_event *event;
+
+ if (!is_software_event(leader))
+ return 0;
+
+ list_for_each_entry(event, &leader->sibling_list, group_entry)
+ if (!is_software_event(event))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Work out whether we can put this event group on the CPU now.
+ */
+static int group_can_go_on(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ int can_add_hw)
+{
+ /*
+ * Groups consisting entirely of software events can always go on.
+ */
+ if (is_software_only_group(event))
+ return 1;
+ /*
+ * If an exclusive group is already on, no other hardware
+ * events can go on.
+ */
+ if (cpuctx->exclusive)
+ return 0;
+ /*
+ * If this group is exclusive and there are already
+ * events on the CPU, it can't go on.
+ */
+ if (event->attr.exclusive && cpuctx->active_oncpu)
+ return 0;
+ /*
+ * Otherwise, try to add it if all previous groups were able
+ * to go on.
+ */
+ return can_add_hw;
+}
+
+static void add_event_to_ctx(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ list_add_event(event, ctx);
+ event->tstamp_enabled = ctx->time;
+ event->tstamp_running = ctx->time;
+ event->tstamp_stopped = ctx->time;
+}
+
+/*
+ * Cross CPU call to install and enable a performance event
+ *
+ * Must be called with ctx->mutex held
+ */
+static void __perf_install_in_context(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *leader = event->group_leader;
+ int cpu = smp_processor_id();
+ int err;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived.
+ * Or possibly this is the right context but it isn't
+ * on this cpu because it had no events.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx) {
+ if (cpuctx->task_ctx || ctx->task != current)
+ return;
+ cpuctx->task_ctx = ctx;
+ }
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ update_context_time(ctx);
+
+ /*
+ * Protect the list operation against NMI by disabling the
+ * events on a global level. NOP for non NMI based events.
+ */
+ perf_disable();
+
+ add_event_to_ctx(event, ctx);
+
+ /*
+ * Don't put the event on if it is disabled or if
+ * it is in a group and the group isn't on.
+ */
+ if (event->state != PERF_EVENT_STATE_INACTIVE ||
+ (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
+ goto unlock;
+
+ /*
+ * An exclusive event can't go on if there are already active
+ * hardware events, and no hardware event can go on if there
+ * is already an exclusive event on.
+ */
+ if (!group_can_go_on(event, cpuctx, 1))
+ err = -EEXIST;
+ else
+ err = event_sched_in(event, cpuctx, ctx, cpu);
+
+ if (err) {
+ /*
+ * This event couldn't go on. If it is in a group
+ * then we have to pull the whole group off.
+ * If the event group is pinned then put it in error state.
+ */
+ if (leader != event)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_EVENT_STATE_ERROR;
+ }
+ }
+
+ if (!err && !ctx->task && cpuctx->max_pertask)
+ cpuctx->max_pertask--;
+
+ unlock:
+ perf_enable();
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance event to a context
+ *
+ * First we add the event to the list with the hardware enable bit
+ * in event->hw_config cleared.
+ *
+ * If the event is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
+ */
+static void
+perf_install_in_context(struct perf_event_context *ctx,
+ struct perf_event *event,
+ int cpu)
+{
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Per cpu events are installed via an smp call and
+ * the install is always sucessful.
+ */
+ smp_call_function_single(cpu, __perf_install_in_context,
+ event, 1);
+ return;
+ }
+
+retry:
+ task_oncpu_function_call(task, __perf_install_in_context,
+ event);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * we need to retry the smp call.
+ */
+ if (ctx->is_active && list_empty(&event->group_entry)) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * The lock prevents that this context is scheduled in so we
+ * can add the event safely, if it the call above did not
+ * succeed.
+ */
+ if (list_empty(&event->group_entry))
+ add_event_to_ctx(event, ctx);
+ spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Put a event into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_event_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *sub;
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->tstamp_enabled = ctx->time - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry)
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ sub->tstamp_enabled =
+ ctx->time - sub->total_time_enabled;
+}
+
+/*
+ * Cross CPU call to enable a performance event
+ */
+static void __perf_event_enable(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *leader = event->group_leader;
+ int err;
+
+ /*
+ * If this is a per-task event, need to check whether this
+ * event's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx) {
+ if (cpuctx->task_ctx || ctx->task != current)
+ return;
+ cpuctx->task_ctx = ctx;
+ }
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ update_context_time(ctx);
+
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ goto unlock;
+ __perf_event_mark_enabled(event, ctx);
+
+ /*
+ * If the event is in a group and isn't the group leader,
+ * then don't put it on unless the group is on.
+ */
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+ goto unlock;
+
+ if (!group_can_go_on(event, cpuctx, 1)) {
+ err = -EEXIST;
+ } else {
+ perf_disable();
+ if (event == leader)
+ err = group_sched_in(event, cpuctx, ctx,
+ smp_processor_id());
+ else
+ err = event_sched_in(event, cpuctx, ctx,
+ smp_processor_id());
+ perf_enable();
+ }
+
+ if (err) {
+ /*
+ * If this event can't go on and it's part of a
+ * group, then the whole group has to come off.
+ */
+ if (leader != event)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_EVENT_STATE_ERROR;
+ }
+ }
+
+ unlock:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Enable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_event_for_each_child or perf_event_for_each as described
+ * for perf_event_disable.
+ */
+static void perf_event_enable(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Enable the event on the cpu that it's on
+ */
+ smp_call_function_single(event->cpu, __perf_event_enable,
+ event, 1);
+ return;
+ }
+
+ spin_lock_irq(&ctx->lock);
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ goto out;
+
+ /*
+ * If the event is in error state, clear that first.
+ * That way, if we see the event in error state below, we
+ * know that it has gone back into error state, as distinct
+ * from the task having been scheduled away before the
+ * cross-call arrived.
+ */
+ if (event->state == PERF_EVENT_STATE_ERROR)
+ event->state = PERF_EVENT_STATE_OFF;
+
+ retry:
+ spin_unlock_irq(&ctx->lock);
+ task_oncpu_function_call(task, __perf_event_enable, event);
+
+ spin_lock_irq(&ctx->lock);
+
+ /*
+ * If the context is active and the event is still off,
+ * we need to retry the cross-call.
+ */
+ if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+ goto retry;
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (event->state == PERF_EVENT_STATE_OFF)
+ __perf_event_mark_enabled(event, ctx);
+
+ out:
+ spin_unlock_irq(&ctx->lock);
+}
+
+static int perf_event_refresh(struct perf_event *event, int refresh)
+{
+ /*
+ * not supported on inherited events
+ */
+ if (event->attr.inherit)
+ return -EINVAL;
+
+ atomic_add(refresh, &event->event_limit);
+ perf_event_enable(event);
+
+ return 0;
+}
+
+void __perf_event_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_event *event;
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 0;
+ if (likely(!ctx->nr_events))
+ goto out;
+ update_context_time(ctx);
+
+ perf_disable();
+ if (ctx->nr_active) {
+ list_for_each_entry(event, &ctx->group_list, group_entry) {
+ if (event != event->group_leader)
+ event_sched_out(event, cpuctx, ctx);
+ else
+ group_sched_out(event, cpuctx, ctx);
+ }
+ }
+ perf_enable();
+ out:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled events.
+ * If the number of enabled events is the same, then the set
+ * of enabled events should be the same, because these are both
+ * inherited contexts, therefore we can't access individual events
+ * in them directly with an fd; we can only enable/disable all
+ * events via prctl, or enable/disable all events in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_event_context *ctx1,
+ struct perf_event_context *ctx2)
+{
+ return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+ && ctx1->parent_gen == ctx2->parent_gen
+ && !ctx1->pin_count && !ctx2->pin_count;
+}
+
+static void __perf_event_read(void *event);
+
+static void __perf_event_sync_stat(struct perf_event *event,
+ struct perf_event *next_event)
+{
+ u64 value;
+
+ if (!event->attr.inherit_stat)
+ return;
+
+ /*
+ * Update the event value, we cannot use perf_event_read()
+ * because we're in the middle of a context switch and have IRQs
+ * disabled, which upsets smp_call_function_single(), however
+ * we know the event must be on the current CPU, therefore we
+ * don't need to use it.
+ */
+ switch (event->state) {
+ case PERF_EVENT_STATE_ACTIVE:
+ __perf_event_read(event);
+ break;
+
+ case PERF_EVENT_STATE_INACTIVE:
+ update_event_times(event);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * In order to keep per-task stats reliable we need to flip the event
+ * values when we flip the contexts.
+ */
+ value = atomic64_read(&next_event->count);
+ value = atomic64_xchg(&event->count, value);
+ atomic64_set(&next_event->count, value);
+
+ swap(event->total_time_enabled, next_event->total_time_enabled);
+ swap(event->total_time_running, next_event->total_time_running);
+
+ /*
+ * Since we swizzled the values, update the user visible data too.
+ */
+ perf_event_update_userpage(event);
+ perf_event_update_userpage(next_event);
+}
+
+#define list_next_entry(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
+static void perf_event_sync_stat(struct perf_event_context *ctx,
+ struct perf_event_context *next_ctx)
+{
+ struct perf_event *event, *next_event;
+
+ if (!ctx->nr_stat)
+ return;
+
+ event = list_first_entry(&ctx->event_list,
+ struct perf_event, event_entry);
+
+ next_event = list_first_entry(&next_ctx->event_list,
+ struct perf_event, event_entry);
+
+ while (&event->event_entry != &ctx->event_list &&
+ &next_event->event_entry != &next_ctx->event_list) {
+
+ __perf_event_sync_stat(event, next_event);
+
+ event = list_next_entry(event, event_entry);
+ next_event = list_next_entry(next_event, event_entry);
+ }
+}
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+ struct task_struct *next, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_event_context *ctx = task->perf_event_ctxp;
+ struct perf_event_context *next_ctx;
+ struct perf_event_context *parent;
+ struct pt_regs *regs;
+ int do_switch = 1;
+
+ regs = task_pt_regs(task);
+ perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+ if (likely(!ctx || !cpuctx->task_ctx))
+ return;
+
+ update_context_time(ctx);
+
+ rcu_read_lock();
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_ctx = next->perf_event_ctxp;
+ if (parent && next_ctx &&
+ rcu_dereference(next_ctx->parent_ctx) == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock both
+ * contexts and check that they are clones under the
+ * lock (including re-checking that neither ha