diff options
author | Borislav Petkov <borislav.petkov@amd.com> | 2010-10-26 20:24:03 +0200 |
---|---|---|
committer | Borislav Petkov <borislav.petkov@amd.com> | 2011-05-03 12:59:43 +0200 |
commit | fae85b7c8bcc7de9c0a2698587e20c15beb7d5a6 (patch) | |
tree | 96ea37ea08d52b2ef89f823f6e43ba8b15cc66bb /kernel/events | |
parent | ac0a3260f37b8616da8d33488ec94b94e6ae5b31 (diff) |
perf: Start the restructuring
mv kernel/perf_event.c -> kernel/events/core.c. From there, all further
sensible splitting can happen. The idea is that due to perf_event.c
becoming pretty sizable and with the advent of the marriage with ftrace,
splitting functionality into its logical parts should help speeding up
the unification and to manage the complexity of the subsystem.
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/Makefile | 5 | ||||
-rw-r--r-- | kernel/events/core.c | 7455 |
2 files changed, 7460 insertions, 0 deletions
diff --git a/kernel/events/Makefile b/kernel/events/Makefile new file mode 100644 index 00000000000..26c00e4570e --- /dev/null +++ b/kernel/events/Makefile @@ -0,0 +1,5 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_core.o = -pg +endif + +obj-y += core.o diff --git a/kernel/events/core.c b/kernel/events/core.c new file mode 100644 index 00000000000..440bc485bbf --- /dev/null +++ b/kernel/events/core.c @@ -0,0 +1,7455 @@ +/* + * Performance events core code: + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * For licensing details see kernel-base/COPYING + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/idr.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/slab.h> +#include <linux/hash.h> +#include <linux/sysfs.h> +#include <linux/dcache.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/reboot.h> +#include <linux/vmstat.h> +#include <linux/device.h> +#include <linux/vmalloc.h> +#include <linux/hardirq.h> +#include <linux/rculist.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/anon_inodes.h> +#include <linux/kernel_stat.h> +#include <linux/perf_event.h> +#include <linux/ftrace_event.h> +#include <linux/hw_breakpoint.h> + +#include <asm/irq_regs.h> + +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. + * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +struct jump_label_key perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ + +/* + * max perf event sample rate + */ +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} + +static atomic64_t perf_event_id; + +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); + +void __weak perf_event_print_debug(void) { } + +extern __weak const char *perf_pmu_name(void) +{ + return "pmu"; +} + +static inline u64 perf_clock(void) +{ + return local_clock(); +} + +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp; + + /* + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) + */ + if (!is_cgroup_event(event)) + return; + + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ + struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. + */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* must be done before we fput() the file */ + perf_get_cgroup(event); + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } +out: + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. + */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + +void perf_pmu_disable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!(*count)++) + pmu->pmu_disable(pmu); +} + +void perf_pmu_enable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!--(*count)) + pmu->pmu_enable(pmu); +} + +static DEFINE_PER_CPU(struct list_head, rotation_list); + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_pmu_rotate_start(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct list_head *head = &__get_cpu_var(rotation_list); + + WARN_ON(!irqs_disabled()); + + if (list_empty(&cpuctx->rotation_list)) + list_add(&cpuctx->rotation_list, head); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); +retry: + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + raw_spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + raw_spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + raw_spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context * +perf_pin_task_context(struct task_struct *task, int ctxn) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, ctxn, &flags); + if (ctx) { + ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +static u64 perf_event_time(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + + return ctx ? ctx->time : 0; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. + */ + if (is_cgroup_event(event)) + run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = perf_event_time(event); + + event->total_time_running = run_end - event->tstamp_running; + +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; + + /* + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. + */ + if (event->group_leader == event) { + struct list_head *list; + + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } + + if (is_cgroup_event(event)) + ctx->nr_cgroups++; + + list_add_rcu(&event->event_entry, &ctx->event_list); + if (!ctx->nr_events) + perf_pmu_rotate_start(ctx->pmu); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + event->read_size = size; +} + +static void perf_event__header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + perf_event__read_size(event); + + if (sample_type & PERF_SAMPLE_IP) + size += sizeof(data->ip); + + if (sample_type & PERF_SAMPLE_ADDR) + size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_PERIOD) + size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + size += event->read_size; + + event->header_size = size; +} + +static void perf_event__id_header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu_entry); + + event->id_header_size = size; +} + +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader, *pos; + + /* + * We can have double attach due to group movement in perf_event_open. + */ + if (event->attach_state & PERF_ATTACH_GROUP) + return; + + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + + perf_event__header_size(group_leader); + + list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + perf_event__header_size(pos); +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) + return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } + + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_rcu(&event->event_entry); + + if (event->group_leader == event) + list_del_init(&event->group_entry); + + update_group_times(event); + + /* + * If event was in error state, then keep it + * that way, otherwise bogus counts will be + * returned on read(). The only way to get out + * of error state is by explicit re-enabling + * of the event + */ + if (event->state > PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_OFF; +} + +static void perf_group_detach(struct perf_event *event) +{ + struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + goto out; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to whatever list we are on. + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + if (list) + list_move_tail(&sibling->group_entry, list); + sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; + } + +out: + perf_event__header_size(event->group_leader); + + list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + perf_event__header_size(tmp); +} + +static inline int +event_filter_match(struct perf_event *event) +{ + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + u64 delta; + /* + * An event which could not be activated because of + * filter mismatch still needs to have its timings + * maintained, otherwise bogus information is return + * via read() for time_enabled, time_running: + */ + if (event->state == PERF_EVENT_STATE_INACTIVE + && !event_filter_match(event)) { + delta = tstamp - event->tstamp_stopped; + event->tstamp_running += delta; + event->tstamp_stopped = tstamp; + } + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + int state = group_event->state; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static int __perf_remove_from_context(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + raw_spin_lock(&ctx->lock); + event_sched_out(event, cpuctx, ctx); + list_del_event(event, ctx); + raw_spin_unlock(&ctx->lock); + + return 0; +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + lockdep_assert_held(&ctx->mutex); + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always successful. + */ + cpu_function_call(event->cpu, __perf_remove_from_context, event); + return; + } + +retry: + if (!task_function_call(task, __perf_remove_from_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. + */ + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. + */ + list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to disable a performance event + */ +static int __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return -EINVAL; + + raw_spin_lock(&ctx->lock); + + /* + * If the event is |