aboutsummaryrefslogtreecommitdiff
path: root/kernel/events/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- kernel/events/core.c 217
1 files changed, 199 insertions, 18 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 94afe5b91c6..a6a9ec4cd8f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP)
+/*
+ * Branch sampling privilege levels subject to a permission check
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+	(PERF_SAMPLE_BRANCH_HV |\
+	 PERF_SAMPLE_BRANCH_KERNEL)
+
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
@@ -128,8 +135,9 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct jump_label_key_deferred perf_sched_events __read_mostly;
+struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event))
ctx->nr_cgroups++;
+ if (has_branch_stack(event))
+ ctx->nr_branch_stack++;
+
list_add_rcu(&event->event_entry, &ctx->event_list);
if (!ctx->nr_events)
perf_pmu_rotate_start(ctx->pmu);
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
cpuctx->cgrp = NULL;
}
+ if (has_branch_stack(event))
+ ctx->nr_branch_stack--;
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
}
/*
+ * When sampling the branch stack in system-wide mode, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branches from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+				       struct task_struct *task)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/* no need to flush branch stack if not changing task */
+	if (prev == task)
+		return;
+
+	/* interrupts stay disabled across the whole walk and flush */
+	local_irq_save(flags);
+
+	/* the pmus list is traversed under RCU protection */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		/*
+		 * check if the context has at least one
+		 * event using PERF_SAMPLE_BRANCH_STACK
+		 */
+		if (cpuctx->ctx.nr_branch_stack > 0
+		    && pmu->flush_branch_stack) {
+			/*
+			 * Keep using the iterated pmu: it is the one whose
+			 * flush_branch_stack was checked, and overwriting the
+			 * list cursor would derail the walk over pmus.
+			 */
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+			perf_pmu_disable(pmu);
+			pmu->flush_branch_stack();
+			perf_pmu_enable(pmu);
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+/*
* Called from scheduler to add the events of the current task
* with interrupts disabled.
*
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
*/
if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
+
+ /* check for system-wide branch_stack events */
+ if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+ perf_branch_stack_sched_in(prev, task);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event)
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_dec_deferred(&perf_sched_events);
+ static_key_slow_dec_deferred(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event)
put_callchain_buffers();
if (is_cgroup_event(event)) {
atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
- jump_label_dec_deferred(&perf_sched_events);
+ static_key_slow_dec_deferred(&perf_sched_events);
+ }
+
+ if (has_branch_stack(event)) {
+ static_key_slow_dec_deferred(&perf_sched_events);
+ /* is system-wide event */
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_dec(&per_cpu(perf_branch_stack_events,
+ event->cpu));
}
}
@@ -3262,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event,
*running = ctx_time - event->tstamp_running;
}
-void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
}
@@ -3312,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event)
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
- perf_update_user_clock(userpg, now);
+ arch_perf_update_userpage(userpg, now);
barrier();
++userpg->lock;
@@ -3907,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
}
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ if (data->br_stack) {
+ size_t size;
+
+ size = data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+
+ perf_output_put(handle, data->br_stack->nr);
+ perf_output_copy(handle, data->br_stack->entries, size);
+ } else {
+ /*
+ * we always store at least the value of nr
+ */
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
+ }
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -3949,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,
WARN_ON_ONCE(size & (sizeof(u64)-1));
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ int size = sizeof(u64); /* nr */
+ if (data->br_stack) {
+ size += data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+ }
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -4991,7 +5104,7 @@ fail:
return err;
}
-struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
static void sw_perf_event_destroy(struct perf_event *event)
{
@@ -4999,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent);
- jump_label_dec(&perf_swevent_enabled[event_id]);
+ static_key_slow_dec(&perf_swevent_enabled[event_id]);
swevent_hlist_put(event);
}
@@ -5010,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
case PERF_COUNT_SW_TASK_CLOCK:
@@ -5029,7 +5148,7 @@ static int perf_swevent_init(struct perf_event *event)
if (err)
return err;
- jump_label_inc(&perf_swevent_enabled[event_id]);
+ static_key_slow_inc(&perf_swevent_enabled[event_id]);
event->destroy = sw_perf_event_destroy;
}
@@ -5120,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
err = perf_trace_init(event);
if (err)
return err;
@@ -5345,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
perf_swevent_init_hrtimer(event);
return 0;
@@ -5419,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
return -ENOENT;
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
perf_swevent_init_hrtimer(event);
return 0;
@@ -5852,7 +5989,7 @@ done:
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_inc(&perf_sched_events.key);
+ static_key_slow_inc(&perf_sched_events.key);
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
@@ -5866,6 +6003,12 @@ done:
return ERR_PTR(err);
}
}
+ if (has_branch_stack(event)) {
+ static_key_slow_inc(&perf_sched_events.key);
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_inc(&per_cpu(perf_branch_stack_events,
+ event->cpu));
+ }
}
return event;
@@ -5935,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->read_format & ~(PERF_FORMAT_MAX-1))
return -EINVAL;
+ if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ u64 mask = attr->branch_sample_type;
+
+ /* only using defined bits */
+ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+ return -EINVAL;
+
+ /* at least one branch bit must be set */
+ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+ return -EINVAL;
+
+ /* kernel level capture: check permissions */
+ if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+ && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ /* propagate priv level, when not set for branch */
+ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+ /* exclude_kernel checked on syscall entry */
+ if (!attr->exclude_kernel)
+ mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ if (!attr->exclude_user)
+ mask |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!attr->exclude_hv)
+ mask |= PERF_SAMPLE_BRANCH_HV;
+ /*
+ * adjust user setting (for HW filter setup)
+ */
+ attr->branch_sample_type = mask;
+ }
+ }
out:
return ret;
@@ -6090,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,
* - that may need work on context switch
*/
atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
- jump_label_inc(&perf_sched_events.key);
+ static_key_slow_inc(&perf_sched_events.key);
}
/*
@@ -6939,6 +7116,13 @@ void __init perf_event_init(void)
/* do not patch jump label more than once per second */
jump_label_rate_limit(&perf_sched_events, HZ);
+
+ /*
+ * Build time assertion that we keep the data_head at the intended
+ * location. IOW, validation we got the __reserved[] size right.
+ */
+ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
+ != 1024);
}
static int __init perf_event_sysfs_init(void)
@@ -6970,8 +7154,7 @@ unlock:
device_initcall(perf_event_sysfs_init);
#ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_create(
- struct cgroup_subsys *ss, struct cgroup *cont)
+static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
{
struct perf_cgroup *jc;
@@ -6988,8 +7171,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
return &jc->css;
}
-static void perf_cgroup_destroy(struct cgroup_subsys *ss,
- struct cgroup *cont)
+static void perf_cgroup_destroy(struct cgroup *cont)
{
struct perf_cgroup *jc;
jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7005,8 +7187,7 @@ static int __perf_cgroup_move(void *info)
return 0;
}
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup_taskset *tset)
+static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
struct task_struct *task;
@@ -7014,8 +7195,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
task_function_call(task, __perf_cgroup_move, task);
}
-static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task)
+static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.