Diffstat (limited to 'kernel/events')
-rw-r--r--   kernel/events/callchain.c       |    3
-rw-r--r--   kernel/events/core.c            | 1515
-rw-r--r--   kernel/events/hw_breakpoint.c   |  193
-rw-r--r--   kernel/events/internal.h        |   39
-rw-r--r--   kernel/events/ring_buffer.c     |  136
-rw-r--r--   kernel/events/uprobes.c         |  378
6 files changed, 1523 insertions, 741 deletions
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c77206184b8..97b67df8fbf 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -116,6 +116,9 @@ int get_callchain_buffers(void)
err = alloc_callchain_buffers();
exit:
+ if (err)
+ atomic_dec(&nr_callchain_events);
+
mutex_unlock(&callchain_mutex);
return err;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c..6b17ac1b0c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -39,6 +39,8 @@
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/mman.h>
#include "internal.h"
@@ -119,7 +121,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
- PERF_FLAG_PID_CGROUP)
+ PERF_FLAG_PID_CGROUP |\
+ PERF_FLAG_FD_CLOEXEC)
/*
* branch priv levels that need permission checks
@@ -145,6 +148,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
+static atomic_t nr_freq_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -165,25 +169,130 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
/*
* max perf event sample rate
*/
-#define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
- DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT 25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static int perf_sample_allowed_ns __read_mostly =
+ DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
+
+void update_perf_cpu_limits(void)
+{
+ u64 tmp = perf_sample_period_ns;
+
+ tmp *= sysctl_perf_cpu_time_max_percent;
+ do_div(tmp, 100);
+ ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ update_perf_cpu_limits();
return 0;
}
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ update_perf_cpu_limits();
+
+ return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+static DEFINE_PER_CPU(u64, running_sample_length);
+
+static void perf_duration_warn(struct irq_work *w)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ local_samples_len = __get_cpu_var(running_sample_length);
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ printk_ratelimited(KERN_WARNING
+ "perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ if (allowed_ns == 0)
+ return;
+
+ /* decay the counter by 1 average sample */
+ local_samples_len = __get_cpu_var(running_sample_length);
+ local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+ local_samples_len += sample_len_ns;
+ __get_cpu_var(running_sample_length) = local_samples_len;
+
+ /*
+ * note: this will be biased artificially low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * from having to maintain a count.
+ */
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ if (avg_local_sample_len <= allowed_ns)
+ return;
+
+ if (max_samples_per_tick <= 1)
+ return;
+
+ max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+ sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+ update_perf_cpu_limits();
+
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
+}
+
static atomic64_t perf_event_id;
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
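
The new update_perf_cpu_limits()/perf_sample_event_took() pair caps how much CPU time the sampling NMI may burn: perf_sample_allowed_ns is derived from the sample period and sysctl_perf_cpu_time_max_percent, while a per-CPU decaying sum approximates the average NMI duration without maintaining a sample count. A minimal userspace sketch of the same arithmetic, using the defaults added above (illustrative only, not part of the patch):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC            1000000000ULL
#define NR_ACCUMULATED_SAMPLES  128

/* allowed_ns = sample_period_ns * cpu_time_max_percent / 100 */
static uint64_t allowed_ns(unsigned int sample_rate, unsigned int max_percent)
{
        uint64_t period_ns = NSEC_PER_SEC / sample_rate;
        return period_ns * max_percent / 100;
}

/*
 * Decaying sum: drop 1/128th of the running total, add the new sample.
 * running/128 then approximates the average sample length.
 */
static uint64_t decay(uint64_t running, uint64_t sample_len_ns)
{
        running -= running / NR_ACCUMULATED_SAMPLES;
        return running + sample_len_ns;
}

int main(void)
{
        uint64_t running = 0;
        unsigned int rate = 100000, pct = 25;   /* the patch's defaults */

        printf("allowed_ns = %llu\n", (unsigned long long)allowed_ns(rate, pct));

        /* feed 200 samples of 5000ns; the average converges towards 5000 */
        for (int i = 0; i < 200; i++)
                running = decay(running, 5000);
        printf("avg ~ %llu ns\n",
               (unsigned long long)(running / NR_ACCUMULATED_SAMPLES));
        return 0;
}

With the defaults (100000 samples/sec, 25%) the budget works out to 2500ns per sample; once the decayed average exceeds it, the code above halves max_samples_per_tick and recomputes the limit.
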
@@ -196,9 +305,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
-static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
-
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -257,8 +363,8 @@ struct perf_cgroup {
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
- return container_of(task_subsys_state(task, perf_subsys_id),
- struct perf_cgroup, css);
+ return container_of(task_css(task, perf_event_cgrp_id),
+ struct perf_cgroup, css);
}
static inline bool
@@ -285,11 +391,6 @@ perf_cgroup_match(struct perf_event *event)
event->cgrp->css.cgroup);
}
-static inline bool perf_tryget_cgroup(struct perf_event *event)
-{
- return css_tryget(&event->cgrp->css);
-}
-
static inline void perf_put_cgroup(struct perf_event *event)
{
css_put(&event->cgrp->css);
@@ -508,7 +609,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
if (!f.file)
return -EBADF;
- css = cgroup_css_from_dir(f.file, perf_subsys_id);
+ css = css_tryget_online_from_dir(f.file->f_dentry,
+ &perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
@@ -517,13 +619,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
- /* must be done before we fput() the file */
- if (!perf_tryget_cgroup(event)) {
- event->cgrp = NULL;
- ret = -ENOENT;
- goto out;
- }
-
/*
* all events in a group must monitor
* the same cgroup because a task belongs
@@ -658,6 +753,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
#endif
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disabled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+ struct perf_cpu_context *cpuctx;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+ int rotations = 0;
+
+ WARN_ON(!irqs_disabled());
+
+ cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+ rotations = perf_rotate_context(cpuctx);
+
+ /*
+ * arm timer if needed
+ */
+ if (rotations) {
+ hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ ret = HRTIMER_RESTART;
+ }
+
+ return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (WARN_ON(cpu != smp_processor_id()))
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ continue;
+
+ hrtimer_cancel(&cpuctx->hrtimer);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+ int timer;
+
+ /* no multiplexing needed for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ /*
+ * check default is sane, if not set then force to
+ * default interval (1/tick)
+ */
+ timer = pmu->hrtimer_interval_ms;
+ if (timer < 1)
+ timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+
+ /* not for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ if (hrtimer_active(hr))
+ return;
+
+ if (!hrtimer_callback_running(hr))
+ __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+ 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
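
The added hrtimer replaces the old jiffies-based rotation: each PMU multiplexes its events on its own interval, defaulting to one tick (PERF_CPU_HRTIMER = 1000/HZ ms) unless the driver supplies hrtimer_interval_ms. A small sketch of that interval selection (plain C; the HZ values are only examples):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

/* Mirrors the interval selection in __perf_cpu_hrtimer_init(). */
static uint64_t mux_interval_ns(int hrtimer_interval_ms, int hz)
{
        int timer = hrtimer_interval_ms;

        if (timer < 1)                  /* unset: fall back to one tick */
                timer = 1000 / hz;      /* PERF_CPU_HRTIMER */

        return NSEC_PER_MSEC * (uint64_t)timer;
}

int main(void)
{
        /* e.g. HZ=250: the default multiplexing interval is 4ms */
        printf("%llu ns\n", (unsigned long long)mux_interval_ns(0, 250));
        /* a PMU that set hrtimer_interval_ms = 10 */
        printf("%llu ns\n", (unsigned long long)mux_interval_ns(10, 250));
        return 0;
}
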
@@ -686,12 +881,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
WARN_ON(!irqs_disabled());
- if (list_empty(&cpuctx->rotation_list)) {
- int was_empty = list_empty(head);
+ if (list_empty(&cpuctx->rotation_list))
list_add(&cpuctx->rotation_list, head);
- if (was_empty)
- tick_nohz_full_kick();
- }
}
static void get_ctx(struct perf_event_context *ctx)
@@ -716,6 +907,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
ctx->parent_ctx = NULL;
}
+ ctx->generation++;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -764,8 +956,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
struct perf_event_context *ctx;
- rcu_read_lock();
retry:
+ /*
+ * One of the few rules of preemptible RCU is that one cannot do
+ * rcu_read_unlock() while holding a scheduler (or nested) lock when
+ * part of the read side critical section was preemptible -- see
+ * rcu_read_unlock_special().
+ *
+ * Since ctx->lock nests under rq->lock we must ensure the entire read
+ * side critical section is non-preemptible.
+ */
+ preempt_disable();
+ rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
if (ctx) {
/*
@@ -781,6 +983,8 @@ retry:
raw_spin_lock_irqsave(&ctx->lock, *flags);
if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ rcu_read_unlock();
+ preempt_enable();
goto retry;
}
@@ -790,6 +994,7 @@ retry:
}
}
rcu_read_unlock();
+ preempt_enable();
return ctx;
}
@@ -940,6 +1145,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+
+ ctx->generation++;
}
/*
@@ -1005,6 +1212,9 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_DATA_SRC)
size += sizeof(data->data_src.val);
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ size += sizeof(data->txn);
+
event->header_size = size;
}
@@ -1020,6 +1230,9 @@ static void perf_event__id_header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_TIME)
size += sizeof(data->time);
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ size += sizeof(data->id);
+
if (sample_type & PERF_SAMPLE_ID)
size += sizeof(data->id);
@@ -1111,6 +1324,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
+
+ ctx->generation++;
}
static void perf_group_detach(struct perf_event *event)
@@ -1189,6 +1404,8 @@ event_sched_out(struct perf_event *event,
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
+ perf_pmu_disable(event->pmu);
+
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
@@ -1205,6 +1422,8 @@ event_sched_out(struct perf_event *event,
ctx->nr_freq--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
+
+ perf_pmu_enable(event->pmu);
}
static void
@@ -1227,6 +1446,11 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
+struct remove_event {
+ struct perf_event *event;
+ bool detach_group;
+};
+
/*
* Cross CPU call to remove a performance event
*
@@ -1235,12 +1459,15 @@ group_sched_out(struct perf_event *group_event,
*/
static int __perf_remove_from_context(void *info)
{
- struct perf_event *event = info;
+ struct remove_event *re = info;
+ struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
+ if (re->detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
ctx->is_active = 0;
@@ -1265,10 +1492,14 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ struct remove_event re = {
+ .event = event,
+ .detach_group = detach_group,
+ };
lockdep_assert_held(&ctx->mutex);
@@ -1277,12 +1508,12 @@ static void perf_remove_from_context(struct perf_event *event)
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, event);
+ cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
}
retry:
- if (!task_function_call(task, __perf_remove_from_context, event))
+ if (!task_function_call(task, __perf_remove_from_context, &re))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -1299,6 +1530,8 @@ retry:
* Since the task isn't running, its safe to remove the event, us
* holding the ctx->lock ensures the task won't get scheduled in.
*/
+ if (detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
@@ -1445,6 +1678,9 @@ event_sched_in(struct perf_event *event,
struct perf_event_context *ctx)
{
u64 tstamp = perf_event_time(event);
+ int ret = 0;
+
+ lockdep_assert_held(&ctx->lock);
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -1467,10 +1703,13 @@ event_sched_in(struct perf_event *event,
*/
smp_wmb();
+ perf_pmu_disable(event->pmu);
+
if (event->pmu->add(event, PERF_EF_START)) {
event->state = PERF_EVENT_STATE_INACTIVE;
event->oncpu = -1;
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1486,7 +1725,10 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;
- return 0;
+out:
+ perf_pmu_enable(event->pmu);
+
+ return ret;
}
static int
@@ -1495,7 +1737,7 @@ group_sched_in(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = group_event->pmu;
+ struct pmu *pmu = ctx->pmu;
u64 now = ctx->time;
bool simulate = false;
@@ -1506,6 +1748,7 @@ group_sched_in(struct perf_event *group_event,
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
return -EAGAIN;
}
@@ -1552,6 +1795,8 @@ group_error:
pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
+
return -EAGAIN;
}
@@ -1764,7 +2009,16 @@ static int __perf_event_enable(void *info)
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
- if (WARN_ON_ONCE(!ctx->is_active))
+ /*
+ * There's a time window between 'ctx->is_active' check
+ * in perf_event_enable function and this place having:
+ * - IRQs on
+ * - ctx->lock unlocked
+ *
+ * where the task could be killed and 'ctx' deactivated
+ * by perf_event_exit_task.
+ */
+ if (!ctx->is_active)
return -EINVAL;
raw_spin_lock(&ctx->lock);
@@ -1807,8 +2061,10 @@ static int __perf_event_enable(void *info)
* If this event can't go on and it's part of a
* group, then the whole group has to come off.
*/
- if (leader != event)
+ if (leader != event) {
group_sched_out(leader, cpuctx, ctx);
+ perf_cpu_hrtimer_restart(cpuctx);
+ }
if (leader->attr.pinned) {
update_group_times(leader);
leader->state = PERF_EVENT_STATE_ERROR;
@@ -1933,22 +2189,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
}
/*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
*/
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
- return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
- && ctx1->parent_gen == ctx2->parent_gen
- && !ctx1->pin_count && !ctx2->pin_count;
+ /* Pinning disables the swap optimization */
+ if (ctx1->pin_count || ctx2->pin_count)
+ return 0;
+
+ /* If ctx1 is the parent of ctx2 */
+ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+ return 1;
+
+ /* If ctx2 is the parent of ctx1 */
+ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+ return 1;
+
+ /*
+ * If ctx1 and ctx2 have the same parent; we flatten the parent
+ * hierarchy, see perf_event_init_context().
+ */
+ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+ ctx1->parent_gen == ctx2->parent_gen)
+ return 1;
+
+ /* Unmatched */
+ return 0;
}
static void __perf_event_sync_stat(struct perf_event *event,
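
The rewritten context_equiv() replaces the old "same parent generation and same number of events" test with explicit generation counters: every list_add_event(), list_del_event() and unclone_ctx() bumps ctx->generation, so any modification invalidates the lazy context-switch optimization. A self-contained sketch of the comparison on a mock context (field names mirror the kernel's; the struct is not the real perf_event_context):

#include <stdio.h>

struct ctx {
        struct ctx *parent_ctx;
        unsigned long long generation;  /* bumped on every modification */
        unsigned long long parent_gen;  /* parent's generation at clone time */
        int pin_count;
};

static int context_equiv(struct ctx *ctx1, struct ctx *ctx2)
{
        if (ctx1->pin_count || ctx2->pin_count)
                return 0;
        if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
                return 1;
        if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
                return 1;
        if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
            ctx1->parent_gen == ctx2->parent_gen)
                return 1;
        return 0;
}

int main(void)
{
        struct ctx parent = { .generation = 7 };
        struct ctx child  = { .parent_ctx = &parent, .parent_gen = 7 };

        printf("%d\n", context_equiv(&parent, &child)); /* 1: clone */
        parent.generation++;                            /* parent modified */
        printf("%d\n", context_equiv(&parent, &child)); /* 0: no longer equivalent */
        return 0;
}
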
@@ -1997,9 +2269,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
perf_event_update_userpage(next_event);
}
-#define list_next_entry(pos, member) \
- list_entry(pos->member.next, typeof(*pos), member)
-
static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
@@ -2031,7 +2300,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
- struct perf_event_context *parent;
+ struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
@@ -2043,10 +2312,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
return;
rcu_read_lock();
- parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_event_ctxp[ctxn];
- if (parent && next_ctx &&
- rcu_dereference(next_ctx->parent_ctx) == parent) {
+ if (!next_ctx)
+ goto unlock;
+
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+ /* If neither context have a parent context; they cannot be clones. */
+ if (!parent || !next_parent)
+ goto unlock;
+
+ if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
/*
* Looks like the two contexts are clones, so we might be
* able to optimize the context switch. We lock both
@@ -2074,6 +2351,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);
}
+unlock:
rcu_read_unlock();
if (do_switch) {
@@ -2308,8 +2586,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
if (cpuctx->ctx.nr_branch_stack > 0
&& pmu->flush_branch_stack) {
- pmu = cpuctx->ctx.pmu;
-
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
@@ -2500,16 +2776,18 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
+ perf_pmu_disable(event->pmu);
+
hwc = &event->hw;
- if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
+ if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0;
perf_log_throttle(event, 1);
event->pmu->start(event, 0);
}
if (!event->attr.freq || !event->attr.sample_freq)
- continue;
+ goto next;
/*
* stop the event and update event->count
@@ -2531,6 +2809,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
+ next:
+ perf_pmu_enable(event->pmu);
}
perf_pmu_enable(ctx->pmu);
@@ -2555,7 +2835,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
* because they're strictly cpu affine and rotate_start is called with IRQs
* disabled, while rotate_context is called from IRQ context.
*/
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
int rotate = 0, remove = 1;
@@ -2594,15 +2874,18 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
done:
if (remove)
list_del_init(&cpuctx->rotation_list);
+
+ return rotate;
}
#ifdef CONFIG_NO_HZ_FULL
bool perf_event_can_stop_tick(void)
{
- if (list_empty(&__get_cpu_var(rotation_list)))
- return true;
- else
+ if (atomic_read(&nr_freq_events) ||
+ __this_cpu_read(perf_throttled_count))
return false;
+ else
+ return true;
}
#endif
@@ -2625,10 +2908,6 @@ void perf_event_task_tick(void)
ctx = cpuctx->task_ctx;
if (ctx)
perf_adjust_freq_unthr_context(ctx, throttled);
-
- if (cpuctx->jiffies_interval == 1 ||
- !(jiffies % cpuctx->jiffies_interval))
- perf_rotate_context(cpuctx);
}
}
@@ -2696,6 +2975,22 @@ out:
local_irq_restore(flags);
}
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctx);
+ }
+ rcu_read_unlock();
+}
+
/*
* Cross CPU call to read the hardware event
*/
@@ -2918,43 +3213,51 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
-static void free_event(struct perf_event *event)
+static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
- irq_work_sync(&event->pending);
-
- if (!event->parent) {
- if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_dec_deferred(&perf_sched_events);
- if (event->attr.mmap || event->attr.mmap_data)
- atomic_dec(&nr_mmap_events);
- if (event->attr.comm)
- atomic_dec(&nr_comm_events);
- if (event->attr.task)
- atomic_dec(&nr_task_events);
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- if (is_cgroup_event(event)) {
- atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
- static_key_slow_dec_deferred(&perf_sched_events);
- }
+ if (event->parent)
+ return;
- if (has_branch_stack(event)) {
- static_key_slow_dec_deferred(&perf_sched_events);
- /* is system-wide event */
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_dec(&per_cpu(perf_branch_stack_events,
- event->cpu));
- }
+ if (has_branch_stack(event)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
}
+ if (is_cgroup_event(event))
+ atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+}
- if (event->rb) {
- ring_buffer_put(event->rb);
- event->rb = NULL;
- }
+static void unaccount_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_dec(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_dec(&nr_comm_events);
+ if (event->attr.task)
+ atomic_dec(&nr_task_events);
+ if (event->attr.freq)
+ atomic_dec(&nr_freq_events);
if (is_cgroup_event(event))
- perf_detach_cgroup(event);
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (has_branch_stack(event))
+ static_key_slow_dec_deferred(&perf_sched_events);
+
+ unaccount_event_cpu(event, event->cpu);
+}
+
+static void __free_event(struct perf_event *event)
+{
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
if (event->destroy)
event->destroy(event);
@@ -2962,44 +3265,58 @@ static void free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
+ if (event->pmu)
+ module_put(event->pmu->module);
+
call_rcu(&event->rcu_head, free_event_rcu);
}
-int perf_event_release_kernel(struct perf_event *event)
+static void _free_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
+ irq_work_sync(&event->pending);
- WARN_ON_ONCE(ctx->parent_ctx);
- /*
- * There are two ways this annotation is useful:
- *
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
- *
- * 2) there is a lock-inversion with mmap_sem through
- * perf_event_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
- */
- mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- raw_spin_unlock_irq(&ctx->lock);
- perf_remove_from_context(event);
- mutex_unlock(&ctx->mutex);
+ unaccount_event(event);
- free_event(event);
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
- return 0;
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
+ __free_event(event);
+}
+
+/*
+ * Used to free events which have a known refcount of 1, such as in error paths
+ * where the event isn't exposed yet and inherited events.
+ */
+static void free_event(struct perf_event *event)
+{
+ if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
+ /* leak to avoid use-after-free */
+ return;
+ }
+
+ _free_event(event);
}
-EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
* Called when the last reference to the file is gone.
*/
static void put_event(struct perf_event *event)
{
+ struct perf_event_context *ctx = event->ctx;
struct task_struct *owner;
if (!atomic_long_dec_and_test(&event->refcount))
@@ -3038,9 +3355,33 @@ static void put_event(struct perf_event *event)
put_task_struct(owner);
}
- perf_event_release_kernel(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * There are two ways this annotation is useful:
+ *
+ * 1) there is a lock recursion from perf_event_exit_task
+ * see the comment there.
+ *
+ * 2) there is a lock-inversion with mmap_sem through
+ * perf_event_read_group(), which takes faults while
+ * holding ctx->mutex, however this is called after
+ * the last filedesc died, so there is no possibility
+ * to trigger the AB-BA case.
+ */
+ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ perf_remove_from_context(event, true);
+ mutex_unlock(&ctx->mutex);
+
+ _free_event(event);
}
+int perf_event_release_kernel(struct perf_event *event)
+{
+ put_event(event);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
static int perf_release(struct inode *inode, struct file *file)
{
put_event(file->private_data);
@@ -3188,30 +3529,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
unsigned int events = POLL_HUP;
/*
- * Race between perf_event_set_output() and perf_poll(): perf_poll()
- * grabs the rb reference but perf_event_set_output() overrides it.
- * Here is the timeline for two threads T1, T2:
- * t0: T1, rb = rcu_dereference(event->rb)
- * t1: T2, old_rb = event->rb
- * t2: T2, event->rb = new rb
- * t3: T2, ring_buffer_detach(old_rb)
- * t4: T1, ring_buffer_attach(rb1)
- * t5: T1, poll_wait(event->waitq)
- *
- * To avoid this problem, we grab mmap_mutex in perf_poll()
- * thereby ensuring that the assignment of the new ring buffer
- * and the detachment of the old buffer appear atomic to perf_poll()
+ * Pin the event->rb by taking event->mmap_mutex; otherwise
+ * perf_event_set_output() can swizzle our rb and make us miss wakeups.
*/
mutex_lock(&event->mmap_mutex);
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (rb) {
- ring_buffer_attach(event, rb);
+ rb = event->rb;
+ if (rb)
events = atomic_xchg(&rb->poll, 0);
- }
- rcu_read_unlock();
-
mutex_unlock(&event->mmap_mutex);
poll_wait(file, &event->waitq, wait);
@@ -3264,7 +3588,7 @@ static void perf_event_for_each(struct perf_event *event,
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
struct perf_event_context *ctx = event->ctx;
- int ret = 0;
+ int ret = 0, active;
u64 value;
if (!is_sampling_event(event))
@@ -3288,6 +3612,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
event->attr.sample_period = value;
event->hw.sample_period = value;
}
+
+ active = (event->state == PERF_EVENT_STATE_ACTIVE);
+ if (active) {
+ perf_pmu_disable(ctx->pmu);
+ event->pmu->stop(event, PERF_EF_UPDATE);
+ }
+
+ local64_set(&event->hw.period_left, 0);
+
+ if (active) {
+ event->pmu->start(event, PERF_EF_RELOAD);
+ perf_pmu_enable(ctx->pmu);
+ }
+
unlock:
raw_spin_unlock_irq(&ctx->lock);
@@ -3337,6 +3675,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_PERIOD:
return perf_event_period(event, (u64 __user *)arg);
+ case PERF_EVENT_IOC_ID:
+ {
+ u64 id = primary_event_id(event);
+
+ if (copy_to_user((void __user *)arg, &id, sizeof(id)))
+ return -EFAULT;
+ return 0;
+ }
+
case PERF_EVENT_IOC_SET_OUTPUT:
{
int ret;
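
From userspace, PERF_EVENT_IOC_PERIOD now takes effect immediately on a running event (stop, clear period_left, restart) and the new PERF_EVENT_IOC_ID ioctl reports the primary event id so records can be matched back to file descriptors. A hedged usage sketch, assuming fd came from perf_event_open(2) on a kernel with this series:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Reprogram the sample period and fetch the event id of an open perf fd. */
static int retune_and_identify(int fd, uint64_t new_period)
{
        uint64_t id;

        if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
                return -1;      /* applies without re-opening the event */

        if (ioctl(fd, PERF_EVENT_IOC_ID, &id))
                return -1;

        printf("event id: %llu\n", (unsigned long long)id);
        return 0;
}
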
@@ -3418,6 +3765,26 @@ static void calc_timer_values(struct perf_event *event,
*running = ctx_time - event->tstamp_running;
}
+static void perf_event_init_userpage(struct perf_event *event)
+{
+ struct perf_event_mmap_page *userpg;
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ userpg = rb->user_page;
+
+ /* Allow new userspace to detect that bit 0 is deprecated */
+ userpg->cap_bit0_is_deprecated = 1;
+ userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+
+unlock:
+ rcu_read_unlock();
+}
+
void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
}
@@ -3434,6 +3801,10 @@ void perf_event_update_userpage(struct perf_event *event)
u64 enabled, running, now;
rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
/*
* compute total_time_enabled, total_time_running
* based on snapshot values taken when the event
@@ -3444,12 +3815,8 @@ void perf_event_update_userpage(struct perf_event *event)
* NMI context
*/
calc_timer_values(event, &now, &enabled, &running);
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
userpg = rb->user_page;
-
/*
* Disable preemption so as to not let the corresponding user-space
* spin too long if we get preempted.
@@ -3515,32 +3882,47 @@ unlock:
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb)
{
+ struct ring_buffer *old_rb = NULL;
unsigned long flags;
- if (!list_empty(&event->rb_entry))
- return;
+ if (event->rb) {
+ /*
+ * Should be impossible, we set this when removing
+ * event->rb_entry and wait/clear when adding event->rb_entry.
+ */
+ WARN_ON_ONCE(event->rcu_pending);
- spin_lock_irqsave(&rb->event_lock, flags);
- if (!list_empty(&event->rb_entry))
- goto unlock;
+ old_rb = event->rb;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
- list_add(&event->rb_entry, &rb->event_list);
-unlock:
- spin_unlock_irqrestore(&rb->event_lock, flags);
-}
+ spin_lock_irqsave(&old_rb->event_lock, flags);
+ list_del_rcu(&event->rb_entry);
+ spin_unlock_irqrestore(&old_rb->event_lock, flags);
+ }
-static void ring_buffer_detach(struct perf_event *event,
- struct ring_buffer *rb)
-{
- unsigned long flags;
+ if (event->rcu_pending && rb) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
- if (list_empty(&event->rb_entry))
- return;
+ if (rb) {
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_add_rcu(&event->rb_entry, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+ }
- spin_lock_irqsave(&rb->event_lock, flags);
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ rcu_assign_pointer(event->rb, rb);
+
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
}
static void ring_buffer_wakeup(struct perf_event *event)
@@ -3549,13 +3931,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
- wake_up_all(&event->waitq);
-
-unlock:
+ if (rb) {
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+ }
rcu_read_unlock();
}
@@ -3584,18 +3963,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
static void ring_buffer_put(struct ring_buffer *rb)
{
- struct perf_event *event, *n;
- unsigned long flags;
-
if (!atomic_dec_and_test(&rb->refcount))
return;
- spin_lock_irqsave(&rb->event_lock, flags);
- list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- }
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ WARN_ON_ONCE(!list_empty(&rb->event_list));
call_rcu(&rb->rcu_head, rb_free_rcu);
}
@@ -3605,26 +3976,95 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
atomic_inc(&event->mmap_count);
+ atomic_inc(&event->rb->mmap_count);
}
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->rb);
- struct user_struct *user = event->mmap_user;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = ring_buffer_get(event);
+ struct user_struct *mmap_user = rb->mmap_user;
+ int mmap_locked = rb->mmap_locked;
+ unsigned long size = perf_data_size(rb);
+
+ atomic_dec(&rb->mmap_count);
+
+ if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ goto out_put;
+
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+
+ /* If there's still other mmap()s of this buffer, we're done. */
+ if (atomic_read(&rb->mmap_count))
+ goto out_put;
+
+ /*
+ * No other mmap()s, detach from all other events that might redirect
+ * into the now unreachable buffer. Somewhat complicated by the
+ * fact that rb::event_lock otherwise nests inside mmap_mutex.
+ */
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ /*
+ * This event is en-route to free_event() which will
+ * detach it and remove it from the list.
+ */
+ continue;
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&event->mmap_mutex);
+ /*
+ * Check we didn't race with perf_event_set_output() which can
+ * swizzle the rb from under us while we were waiting to
+ * acquire mmap_mutex.
+ *
+ * If we find a different rb; ignore this event, a next
+ * iteration will no longer find it on the list. We have to
+ * still restart the iteration to make sure we're not now
+ * iterating the wrong list.
+ */
+ if (event->rb == rb)
+ ring_buffer_attach(event, NULL);
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->pinned_vm -= event->mmap_locked;
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
mutex_unlock(&event->mmap_mutex);
+ put_event(event);
- ring_buffer_put(rb);
- free_uid(user);
+ /*
+ * Restart the iteration; either we're on the wrong list or
+ * destroyed its integrity by doing a deletion.
+ */
+ goto again;
}
+ rcu_read_unlock();
+
+ /*
+ * It could be there's still a few 0-ref events on the list; they'll
+ * get cleaned up by free_event() -- they'll also still have their
+ * ref on the rb and will free it whenever they are done with it.
+ *
+ * Aside from that, this buffer is 'fully' detached and unmapped,
+ * undo the VM accounting.
+ */
+
+ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= mmap_locked;
+ free_uid(mmap_user);
+
+out_put:
+ ring_buffer_put(rb); /* could be last */
}
static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3674,12 +4114,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages == nr_pages)
- atomic_inc(&event->rb->refcount);
- else
+ if (event->rb->nr_pages != nr_pages) {
ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Raced against perf_mmap_close() through
+ * perf_event_set_output(). Try again, hope for better
+ * luck.
+ */
+ mutex_unlock(&event->mmap_mutex);
+ goto again;
+ }
+
goto unlock;
}
@@ -3720,13 +4172,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = -ENOMEM;
goto unlock;
}
- rcu_assign_pointer(event->rb, rb);
+
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_locked = extra;
+ rb->mmap_user = get_current_user();
atomic_long_add(user_extra, &user->locked_vm);
- event->mmap_locked = extra;
- event->mmap_user = get_current_user();
- vma->vm_mm->pinned_vm += event->mmap_locked;
+ vma->vm_mm->pinned_vm += extra;
+
+ ring_buffer_attach(event, rb);
+ perf_event_init_userpage(event);
perf_event_update_userpage(event);
unlock:
@@ -3734,7 +4190,11 @@ unlock:
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
- vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
return ret;
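
perf_mmap() now keeps the locked-page accounting on the ring buffer itself (mmap_count/mmap_locked/mmap_user) and sets VM_DONTCOPY so fork() cannot duplicate the per-mm pinned_vm accounting. Combined with the new PERF_FLAG_FD_CLOEXEC open flag, a typical consumer maps the buffer roughly as in the sketch below (error handling trimmed; both flags assume a kernel with this series):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
        struct perf_event_attr attr;
        size_t pages = 1 + 8;           /* user page + 2^n data pages */
        void *base;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CPU_CLOCK;
        attr.sample_period = 100000;
        attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_IDENTIFIER;

        fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
                     -1 /* no group */, PERF_FLAG_FD_CLOEXEC);
        if (fd < 0)
                return 1;

        /* remapping with a different size is rejected (-EINVAL above) */
        base = mmap(NULL, pages * sysconf(_SC_PAGESIZE),
                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (base == MAP_FAILED)
                return 1;

        struct perf_event_mmap_page *meta = base;
        printf("data_head at %llu\n", (unsigned long long)meta->data_head);

        munmap(base, pages * sysconf(_SC_PAGESIZE));
        close(fd);
        return 0;
}
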
@@ -3965,7 +4425,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_TIME)
data->time = perf_clock();
- if (sample_type & PERF_SAMPLE_ID)
+ if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
if (sample_type & PERF_SAMPLE_STREAM_ID)
@@ -4004,6 +4464,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CPU)
perf_output_put(handle, data->cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
}
void perf_event__output_id_sample(struct perf_event *event,
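
PERF_SAMPLE_IDENTIFIER places the event id at a fixed offset: last in the sample_id trailer emitted by __perf_event__output_id_sample() above, and first in the body of a PERF_RECORD_SAMPLE (the perf_output_sample() hunk below). A reader can therefore pull the id out of any record before parsing the rest, assuming every event feeding the buffer set PERF_SAMPLE_IDENTIFIER and sample_id_all for non-sample records. A hedged reader-side sketch:

#include <stddef.h>
#include <stdint.h>
#include <linux/perf_event.h>

/*
 * Recover the event id from a record when every event in the ring buffer
 * has PERF_SAMPLE_IDENTIFIER set (and sample_id_all for non-sample records).
 */
static uint64_t record_id(const struct perf_event_header *hdr)
{
        const uint64_t *body = (const uint64_t *)(hdr + 1);
        size_t n = (hdr->size - sizeof(*hdr)) / sizeof(uint64_t);

        if (hdr->type == PERF_RECORD_SAMPLE)
                return body[0];         /* id is the first word of the sample */

        return body[n - 1];             /* id is the last word of the trailer */
}
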
@@ -4069,7 +4532,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
n = 0;
- if (sub != event)
+ if ((sub != event) &&
+ (sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
values[n++] = perf_event_count(sub);
@@ -4116,6 +4580,9 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, *header);
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
+
if (sample_type & PERF_SAMPLE_IP)
perf_output_put(handle, data->ip);
@@ -4176,20 +4643,6 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
- if (!event->attr.watermark) {
- int wakeup_events = event->attr.wakeup_events;
-
- if (wakeup_events) {
- struct ring_buffer *rb = handle->rb;
- int events = local_inc_return(&rb->events);
-
- if (events >= wakeup_events) {
- local_sub(wakeup_events, &rb->events);
- local_inc(&rb->wakeup);
- }
- }
- }
-
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
if (data->br_stack) {
size_t size;
@@ -4225,16 +4678,34 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
- if (sample_type & PERF_SAMPLE_STACK_USER)
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
perf_output_sample_ustack(handle,
data->stack_user_size,
data->regs_user.regs);
+ }
if (sample_type & PERF_SAMPLE_WEIGHT)
perf_output_put(handle, data->weight);
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ perf_output_put(handle, data->txn);
+
+ if (!event->attr.watermark) {
+ int wakeup_events = event->attr.wakeup_events;
+
+ if (wakeup_events) {
+ struct ring_buffer *rb = handle->rb;
+ int events = local_inc_return(&rb->events);
+
+ if (events >= wakeup_events) {
+ local_sub(wakeup_events, &rb->events);
+ local_inc(&rb->wakeup);
+ }
+ }
+ }
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -4394,12 +4865,10 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
-typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
static void
perf_event_aux_ctx(struct perf_event_context *ctx,
- perf_event_aux_match_cb match,
perf_event_aux_output_cb output,
void *data)
{
@@ -4410,15 +4879,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
continue;
if (!event_filter_match(event))
continue;
- if (match(event, data))
- output(event, data);
+ output(event, data);
}
}
static void
-perf_event_aux(perf_event_aux_match_cb match,
- perf_event_aux_output_cb output,
- void *data,
+perf_event_aux(perf_event_aux_output_cb output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_cpu_context *cpuctx;
@@ -4431,7 +4897,7 @@ perf_event_aux(perf_event_aux_match_cb match,
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
if (cpuctx->unique_pmu != pmu)
goto next;
- perf_event_aux_ctx(&cpuctx->ctx, match, output, data);
+ perf_event_aux_ctx(&cpuctx->ctx, output, data);
if (task_ctx)
goto next;
ctxn = pmu->task_ctx_nr;
@@ -4439,14 +4905,14 @@ perf_event_aux(perf_event_aux_match_cb match,
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
- perf_event_aux_ctx(ctx, match, output, data);
+ perf_event_aux_ctx(ctx, output, data);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
if (task_ctx) {
preempt_disable();
- perf_event_aux_ctx(task_ctx, match, output, data);
+ perf_event_aux_ctx(task_ctx, output, data);
preempt_enable();
}
rcu_read_unlock();
@@ -4455,7 +4921,7 @@ next:
/*
* task tracking -- fork/exit
*
- * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
+ * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
*/
struct perf_task_event {
@@ -4473,6 +4939,13 @@ struct perf_task_event {
} event_id;
};
+static int perf_event_task_match(struct perf_event *event)
+{
+ return event->attr.comm || event->attr.mmap ||
+ event->attr.mmap2 || event->attr.mmap_data ||
+ event->attr.task;
+}
+
static void perf_event_task_output(struct perf_event *event,
void *data)
{
@@ -4482,6 +4955,9 @@ static void perf_event_task_output(struct perf_event *event,
struct task_struct *task = task_event->task;
int ret, size = task_event->event_id.header.size;
+ if (!perf_event_task_match(event))
+ return;
+
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
@@ -4504,13 +4980,6 @@ out:
task_event->event_id.header.size = size;
}
-static int perf_event_task_match(struct perf_event *event,
- void *data __maybe_unused)
-{
- return event->attr.comm || event->attr.mmap ||
- event->attr.mmap_data || event->attr.task;
-}
-
static void perf_event_task(struct task_struct *task,
struct perf_event_context *task_ctx,
int new)
@@ -4539,8 +5008,7 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_aux(perf_event_task_match,
- perf_event_task_output,
+ perf_event_aux(perf_event_task_output,
&task_event,
task_ctx);
}
@@ -4567,6 +5035,11 @@ struct perf_comm_event {
} event_id;
};
+static int perf_event_comm_match(struct perf_event *event)
+{
+ return event->attr.comm;
+}
+
static void perf_event_comm_output(struct perf_event *event,
void *data)
{
@@ -4576,6 +5049,9 @@ static void perf_event_comm_output(struct perf_event *event,
int size = comm_event->event_id.header.size;
int ret;
+ if (!perf_event_comm_match(event))
+ return;
+
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
comm_event->event_id.header.size);
@@ -4597,12 +5073,6 @@ out:
comm_event->event_id.header.size = size;
}
-static int perf_event_comm_match(struct perf_event *event,
- void *data __maybe_unused)
-{
- return event->attr.comm;
-}
-
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
char comm[TASK_COMM_LEN];
@@ -4617,27 +5087,14 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- perf_event_aux(perf_event_comm_match,
- perf_event_comm_output,
+ perf_event_aux(perf_event_comm_output,
comm_event,
NULL);
}
-void perf_event_comm(struct task_struct *task)
+void perf_event_comm(struct task_struct *task, bool exec)
{
struct perf_comm_event comm_event;
- struct perf_event_context *ctx;
- int ctxn;
-
- rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
-
- perf_event_enable_on_exec(ctx);
- }
- rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -4649,7 +5106,7 @@ void perf_event_comm(struct task_struct *task)
.event_id = {
.header = {
.type = PERF_RECORD_COMM,
- .misc = 0,
+ .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
/* .size */
},
/* .pid */
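
PERF_RECORD_COMM records generated by exec() are now tagged with PERF_RECORD_MISC_COMM_EXEC in the header misc field (the enable-on-exec work moved into the new perf_event_exec() above), which lets a consumer tell an exec apart from a prctl(PR_SET_NAME) rename. A small reader-side check, assuming a kernel that defines the flag:

#include <stdbool.h>
#include <linux/perf_event.h>

/*
 * True when a PERF_RECORD_COMM was generated by exec() rather than
 * prctl(PR_SET_NAME); requires a kernel with PERF_RECORD_MISC_COMM_EXEC.
 */
static bool comm_is_exec(const struct perf_event_header *hdr)
{
        return hdr->type == PERF_RECORD_COMM &&
               (hdr->misc & PERF_RECORD_MISC_COMM_EXEC);
}
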
@@ -4669,6 +5126,10 @@ struct perf_mmap_event {
const char *file_name;
int file_size;
+ int maj, min;
+ u64 ino;
+ u64 ino_generation;
+ u32 prot, flags;
struct {
struct perf_event_header header;
@@ -4681,6 +5142,17 @@ struct perf_mmap_event {
} event_id;
};
+static int perf_event_mmap_match(struct perf_event *event,
+ void *data)
+{
+ struct perf_mmap_event *mmap_event = data;
+ struct vm_area_struct *vma = mmap_event->vma;
+ int executable = vma->vm_flags & VM_EXEC;
+
+ return (!executable && event->attr.mmap_data) ||
+ (executable && (event->attr.mmap || event->attr.mmap2));
+}
+
static void perf_event_mmap_output(struct perf_event *event,
void *data)
{
@@ -4690,6 +5162,19 @@ static void perf_event_mmap_output(struct perf_event *event,
int size = mmap_event->event_id.header.size;
int ret;
+ if (!perf_event_mmap_match(event, data))
+ return;
+
+ if (event->attr.mmap2) {
+ mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
+ mmap_event->event_id.header.size += sizeof(mmap_event->maj);
+ mmap_event->event_id.header.size += sizeof(mmap_event->min);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+ mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+ mmap_event->event_id.header.size += sizeof(mmap_event->flags);
+ }
+
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
mmap_event->event_id.header.size);
@@ -4700,6 +5185,16 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.tid = perf_event_tid(event, current);
perf_output_put(&handle, mmap_event->event_id);
+
+ if (event->attr.mmap2) {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ perf_output_put(&handle, mmap_event->prot);
+ perf_output_put(&handle, mmap_event->flags);
+ }
+
__output_copy(&handle, mmap_event->file_name,
mmap_event->file_size);
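
With attr.mmap2 set, perf_event_mmap_output() enlarges header.size and emits the device, inode and protection fields between the classic MMAP body and the file name. The consumer-side record therefore looks roughly like the struct below (a sketch for reference; the trailing sample_id fields depend on attr.sample_id_all):

#include <linux/perf_event.h>
#include <linux/types.h>

/* Approximate wire layout of a PERF_RECORD_MMAP2 record as emitted above. */
struct mmap2_record {
        struct perf_event_header header;   /* .type = PERF_RECORD_MMAP2 */
        __u32   pid, tid;
        __u64   addr;                      /* vma->vm_start */
        __u64   len;                       /* vma->vm_end - vma->vm_start */
        __u64   pgoff;                     /* vma->vm_pgoff << PAGE_SHIFT */
        __u32   maj, min;                  /* backing device */
        __u64   ino, ino_generation;       /* backing inode */
        __u32   prot, flags;               /* PROT_* / MAP_* rebuilt from vm_flags */
        char    filename[];                /* NUL-padded to a multiple of 8 bytes */
        /* followed by the optional sample_id fields */
};
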
@@ -4710,82 +5205,114 @@ out:
mmap_event->event_id.header.size = size;
}
-static int perf_event_mmap_match(struct perf_event *event,
- void *data)
-{
- struct perf_mmap_event *mmap_event = data;
- struct vm_area_struct *vma = mmap_event->vma;
- int executable = vma->vm_flags & VM_EXEC;
-
- return (!executable && event->attr.mmap_data) ||
- (executable && event->attr.mmap);
-}
-
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
struct vm_area_struct *vma = mmap_event->vma;
struct file *file = vma->vm_file;
+ int maj = 0, min = 0;
+ u64 ino = 0, gen = 0;
+ u32 prot = 0, flags = 0;
unsigned int size;
char tmp[16];
char *buf = NULL;
- const char *name;
-
- memset(tmp, 0, sizeof(tmp));
+ char *name;
if (file) {
+ struct inode *inode;
+ dev_t dev;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ name = "//enomem";
+ goto cpy_name;
+ }
/*
- * d_path works from the end of the rb backwards, so we
+ * d_path() works from the end of the rb backwards, so we
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
- if (!buf) {
- name = strncpy(tmp, "//enomem", sizeof(tmp));
- goto got_name;
- }
- name = d_path(&file->f_path, buf, PATH_MAX);
+ name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
- name = strncpy(tmp, "//toolong", sizeof(tmp));
- goto got_name;
+ name = "//toolong";
+ goto cpy_name;
}
+ inode = file_inode(vma->vm_file);
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ gen = inode->i_generation;
+ maj = MAJOR(dev);
+ min = MINOR(dev);
+
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
+ goto got_name;
} else {
- if (arch_vma_name(mmap_event->vma)) {
- name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp) - 1);
- tmp[sizeof(tmp) - 1] = '\0';
- goto got_name;
- }
+ name = (char *)arch_vma_name(vma);
+ if (name)
+ goto cpy_name;
- if (!vma->vm_mm) {
- name = strncpy(tmp, "[vdso]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+ if (vma->vm_start <= vma->vm_mm->start_brk &&
vma->vm_end >= vma->vm_mm->brk) {
- name = strncpy(tmp, "[heap]", sizeof(tmp));
- goto got_name;
- } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+ name = "[heap]";
+ goto cpy_name;
+ }
+ if (vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack) {
- name = strncpy(tmp, "[stack]", sizeof(tmp));
- goto got_name;
+ name = "[stack]";
+ goto cpy_name;
}
- name = strncpy(tmp, "//anon", sizeof(tmp));
- goto got_name;
+ name = "//anon";
+ goto cpy_name;
}
+cpy_name:
+ strlcpy(tmp, name, sizeof(tmp));
+ name = tmp;
got_name:
- size = ALIGN(strlen(name)+1, sizeof(u64));
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(name)+1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ name[size++] = '\0';
mmap_event->file_name = name;
mmap_event->file_size = size;
+ mmap_event->maj = maj;
+ mmap_event->min = min;
+ mmap_event->ino = ino;
+ mmap_event->ino_generation = gen;
+ mmap_event->prot = prot;
+ mmap_event->flags = flags;
if (!(vma->vm_flags & VM_EXEC))
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- perf_event_aux(perf_event_mmap_match,
- perf_event_mmap_output,
+ perf_event_aux(perf_event_mmap_output,
mmap_event,
NULL);
@@ -4815,6 +5342,12 @@ void perf_event_mmap(struct vm_area_struct *vma)
.len = vma->vm_end - vma->vm_start,
.pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
},
+ /* .maj (attr_mmap2 only) */
+ /* .min (attr_mmap2 only) */
+ /* .ino (attr_mmap2 only) */
+ /* .ino_generation (attr_mmap2 only) */
+ /* .prot (attr_mmap2 only) */
+ /* .flags (attr_mmap2 only) */
};
perf_event_mmap_event(&mmap_event);
@@ -4892,6 +5425,7 @@ static int __perf_event_overflow(struct perf_event *event,
__this_cpu_inc(perf_throttled_count);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
+ tick_nohz_full_kick();
ret = 1;
}
}
@@ -4950,6 +5484,9 @@ struct swevent_htable {
/* Recursion avoidance in each contexts */
int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -4961,7 +5498,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
* sign as trigger.
*/
-static u64 perf_swevent_set_period(struct perf_event *event)
+u64 perf_swevent_set_period(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 period = hwc->last_period;
@@ -5196,8 +5733,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
- if (WARN_ON_ONCE(!head))
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
return -EINVAL;
+ }
hlist_add_head_rcu(&event->hlist_entry, head);
@@ -5254,11 +5797,6 @@ static void swevent_hlist_put(struct perf_event *event)
{
int cpu;
- if (event->cpu != -1) {
- swevent_hlist_put_cpu(event, event->cpu);
- return;
- }
-
for_each_possible_cpu(cpu)
swevent_hlist_put_cpu(event, cpu);
}
@@ -5292,9 +5830,6 @@ static int swevent_hlist_get(struct perf_event *event)
int err;
int cpu, failed_cpu;
- if (event->cpu != -1)
- return swevent_hlist_get_cpu(event, event->cpu);
-
get_online_cpus();
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(event, cpu);
@@ -5846,7 +6381,7 @@ static int perf_event_idx_default(struct perf_event *event)
* Ensures all contexts with the same task_ctx_nr have the same
* pmu_cpu_context too.
*/
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
struct pmu *pmu;
@@ -5903,16 +6438,64 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
+static DEVICE_ATTR_RO(type);
+
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ int timer, cpu, ret;
+
+ ret = kstrtoint(buf, 0, &timer);
+ if (ret)
+ return ret;
+
+ if (timer < 1)
+ return -EINVAL;
+
+ /* same value, nothing to do */
+ if (timer == pmu->hrtimer_interval_ms)
+ return count;
+
+ pmu->hrtimer_interval_ms = timer;
+
+ /* update all cpuctx for this PMU */
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ if (hrtimer_active(&cpuctx->hrtimer))
+ hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ }
+
+ return count;
+}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
-static struct device_attribute pmu_dev_attrs[] = {
- __ATTR_RO(type),
- __ATTR_NULL,
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
.name = "event_source",
- .dev_attrs = pmu_dev_attrs,
+ .dev_groups = pmu_dev_groups,
};
static void pmu_dev_release(struct device *dev)
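The hunk above exposes the per-PMU multiplexing interval through a read-write sysfs attribute on the event_source bus, replacing the old device_attribute array with an attribute group. A minimal userspace sketch of tuning it from C; the "cpu" device name is only an example and the exact sysfs layout is assumed for illustration:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Write a new multiplexing interval (in ms) for one PMU; values < 1
	 * are rejected with -EINVAL by the store handler above. */
	static int set_mux_interval_ms(const char *pmu, int ms)
	{
		char path[256], buf[16];
		int fd, len, ret;

		snprintf(path, sizeof(path),
			 "/sys/bus/event_source/devices/%s/perf_event_mux_interval_ms",
			 pmu);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		len = snprintf(buf, sizeof(buf), "%d", ms);
		ret = (write(fd, buf, len) == len) ? 0 : -1;
		close(fd);
		return ret;
	}

For example, set_mux_interval_ms("cpu", 2) would request a 2ms rotation period on an assumed "cpu" PMU.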
@@ -5952,7 +6535,7 @@ free_dev:
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
-int perf_pmu_register(struct pmu *pmu, char *name, int type)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
int cpu, ret;
@@ -6001,7 +6584,9 @@ skip_type:
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
- cpuctx->jiffies_interval = 1;
+
+ __perf_cpu_hrtimer_init(cpuctx, cpu);
+
INIT_LIST_HEAD(&cpuctx->rotation_list);
cpuctx->unique_pmu = pmu;
}
@@ -6051,6 +6636,7 @@ free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
@@ -6072,6 +6658,7 @@ void perf_pmu_unregister(struct pmu *pmu)
put_device(pmu->dev);
free_pmu_context(pmu);
}
+EXPORT_SYMBOL_GPL(perf_pmu_unregister);
struct pmu *perf_init_event(struct perf_event *event)
{
@@ -6085,6 +6672,10 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (ret)
@@ -6093,6 +6684,10 @@ struct pmu *perf_init_event(struct perf_event *event)
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (!ret)
@@ -6110,6 +6705,44 @@ unlock:
return pmu;
}
+static void account_event_cpu(struct perf_event *event, int cpu)
+{
+ if (event->parent)
+ return;
+
+ if (has_branch_stack(event)) {
+ if (!(event->attach_state & PERF_ATTACH_TASK))
+ atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
+ }
+ if (is_cgroup_event(event))
+ atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+}
+
+static void account_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_inc(&perf_sched_events.key);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_inc(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_inc(&nr_comm_events);
+ if (event->attr.task)
+ atomic_inc(&nr_task_events);
+ if (event->attr.freq) {
+ if (atomic_inc_return(&nr_freq_events) == 1)
+ tick_nohz_full_kick_all();
+ }
+ if (has_branch_stack(event))
+ static_key_slow_inc(&perf_sched_events.key);
+ if (is_cgroup_event(event))
+ static_key_slow_inc(&perf_sched_events.key);
+
+ account_event_cpu(event, event->cpu);
+}
+
/*
 * Allocate and initialize an event structure
*/
@@ -6124,7 +6757,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct pmu *pmu;
struct perf_event *event;
struct hw_perf_event *hwc;
- long err;
+ long err = -EINVAL;
if ((unsigned)cpu >= nr_cpu_ids) {
if (!task || cpu != -1)
@@ -6149,6 +6782,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
+ INIT_LIST_HEAD(&event->active_entry);
+ INIT_HLIST_NODE(&event->hlist_entry);
+
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending, perf_pending_event);
@@ -6207,49 +6843,36 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* we currently do not support PERF_FORMAT_GROUP on inherited events
*/
if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
- goto done;
+ goto err_ns;
pmu = perf_init_event(event);
-
-done:
- err = 0;
if (!pmu)
- err = -EINVAL;
- else if (IS_ERR(pmu))
+ goto err_ns;
+ else if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
-
- if (err) {
- if (event->ns)
- put_pid_ns(event->ns);
- kfree(event);
- return ERR_PTR(err);
+ goto err_ns;
}
if (!event->parent) {
- if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_inc(&perf_sched_events.key);
- if (event->attr.mmap || event->attr.mmap_data)
- atomic_inc(&nr_mmap_events);
- if (event->attr.comm)
- atomic_inc(&nr_comm_events);
- if (event->attr.task)
- atomic_inc(&nr_task_events);
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers();
- if (err) {
- free_event(event);
- return ERR_PTR(err);
- }
- }
- if (has_branch_stack(event)) {
- static_key_slow_inc(&perf_sched_events.key);
- if (!(event->attach_state & PERF_ATTACH_TASK))
- atomic_inc(&per_cpu(perf_branch_stack_events,
- event->cpu));
+ if (err)
+ goto err_pmu;
}
}
return event;
+
+err_pmu:
+ if (event->destroy)
+ event->destroy(event);
+ module_put(pmu->module);
+err_ns:
+ if (event->ns)
+ put_pid_ns(event->ns);
+ kfree(event);
+
+ return ERR_PTR(err);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
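The perf_event_alloc() rework above replaces the single done: label with an ordered unwind ladder (err_pmu, then err_ns), so each failure point releases only what was already acquired. A generic, self-contained sketch of that idiom with hypothetical helpers, not the kernel code itself:

	#include <stdlib.h>

	struct thing { void *ns; void *pmu_priv; };

	/* Hypothetical stand-ins for get_pid_ns() / perf_init_event() etc. */
	static void *grab_ns(void)                  { return malloc(1); }
	static void  put_ns(void *ns)               { free(ns); }
	static void *init_pmu_priv(struct thing *t) { (void)t; return malloc(1); }

	struct thing *thing_alloc(void)
	{
		struct thing *t = calloc(1, sizeof(*t));

		if (!t)
			return NULL;

		t->ns = grab_ns();
		if (!t->ns)
			goto err_free;

		t->pmu_priv = init_pmu_priv(t);
		if (!t->pmu_priv)
			goto err_ns;

		return t;

	err_ns:		/* unwind in reverse order: only what was already acquired */
		put_ns(t->ns);
	err_free:
		free(t);
		return NULL;
	}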
@@ -6327,11 +6950,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
return -EINVAL;
- /* kernel level capture: check permissions */
- if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
-
/* propagate priv level, when not set for branch */
if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
@@ -6349,6 +6967,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
*/
attr->branch_sample_type = mask;
}
+ /* privileged levels capture (kernel, hv): check permissions */
+ if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+ && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
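Moving the privileged-level check below the priv-level propagation means the EACCES test sees the branch privilege bits that will actually be used, including the ones inherited from the event's own exclude_* settings. For reference, a hedged userspace fragment showing the attribute fields this code validates; the event choice is arbitrary:

	#include <linux/perf_event.h>
	#include <string.h>

	/* Request sampling of any branch at kernel level; with
	 * perf_event_paranoid raised and no CAP_SYS_ADMIN the open
	 * will fail with EACCES per the check above. */
	static void setup_branch_sampling(struct perf_event_attr *attr)
	{
		memset(attr, 0, sizeof(*attr));
		attr->size = sizeof(*attr);
		attr->type = PERF_TYPE_HARDWARE;
		attr->config = PERF_COUNT_HW_CPU_CYCLES;
		attr->sample_period = 100000;
		attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
		attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
					   PERF_SAMPLE_BRANCH_KERNEL;
	}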
@@ -6384,7 +7006,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL, *old_rb = NULL;
+ struct ring_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -6419,16 +7041,12 @@ set:
goto unlock;
}
- old_rb = event->rb;
- rcu_assign_pointer(event->rb, rb);
- if (old_rb)
- ring_buffer_detach(event, old_rb);
+ ring_buffer_attach(event, rb);
+
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
- if (old_rb)
- ring_buffer_put(old_rb);
out:
return ret;
}
@@ -6456,6 +7074,7 @@ SYSCALL_DEFINE5(perf_event_open,
int event_fd;
int move_group = 0;
int err;
+ int f_flags = O_RDWR;
/* for future expandability... */
if (flags & ~PERF_FLAG_ALL)
@@ -6473,6 +7092,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
}
/*
@@ -6484,7 +7106,10 @@ SYSCALL_DEFINE5(perf_event_open,
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
return -EINVAL;
- event_fd = get_unused_fd();
+ if (flags & PERF_FLAG_FD_CLOEXEC)
+ f_flags |= O_CLOEXEC;
+
+ event_fd = get_unused_fd_flags(f_flags);
if (event_fd < 0)
return event_fd;
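PERF_FLAG_FD_CLOEXEC (now accepted via PERF_FLAG_ALL) makes both the anon inode file and the returned descriptor close-on-exec, avoiding the open-then-fcntl(FD_CLOEXEC) race in threaded programs. A minimal caller sketch; the fallback #define below is only an assumption for headers that predate the flag:

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <string.h>
	#include <unistd.h>

	#ifndef PERF_FLAG_FD_CLOEXEC
	#define PERF_FLAG_FD_CLOEXEC (1UL << 3)	/* assumed uapi value */
	#endif

	static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
				    int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
	}

	int open_cycles_counter(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;

		/* fd comes back O_CLOEXEC, so it cannot leak across exec() */
		return perf_event_open(&attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC);
	}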
@@ -6507,28 +7132,38 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr.inherit) {
+ err = -EINVAL;
+ goto err_task;
+ }
+
get_online_cpus();
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_task;
+ goto err_cpus;
}
if (flags & PERF_FLAG_PID_CGROUP) {
err = perf_cgroup_connect(pid, event, &attr, group_leader);
- if (err)
+ if (err) {
+ __free_event(event);
+ goto err_cpus;
+ }
+ }
+
+ if (is_sampling_event(event)) {
+ if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
+ err = -ENOTSUPP;
goto err_alloc;
- /*
- * one more event:
- * - that has cgroup constraint on event->cpu
- * - that may need work on context switch
- */
- atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
- static_key_slow_inc(&perf_sched_events.key);
+ }
}
+ account_event(event);
+
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -6609,7 +7244,8 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
+ f_flags);
if (IS_ERR(event_file)) {
err = PTR_ERR(event_file);
goto err_context;
@@ -6619,7 +7255,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader);
+ perf_remove_from_context(group_leader, false);
/*
* Removing from the context ends up with disabled
@@ -6629,7 +7265,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling);
+ perf_remove_from_context(sibling, false);
perf_event__state_init(sibling);
put_ctx(gctx);
}
@@ -6652,7 +7288,6 @@ SYSCALL_DEFINE5(perf_event_open,
}
perf_install_in_context(ctx, event, event->cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -6685,8 +7320,9 @@ err_context:
put_ctx(ctx);
err_alloc:
free_event(event);
-err_task:
+err_cpus:
put_online_cpus();
+err_task:
if (task)
put_task_struct(task);
err_group_fd:
@@ -6724,6 +7360,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err;
}
+ account_event(event);
+
ctx = find_get_context(event->pmu, task, cpu);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
@@ -6733,7 +7371,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
- ++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -6759,19 +7396,21 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
mutex_lock(&src_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event);
+ perf_remove_from_context(event, false);
+ unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
- list_add(&event->event_entry, &events);
+ list_add(&event->migrate_entry, &events);
}
mutex_unlock(&src_ctx->mutex);
synchronize_rcu();
mutex_lock(&dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &events, event_entry) {
- list_del(&event->event_entry);
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_del(&event->migrate_entry);
if (event->state >= PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
perf_install_in_context(dst_ctx, event, dst_cpu);
get_ctx(dst_ctx);
}
@@ -6819,13 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- if (child_event->parent) {
- raw_spin_lock_irq(&child_ctx->lock);
- perf_group_detach(child_event);
- raw_spin_unlock_irq(&child_ctx->lock);
- }
-
- perf_remove_from_context(child_event);
+ /*
+ * Do not destroy the 'original' grouping; because of the context
+ * switch optimization the original events could've ended up in a
+ * random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this random
+ * child dies.
+ *
+	 * Do destroy all inherited groups; we don't care about those,
+	 * and being thorough is better.
+ */
+ perf_remove_from_context(child_event, !!child_event->parent);
/*
* It can happen that the parent exits first, and has events
@@ -6840,8 +7485,8 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event, *tmp;
- struct perf_event_context *child_ctx;
+ struct perf_event *child_event, *next;
+ struct perf_event_context *child_ctx, *parent_ctx;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -6866,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_lock(&child_ctx->lock);
task_ctx_sched_out(child_ctx);
child->perf_event_ctxp[ctxn] = NULL;
+
+ /*
+ * In order to avoid freeing: child_ctx->parent_ctx->task
+ * under perf_event_context::lock, grab another reference.
+ */
+ parent_ctx = child_ctx->parent_ctx;
+ if (parent_ctx)
+ get_ctx(parent_ctx);
+
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
@@ -6876,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
+ * Now that we no longer hold perf_event_context::lock, drop
+ * our extra child_ctx->parent_ctx reference.
+ */
+ if (parent_ctx)
+ put_ctx(parent_ctx);
+
+ /*
* Report the task dead after unscheduling the events so that we
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
@@ -6894,24 +7555,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
mutex_lock(&child_ctx->mutex);
-again:
- list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
- group_entry)
+ list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
__perf_event_exit_task(child_event, child_ctx, child);
- list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
- group_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
-
- /*
- * If the last event was a group event, it will have appended all
- * its siblings to the list, but we obtained 'tmp' before that which
- * will still point to the list head terminating the iteration.
- */
- if (!list_empty(&child_ctx->pinned_groups) ||
- !list_empty(&child_ctx->flexible_groups))
- goto again;
-
mutex_unlock(&child_ctx->mutex);
put_ctx(child_ctx);
@@ -7139,7 +7785,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* child.
*/
- child_ctx = alloc_perf_context(event->pmu, child);
+ child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -7176,6 +7822,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
* swapped under us.
*/
parent_ctx = perf_pin_task_context(parent, ctxn);
+ if (!parent_ctx)
+ return 0;
/*
* No need to check if parent_ctx != NULL here; since we saw
@@ -7282,11 +7930,12 @@ static void __init perf_event_init_all_cpus(void)
}
}
-static void __cpuinit perf_event_init_cpu(int cpu)
+static void perf_event_init_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
if (swhash->hlist_refcount > 0) {
struct swevent_hlist *hlist;
@@ -7309,15 +7958,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
+ struct remove_event re = { .detach_group = false };
struct perf_event_context *ctx = __info;
- struct perf_event *event, *tmp;
perf_pmu_rotate_stop(ctx->pmu);
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
- __perf_remove_from_context(event);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
- __perf_remove_from_context(event);
+ rcu_read_lock();
+ list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(&re);
+ rcu_read_unlock();
}
static void perf_event_exit_cpu_context(int cpu)
@@ -7341,11 +7990,12 @@ static void perf_event_exit_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ perf_event_exit_cpu_context(cpu);
+
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
-
- perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
@@ -7371,7 +8021,7 @@ static struct notifier_block perf_reboot_notifier = {
.priority = INT_MIN,
};
-static int __cpuinit
+static int
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
@@ -7387,7 +8037,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
-
default:
break;
}
@@ -7453,7 +8102,8 @@ unlock:
device_initcall(perf_event_sysfs_init);
#ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
+static struct cgroup_subsys_state *
+perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct perf_cgroup *jc;
@@ -7470,11 +8120,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
return &jc->css;
}
-static void perf_cgroup_css_free(struct cgroup *cont)
+static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct perf_cgroup *jc;
- jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
- struct perf_cgroup, css);
+ struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
+
free_percpu(jc->info);
kfree(jc);
}
@@ -7486,15 +8135,17 @@ static int __perf_cgroup_move(void *info)
return 0;
}
-static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+static void perf_cgroup_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset)
+ cgroup_taskset_for_each(task, tset)
task_function_call(task, __perf_cgroup_move, task);
}
-static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+static void perf_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
struct task_struct *task)
{
/*
@@ -7508,9 +8159,7 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
task_function_call(task, __perf_cgroup_move, task);
}
-struct cgroup_subsys perf_subsys = {
- .name = "perf_event",
- .subsys_id = perf_subsys_id,
+struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a64f8aeb5c1..1559fb0b929 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
#include <linux/smp.h>
#include <linux/hw_breakpoint.h>
-
-
/*
* Constraints data
*/
+struct bp_cpuinfo {
+ /* Number of pinned cpu breakpoints in a cpu */
+ unsigned int cpu_pinned;
+ /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
+ unsigned int *tsk_pinned;
+ /* Number of non-pinned cpu/task breakpoints in a cpu */
+ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+};
-/* Number of pinned cpu breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
-
-/* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
-
-/* Number of non-pinned cpu/task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
-
+static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
static int nr_slots[TYPE_MAX];
+static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
+{
+ return per_cpu_ptr(bp_cpuinfo + type, cpu);
+}
+
/* Keep track of the breakpoints attached to tasks */
static LIST_HEAD(bp_task_head);
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
int i;
- unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
for (i = nr_slots[type] - 1; i >= 0; i--) {
if (tsk_pinned[i] > 0)
@@ -120,13 +123,20 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
if (iter->hw.bp_target == tsk &&
find_slot_idx(iter) == type &&
- cpu == iter->cpu)
+ (iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}
return count;
}
+static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
+{
+ if (bp->cpu >= 0)
+ return cpumask_of(bp->cpu);
+ return cpu_possible_mask;
+}
+
/*
* Report the number of pinned/un-pinned breakpoints we have in
* a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -135,25 +145,15 @@ static void
fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
enum bp_type_idx type)
{
- int cpu = bp->cpu;
- struct task_struct *tsk = bp->hw.bp_target;
-
- if (cpu >= 0) {
- slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
- if (!tsk)
- slots->pinned += max_task_bp_pinned(cpu, type);
- else
- slots->pinned += task_bp_pinned(cpu, bp, type);
- slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
-
- return;
- }
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
- for_each_online_cpu(cpu) {
- unsigned int nr;
+ for_each_cpu(cpu, cpumask) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+ int nr;
- nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
- if (!tsk)
+ nr = info->cpu_pinned;
+ if (!bp->hw.bp_target)
nr += max_task_bp_pinned(cpu, type);
else
nr += task_bp_pinned(cpu, bp, type);
@@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
if (nr > slots->pinned)
slots->pinned = nr;
- nr = per_cpu(nr_bp_flexible[type], cpu);
-
+ nr = info->flexible;
if (nr > slots->flexible)
slots->flexible = nr;
}
@@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
/*
* Add a pinned breakpoint for the given task in our constraint table
*/
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
enum bp_type_idx type, int weight)
{
- unsigned int *tsk_pinned;
- int old_count = 0;
- int old_idx = 0;
- int idx = 0;
-
- old_count = task_bp_pinned(cpu, bp, type);
- old_idx = old_count - 1;
- idx = old_idx + weight;
-
- /* tsk_pinned[n] is the number of tasks having n breakpoints */
- tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
- if (enable) {
- tsk_pinned[idx]++;
- if (old_count > 0)
- tsk_pinned[old_idx]--;
- } else {
- tsk_pinned[idx]--;
- if (old_count > 0)
- tsk_pinned[old_idx]++;
- }
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+ int old_idx, new_idx;
+
+ old_idx = task_bp_pinned(cpu, bp, type) - 1;
+ new_idx = old_idx + weight;
+
+ if (old_idx >= 0)
+ tsk_pinned[old_idx]--;
+ if (new_idx >= 0)
+ tsk_pinned[new_idx]++;
}
/*
@@ -214,33 +203,26 @@ static void
toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
int weight)
{
- int cpu = bp->cpu;
- struct task_struct *tsk = bp->hw.bp_target;
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
- /* Pinned counter cpu profiling */
- if (!tsk) {
+ if (!enable)
+ weight = -weight;
- if (enable)
- per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
- else
- per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+ /* Pinned counter cpu profiling */
+ if (!bp->hw.bp_target) {
+ get_bp_info(bp->cpu, type)->cpu_pinned += weight;
return;
}
/* Pinned counter task profiling */
-
- if (!enable)
- list_del(&bp->hw.bp_list);
-
- if (cpu >= 0) {
- toggle_bp_task_slot(bp, cpu, enable, type, weight);
- } else {
- for_each_online_cpu(cpu)
- toggle_bp_task_slot(bp, cpu, enable, type, weight);
- }
+ for_each_cpu(cpu, cpumask)
+ toggle_bp_task_slot(bp, cpu, type, weight);
if (enable)
list_add_tail(&bp->hw.bp_list, &bp_task_head);
+ else
+ list_del(&bp->hw.bp_list);
}
/*
@@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to a single cpu, check:
*
- * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
- * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
+ * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
*
* -> If there are already non-pinned counters in this cpu, it means
* there is already a free slot for them.
@@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to every cpus, check:
*
- * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
- * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
+ * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
*
* -> This is roughly the same, except we check the number of per cpu
* bp for every cpu and we keep the max one. Same for the per tasks
@@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*
* - If attached to a single cpu, check:
*
- * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
- * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
+ * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
*
- * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ * -> Same checks as before. But now the info->flexible, if any, must keep
* one register at least (or they will never be fed).
*
* - If attached to every cpus, check:
*
- * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
- * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
+ * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
*/
static int __reserve_bp_slot(struct perf_event *bp)
{
@@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered,
void *context)
{
- struct perf_event * __percpu *cpu_events, **pevent, *bp;
- long err;
+ struct perf_event * __percpu *cpu_events, *bp;
+ long err = 0;
int cpu;
cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
get_online_cpus();
for_each_online_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
-
- *pevent = bp;
-
if (IS_ERR(bp)) {
err = PTR_ERR(bp);
- goto fail;
+ break;
}
- }
- put_online_cpus();
- return cpu_events;
-
-fail:
- for_each_online_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
- if (IS_ERR(*pevent))
- break;
- unregister_hw_breakpoint(*pevent);
+ per_cpu(*cpu_events, cpu) = bp;
}
put_online_cpus();
- free_percpu(cpu_events);
+ if (likely(!err))
+ return cpu_events;
+
+ unregister_wide_hw_breakpoint(cpu_events);
return (void __percpu __force *)ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
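The error path of register_wide_hw_breakpoint() now simply breaks out of the loop and reuses unregister_wide_hw_breakpoint() for rollback instead of open-coding it. For context, a sketch of a module using this API, modeled on samples/hw_breakpoint/data_breakpoint.c; the watched symbol is only an example:

	#include <linux/module.h>
	#include <linux/kallsyms.h>
	#include <linux/perf_event.h>
	#include <linux/hw_breakpoint.h>

	static struct perf_event * __percpu *sample_hbp;

	static void sample_hbp_handler(struct perf_event *bp,
				       struct perf_sample_data *data,
				       struct pt_regs *regs)
	{
		pr_info("watched variable was written\n");
	}

	static int __init wide_bp_init(void)
	{
		struct perf_event_attr attr;

		hw_breakpoint_init(&attr);
		attr.bp_addr = kallsyms_lookup_name("pid_max");	/* example symbol */
		attr.bp_len  = HW_BREAKPOINT_LEN_4;
		attr.bp_type = HW_BREAKPOINT_W;

		sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
		if (IS_ERR((void __force *)sample_hbp))
			return PTR_ERR((void __force *)sample_hbp);

		return 0;
	}

	static void __exit wide_bp_exit(void)
	{
		unregister_wide_hw_breakpoint(sample_hbp);
	}

	module_init(wide_bp_init);
	module_exit(wide_bp_exit);
	MODULE_LICENSE("GPL");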
@@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
{
int cpu;
- struct perf_event **pevent;
- for_each_possible_cpu(cpu) {
- pevent = per_cpu_ptr(cpu_events, cpu);
- unregister_hw_breakpoint(*pevent);
- }
+ for_each_possible_cpu(cpu)
+ unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
+
free_percpu(cpu_events);
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
if (!(flags & PERF_EF_START))
bp->hw.state = PERF_HES_STOPPED;
+ if (is_sampling_event(bp)) {
+ bp->hw.last_period = bp->hw.sample_period;
+ perf_swevent_set_period(bp);
+ }
+
return arch_install_hw_breakpoint(bp);
}
@@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
- unsigned int **task_bp_pinned;
int cpu, err_cpu;
int i;
@@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void)
for_each_possible_cpu(cpu) {
for (i = 0; i < TYPE_MAX; i++) {
- task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
- *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
- GFP_KERNEL);
- if (!*task_bp_pinned)
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
+ GFP_KERNEL);
+ if (!info->tsk_pinned)
goto err_alloc;
}
}
@@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void)
err_alloc:
for_each_possible_cpu(err_cpu) {
for (i = 0; i < TYPE_MAX; i++)
- kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
+ kfree(get_bp_info(err_cpu, i)->tsk_pinned);
if (err_cpu == cpu)
break;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4d59d..569b218782a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,6 +31,10 @@ struct ring_buffer {
spinlock_t event_lock;
struct list_head event_list;
+ atomic_t mmap_count;
+ unsigned long mmap_locked;
+ struct user_struct *mmap_user;
+
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
@@ -78,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
}
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned int \
+static inline unsigned long \
func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned int len) \
+ const void *buf, unsigned long len) \
{ \
unsigned long size, written; \
\
do { \
- size = min_t(unsigned long, handle->size, len); \
- \
+ size = min(handle->size, len); \
written = memcpy_func(handle->addr, buf, size); \
+ written = size - written; \
\
len -= written; \
handle->addr += written; \
@@ -106,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
-static inline int memcpy_common(void *dst, const void *src, size_t n)
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
{
memcpy(dst, src, n);
- return n;
+ return 0;
}
DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
-#define MEMCPY_SKIP(dst, src, n) (n)
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+ return 0;
+}
-DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
#ifndef arch_perf_out_copy_user
-#define arch_perf_out_copy_user __copy_from_user_inatomic
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
+{
+ unsigned long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, n);
+ pagefault_enable();
+
+ return ret;
+}
#endif
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b..146a5792b1d 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/circ_buf.h>
#include "internal.h"
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
- unsigned long offset, unsigned long head)
-{
- unsigned long sz = perf_data_size(rb);
- unsigned long mask = sz - 1;
-
- /*
- * check if user-writable
- * overwrite : over-write its own tail
- * !overwrite: buffer possibly drops events.
- */
- if (rb->overwrite)
- return true;
-
- /*
- * verify that payload is not bigger than buffer
- * otherwise masking logic may fail to detect
- * the "not enough space" condition
- */
- if ((head - offset) > sz)
- return false;
-
- offset = (offset - tail) & mask;
- head = (head - tail) & mask;
-
- if ((int)(head - offset) < 0)
- return false;
-
- return true;
-}
-
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLL_IN);
@@ -87,15 +57,37 @@ again:
goto out;
/*
- * Publish the known good head. Rely on the full barrier implied
- * by atomic_dec_and_test() order the rb->head read and this
- * write.
+ * Since the mmap() consumer (userspace) can run on a different CPU:
+ *
+ * kernel user
+ *
+ * if (LOAD ->data_tail) { LOAD ->data_head
+ * (A) smp_rmb() (C)
+ * STORE $data LOAD $data
+ * smp_wmb() (B) smp_mb() (D)
+ * STORE ->data_head STORE ->data_tail
+ * }
+ *
+ * Where A pairs with D, and B pairs with C.
+ *
+ * In our case (A) is a control dependency that separates the load of
+	 * the ->data_tail from the stores of $data: if ->data_tail
+	 * indicates there is no room in the buffer to store $data, we do not store it.
+ *
+ * D needs to be a full barrier since it separates the data READ
+ * from the tail WRITE.
+ *
+ * For B a WMB is sufficient since it separates two WRITEs, and for C
+ * an RMB is sufficient since it separates two READs.
+ *
+ * See perf_output_begin().
*/
+ smp_wmb(); /* B, matches C */
rb->user_page->data_head = head;
/*
- * Now check if we missed an update, rely on the (compiler)
- * barrier in atomic_dec_and_test() to re-read rb->head.
+ * Now check if we missed an update -- rely on previous implied
+ * compiler barriers to force a re-read.
*/
if (unlikely(head != local_read(&rb->head))) {
local_inc(&rb->nest);
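The table above documents the producer/consumer pairing between the kernel and the mmap()ed reader. A hedged userspace sketch of the consumer side (C and D), using GCC atomic builtins; `pc`, `data`, and `size` are assumed to describe an already-mmap()ed perf buffer, and records that wrap at the end of the buffer are not reassembled here:

	#include <linux/perf_event.h>
	#include <stdint.h>

	static void drain_ring(volatile struct perf_event_mmap_page *pc,
			       unsigned char *data, uint64_t size,
			       void (*consume)(struct perf_event_header *hdr))
	{
		uint64_t tail = pc->data_tail;
		/* LOAD ->data_head, then (C): acquire pairs with the kernel's smp_wmb() (B) */
		uint64_t head = __atomic_load_n(&pc->data_head, __ATOMIC_ACQUIRE);

		while (tail != head) {
			struct perf_event_header *hdr =
				(struct perf_event_header *)(data + (tail & (size - 1)));

			consume(hdr);			/* LOAD $data */
			tail += hdr->size;
		}

		/* (D): full barrier between the data reads and the tail publish */
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
		pc->data_tail = tail;			/* STORE ->data_tail */
	}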
@@ -114,8 +106,7 @@ int perf_output_begin(struct perf_output_handle *handle,
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
- int have_lost;
- struct perf_sample_data sample_data;
+ int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
@@ -130,55 +121,72 @@ int perf_output_begin(struct perf_output_handle *handle,
event = event->parent;
rb = rcu_dereference(event->rb);
- if (!rb)
+ if (unlikely(!rb))
goto out;
- handle->rb = rb;
- handle->event = event;
-
- if (!rb->nr_pages)
+ if (unlikely(!rb->nr_pages))
goto out;
+ handle->rb = rb;
+ handle->event = event;
+
have_lost = local_read(&rb->lost);
- if (have_lost) {
- lost_event.header.size = sizeof(lost_event);
- perf_event_header__init_id(&lost_event.header, &sample_data,
- event);
- size += lost_event.header.size;
+ if (unlikely(have_lost)) {
+ size += sizeof(lost_event);
+ if (event->attr.sample_id_all)
+ size += event->id_header_size;
}
perf_output_get_handle(handle);
do {
- /*
- * Userspace could choose to issue a mb() before updating the
- * tail pointer. So that all reads will be completed before the
- * write is issued.
- */
tail = ACCESS_ONCE(rb->user_page->data_tail);
- smp_rmb();
offset = head = local_read(&rb->head);
- head += size;
- if (unlikely(!perf_output_space(rb, tail, offset, head)))
+ if (!rb->overwrite &&
+ unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
goto fail;
+
+ /*
+ * The above forms a control dependency barrier separating the
+	 * @tail load above from the data stores below, since the @tail
+ * load is required to compute the branch to fail below.
+ *
+ * A, matches D; the full memory barrier userspace SHOULD issue
+ * after reading the data and before storing the new tail
+ * position.
+ *
+ * See perf_output_put_handle().
+ */
+
+ head += size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
- if (head - local_read(&rb->wakeup) > rb->watermark)
+ /*
+ * We rely on the implied barrier() by local_cmpxchg() to ensure
+ * none of the data stores below can be lifted up by the compiler.
+ */
+
+ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
- handle->page = offset >> (PAGE_SHIFT + page_order(rb));
- handle->page &= rb->nr_pages - 1;
- handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
- handle->addr = rb->data_pages[handle->page];
- handle->addr += handle->size;
- handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+ page_shift = PAGE_SHIFT + page_order(rb);
- if (have_lost) {
+ handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+ offset &= (1UL << page_shift) - 1;
+ handle->addr = rb->data_pages[handle->page] + offset;
+ handle->size = (1UL << page_shift) - offset;
+
+ if (unlikely(have_lost)) {
+ struct perf_sample_data sample_data;
+
+ lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
+ perf_event_header__init_id(&lost_event.header,
+ &sample_data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, &sample_data);
}
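The overflow check now leans on CIRC_SPACE() from <linux/circ_buf.h> instead of the removed perf_output_space(). A small standalone paraphrase of that arithmetic; the macros below restate the kernel header for a power-of-two size:

	#include <assert.h>

	#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
	#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

	int main(void)
	{
		unsigned long size = 4096;

		/* Empty buffer: one byte is sacrificed so head == tail means empty. */
		assert(CIRC_SPACE(0, 0, size) == size - 1);

		/* Producer 100 bytes ahead of the consumer. */
		assert(CIRC_CNT(100, 0, size) == 100);
		assert(CIRC_SPACE(100, 0, size) == size - 101);

		/* Monotonically increasing counters wrap correctly under the mask:
		 * 4000 bytes in flight out of 4096, minus the reserved byte. */
		assert(CIRC_SPACE(5000, 1000, size) == 95);
		return 0;
	}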
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f3569747d62..6f3254e8c13 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,8 @@
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
+#include <linux/shmem_fs.h>
#include <linux/uprobes.h>
@@ -59,8 +61,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
-/* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP 1
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
@@ -72,6 +72,17 @@ struct uprobe {
struct inode *inode; /* Also hold a ref to inode */
loff_t offset;
unsigned long flags;
+
+ /*
+ * The generic code assumes that it has two members of unknown type
+ * owned by the arch-specific code:
+ *
+ * insn - copy_insn() saves the original instruction here for
+ * arch_uprobe_analyze_insn().
+ *
+ * ixol - potentially modified instruction to execute out of
+ * line, copied to xol_area by xol_get_insn_slot().
+ */
struct arch_uprobe arch;
};
@@ -85,6 +96,29 @@ struct return_instance {
};
/*
+ * Execute out of line area: anonymous executable mapping installed
+ * by the probed task to execute the copy of the original instruction
+ * mangled by set_swbp().
+ *
+ * On a breakpoint hit, the thread contends for a slot. It frees the
+ * slot after singlestep. Currently a fixed number of slots are
+ * allocated.
+ */
+struct xol_area {
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct page *page;
+
+ /*
+ * We keep the vma's vm_start rather than a pointer to the vma
+ * itself. The probed process or a naughty kernel module could make
+ * the vma go away, and we must handle that reasonably gracefully.
+ */
+ unsigned long vaddr; /* Page(s) of instruction slots */
+};
+
+/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
* changed after breakpoint was inserted.
@@ -94,7 +128,7 @@ struct return_instance {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
if (is_register)
flags |= VM_WRITE;
@@ -244,23 +278,18 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
 * the architecture. If an arch has variable length instructions and the
 * breakpoint instruction is not the smallest length instruction
 * supported by that architecture, then we need to modify is_trap_at_addr and
- * write_opcode accordingly. This would never be a problem for archs that
- * have fixed length instructions.
- */
-
-/*
- * write_opcode - write the opcode at a given virtual address.
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
+ *
+ * uprobe_write_opcode - write the opcode at a given virtual address.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
- * Called with mm->mmap_sem held (for read and with a reference to
- * mm).
- *
- * For mm @mm, write the opcode at @vaddr.
+ * Called with mm->mmap_sem held for write.
* Return 0 (success) or a negative errno.
*/
-static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
@@ -277,21 +306,25 @@ retry:
if (ret <= 0)
goto put_old;
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ goto put_old;
+
ret = -ENOMEM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (!new_page)
goto put_old;
- __SetPageUptodate(new_page);
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+ goto put_new;
+ __SetPageUptodate(new_page);
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_new;
-
ret = __replace_page(vma, vaddr, old_page, new_page);
+ if (ret)
+ mem_cgroup_uncharge_page(new_page);
put_new:
page_cache_release(new_page);
@@ -314,7 +347,7 @@ put_old:
*/
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
}
/**
@@ -329,7 +362,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+ return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
}
static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -456,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->offset = offset;
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
- /* For now assume that the instruction need not be single-stepped */
- __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
-
/* a uprobe exists for this inode:offset combination */
if (cur_uprobe) {
kfree(uprobe);
@@ -503,19 +533,19 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
return ret;
}
-static int
-__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
- unsigned long nbytes, loff_t offset)
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+ void *insn, int nbytes, loff_t offset)
{
struct page *page;
-
- if (!mapping->a_ops->readpage)
- return -EIO;
/*
- * Ensure that the page that has the original instruction is
- * populated and in page-cache.
+ * Ensure that the page that has the original instruction is populated
+ * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * see uprobe_register().
*/
- page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ if (mapping->a_ops->readpage)
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ else
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -527,28 +557,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
- struct address_space *mapping;
- unsigned long nbytes;
- int bytes;
-
- nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
- mapping = uprobe->inode->i_mapping;
+ struct address_space *mapping = uprobe->inode->i_mapping;
+ loff_t offs = uprobe->offset;
+ void *insn = &uprobe->arch.insn;
+ int size = sizeof(uprobe->arch.insn);
+ int len, err = -EIO;
- /* Instruction at end of binary; copy only available bytes */
- if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
- bytes = uprobe->inode->i_size - uprobe->offset;
- else
- bytes = MAX_UINSN_BYTES;
+ /* Copy only available bytes, -EIO if nothing was read */
+ do {
+ if (offs >= i_size_read(uprobe->inode))
+ break;
- /* Instruction at the page-boundary; copy bytes in second page */
- if (nbytes < bytes) {
- int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
- bytes - nbytes, uprobe->offset + nbytes);
+ len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+ err = __copy_insn(mapping, filp, insn, len, offs);
if (err)
- return err;
- bytes = nbytes;
- }
- return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
+ break;
+
+ insn += len;
+ offs += len;
+ size -= len;
+ } while (size);
+
+ return err;
}
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
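copy_insn() now walks the instruction bytes in page-sized chunks, so an instruction straddling a page boundary takes the same path as the common case. A standalone sketch of the same offset arithmetic; PAGE_SIZE and the page reader are stand-ins:

	#include <stddef.h>
	#include <string.h>

	#define PAGE_SIZE 4096UL

	/* Hypothetical page-granular reader: fills `dst` from one page of `src`. */
	static void read_one_page(void *dst, const char *src, size_t off, size_t len)
	{
		memcpy(dst, src + off, len);
	}

	/* Copy `size` bytes starting at `offs`, never crossing a page per call. */
	static void copy_chunked(void *insn, const char *image, size_t offs, size_t size)
	{
		while (size) {
			size_t in_page = PAGE_SIZE - (offs & (PAGE_SIZE - 1));
			size_t len = size < in_page ? size : in_page;

			read_one_page(insn, image, offs, len);
			insn = (char *)insn + len;
			offs += len;
			size -= len;
		}
	}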
@@ -569,14 +599,14 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
goto out;
ret = -ENOTSUPP;
- if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
+ if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
goto out;
ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
if (ret)
goto out;
- /* write_opcode() assumes we don't cross page boundary */
+ /* uprobe_write_opcode() assumes we don't cross page boundary */
BUG_ON((uprobe->offset & ~PAGE_MASK) +
UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
@@ -816,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
{
int err;
- if (!consumer_del(uprobe, uc)) /* WARN? */
+ if (WARN_ON(!consumer_del(uprobe, uc)))
return;
err = register_for_each_vma(uprobe, NULL);
@@ -851,6 +881,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
if (!uc->handler && !uc->ret_handler)
return -EINVAL;
+ /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
@@ -894,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
int ret = -ENOENT;
uprobe = find_uprobe(inode, offset);
- if (!uprobe)
+ if (WARN_ON(!uprobe))
return ret;
down_write(&uprobe->register_rwsem);
@@ -919,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
struct uprobe *uprobe;
uprobe = find_uprobe(inode, offset);
- if (!uprobe)
+ if (WARN_ON(!uprobe))
return;
down_write(&uprobe->register_rwsem);
@@ -1096,21 +1129,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
}
/* Slot allocation for XOL */
-static int xol_add_vma(struct xol_area *area)
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
- struct mm_struct *mm = current->mm;
int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
- ret = -ENOMEM;
- /* Try to map as high as possible, this is only a hint. */
- area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
- ret = area->vaddr;
- goto fail;
+ if (!area->vaddr) {
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+ PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
+ }
}
ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1154,19 @@ static int xol_add_vma(struct xol_area *area)
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
- ret = 0;
fail:
up_write(&mm->mmap_sem);
return ret;
}
-/*
- * get_xol_area - Allocate process's xol_area if necessary.
- * This area will be used for storing instructions for execution out of line.
- *
- * Returns the allocated area or NULL.
- */
-static struct xol_area *get_xol_area(void)
+static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
- struct xol_area *area;
uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct xol_area *area;
- area = mm->uprobes_state.xol_area;
- if (area)
- goto ret;
-
- area = kzalloc(sizeof(*area), GFP_KERNEL);
+ area = kmalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
@@ -1155,13 +1178,14 @@ static struct xol_area *get_xol_area(void)
if (!area->page)
goto free_bitmap;
- /* allocate first slot of task's xol_area for the return probes */
+ area->vaddr = vaddr;
+ init_waitqueue_head(&area->wq);
+ /* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
atomic_set(&area->slot_count, 1);
- init_waitqueue_head(&area->wq);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
- if (!xol_add_vma(area))
+ if (!xol_add_vma(mm, area))
return area;
__free_page(area->page);
@@ -1170,9 +1194,25 @@ static struct xol_area *get_xol_area(void)
free_area:
kfree(area);
out:
+ return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ if (!mm->uprobes_state.xol_area)
+ __create_xol_area(0);
+
area = mm->uprobes_state.xol_area;
- ret:
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
return area;
}
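get_xol_area() still pairs its smp_read_barrier_depends() with the smp_wmb() in xol_add_vma(): the area is fully initialized before the pointer is published, and readers only dereference what they observe through that pointer. A userspace analogue of the publish/consume pattern, using C11 atomics rather than the kernel primitives:

	#include <stdatomic.h>
	#include <stdlib.h>

	struct area {
		unsigned long vaddr;
		unsigned long *bitmap;
	};

	static _Atomic(struct area *) published_area;

	static void publish_area(unsigned long vaddr)
	{
		struct area *a = calloc(1, sizeof(*a));

		if (!a)
			return;
		a->vaddr = vaddr;	/* initialize fully before publishing */
		atomic_store_explicit(&published_area, a, memory_order_release);
	}

	static struct area *lookup_area(void)
	{
		/* acquire pairs with the release store above */
		return atomic_load_explicit(&published_area, memory_order_acquire);
	}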
@@ -1255,13 +1295,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
if (unlikely(!xol_vaddr))
return 0;
- /* Initialize the slot */
- copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
- /*
- * We probably need flush_icache_user_range() but it needs vma.
- * This should work on supported architectures too.
- */
- flush_dcache_page(area->page);
+ arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
return xol_vaddr;
}
@@ -1304,6 +1339,21 @@ static void xol_free_insn_slot(struct task_struct *tsk)
}
}
+void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+ void *src, unsigned long len)
+{
+ /* Initialize the slot */
+ copy_to_page(page, vaddr, src, len);
+
+ /*
+ * We probably need flush_icache_user_range() but it needs vma.
+ * This should work on most architectures by default. If
+ * architecture needs to do something different it can define
+ * its own version of the function.
+ */
+ flush_dcache_page(page);
+}
+
/**
* uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
* @regs: Reflects the saved state of the task after it has hit a breakpoint
@@ -1315,6 +1365,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (unlikely(utask && utask->active_uprobe))
+ return utask->vaddr;
+
+ return instruction_pointer(regs);
+}
+
/*
* Called with no locks held.
 * Called in context of an exiting or an exec-ing thread.
@@ -1345,14 +1405,6 @@ void uprobe_free_utask(struct task_struct *t)
}
/*
- * Called in context of a new clone/fork from copy_process.
- */
-void uprobe_copy_process(struct task_struct *t)
-{
- t->utask = NULL;
-}
-
-/*
 * Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
*
@@ -1367,6 +1419,82 @@ static struct uprobe_task *get_utask(void)
return current->utask;
}
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+ struct uprobe_task *n_utask;
+ struct return_instance **p, *o, *n;
+
+ n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!n_utask)
+ return -ENOMEM;
+ t->utask = n_utask;
+
+ p = &n_utask->return_instances;
+ for (o = o_utask->return_instances; o; o = o->next) {
+ n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ *n = *o;
+ atomic_inc(&n->uprobe->ref);
+ n->next = NULL;
+
+ *p = n;
+ p = &n->next;
+ n_utask->depth++;
+ }
+
+ return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n",
+ current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+ if (current->flags & PF_EXITING)
+ return;
+
+ if (!__create_xol_area(current->utask->dup_xol_addr))
+ uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+ struct uprobe_task *utask = current->utask;
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ t->utask = NULL;
+
+ if (!utask || !utask->return_instances)
+ return;
+
+ if (mm == t->mm && !(flags & CLONE_VFORK))
+ return;
+
+ if (dup_utask(t, utask))
+ return uprobe_warn(t, "dup ret instances");
+
+ /* The task can fork() after dup_xol_work() fails */
+ area = mm->uprobes_state.xol_area;
+ if (!area)
+ return uprobe_warn(t, "dup xol area");
+
+ if (mm == t->mm)
+ return;
+
+ t->utask->dup_xol_addr = area->vaddr;
+ init_task_work(&t->utask->dup_xol_work, dup_xol_work);
+ task_work_add(t, &t->utask->dup_xol_work, true);
+}
+
/*
 * Current area->vaddr notion assumes the trampoline address is always
 * equal to area->vaddr.
@@ -1518,20 +1646,6 @@ bool uprobe_deny_signal(void)
return true;
}
-/*
- * Avoid singlestepping the original instruction if the original instruction
- * is a NOP or can be emulated.
- */
-static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
-{
- if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
- if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
- return true;
- clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
- }
- return false;
-}
-
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
struct vm_area_struct *vma;
@@ -1682,12 +1796,10 @@ static bool handle_trampoline(struct pt_regs *regs)
tmp = ri;
ri = ri->next;
kfree(tmp);
+ utask->depth--;
if (!chained)
break;
-
- utask->depth--;
-
BUG_ON(!ri);
}
@@ -1696,6 +1808,11 @@ static bool handle_trampoline(struct pt_regs *regs)
return true;
}
+bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+{
+ return false;
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1746,14 +1863,22 @@ static void handle_swbp(struct pt_regs *regs)
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
goto out;
+ /* Tracing handlers use ->utask to communicate with fetch methods */
+ if (!get_utask())
+ goto out;
+
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ goto out;
+
handler_chain(uprobe, regs);
- if (can_skip_sstep(uprobe, regs))
+
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
- /* can_skip_sstep() succeeded, or restart if can't singlestep */
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
@@ -1765,10 +1890,11 @@ out:
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
struct uprobe *uprobe;
+ int err = 0;
uprobe = utask->active_uprobe;
if (utask->state == UTASK_SSTEP_ACK)
- arch_uprobe_post_xol(&uprobe->arch, regs);
+ err = arch_uprobe_post_xol(&uprobe->arch, regs);
else if (utask->state == UTASK_SSTEP_TRAPPED)
arch_uprobe_abort_xol(&uprobe->arch, regs);
else
@@ -1782,6 +1908,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
spin_lock_irq(&current->sighand->siglock);
recalc_sigpending(); /* see uprobe_deny_signal() */
spin_unlock_irq(&current->sighand->siglock);
+
+ if (unlikely(err)) {
+ uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+ }
}
/*
@@ -1859,9 +1990,4 @@ static int __init init_uprobes(void)
return register_die_notifier(&uprobe_exception_nb);
}
-module_init(init_uprobes);
-
-static void __exit exit_uprobes(void)
-{
-}
-module_exit(exit_uprobes);
+__initcall(init_uprobes);