aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-03-04 11:47:50 +0100
committerIngo Molnar <mingo@elte.hu>2010-03-04 11:47:52 +0100
commit4f16d4e0c9a4b20d9f0db365587b96d6001efd7d (patch)
treefa25dcf285b26f1fac2bf267d0d1cd2c4eba90b8 /kernel
parent1e259e0a9982078896f3404240096cbea01daca4 (diff)
parent6630125419ef37ff8781713c5e9d416f2a4ba357 (diff)
Merge branch 'perf/core' into perf/urgent
Merge reason: Switch from pre-merge topical split to the post-merge urgent track Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/futex.c30
-rw-r--r--kernel/hw_breakpoint.c10
-rw-r--r--kernel/kfifo.c3
-rw-r--r--kernel/kgdb.c6
-rw-r--r--kernel/kprobes.c34
-rw-r--r--kernel/perf_event.c642
-rw-r--r--kernel/sched.c12
-rw-r--r--kernel/softirq.c15
-rw-r--r--kernel/softlockup.c15
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c54
-rw-r--r--kernel/trace/trace_event_profile.c52
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_kprobe.c198
-rw-r--r--kernel/trace/trace_stack.c24
-rw-r--r--kernel/trace/trace_syscalls.c76
18 files changed, 666 insertions, 517 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9..e7a35f1039e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return -EINVAL;
WARN_ON(!atomic_read(&pi_state->refcount));
- WARN_ON(pid && pi_state->owner &&
- pi_state->owner->pid != pid);
+
+ /*
+ * When pi_state->owner is NULL then the owner died
+ * and another waiter is on the fly. pi_state->owner
+ * is fixed up by the task which acquires
+ * pi_state->rt_mutex.
+ *
+ * We do not check for pid == 0 which can happen when
+ * the owner died and robust_list_exit() cleared the
+ * TID.
+ */
+ if (pid && pi_state->owner) {
+ /*
+ * Bail out if user space manipulated the
+ * futex value.
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+ }
atomic_inc(&pi_state->refcount);
*ps = pi_state;
@@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
if (!pi_state)
return -EINVAL;
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ return -EINVAL;
+
raw_spin_lock(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
@@ -1971,7 +1995,7 @@ retry_private:
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
- goto out;
+ goto out_put_key;
out_unlock_put_key:
queue_unlock(&q, hb);
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 4d99512ee14..03808ed342a 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
*
* @return a set of per_cpu pointers to perf events
*/
-struct perf_event **
+struct perf_event * __percpu *
register_wide_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered)
{
- struct perf_event **cpu_events, **pevent, *bp;
+ struct perf_event * __percpu *cpu_events, **pevent, *bp;
long err;
int cpu;
cpu_events = alloc_percpu(typeof(*cpu_events));
if (!cpu_events)
- return ERR_PTR(-ENOMEM);
+ return (void __percpu __force *)ERR_PTR(-ENOMEM);
get_online_cpus();
for_each_online_cpu(cpu) {
@@ -451,7 +451,7 @@ fail:
put_online_cpus();
free_percpu(cpu_events);
- return ERR_PTR(err);
+ return (void __percpu __force *)ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
* unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
* @cpu_events: the per cpu set of events to unregister
*/
-void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
+void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
{
int cpu;
struct perf_event **pevent;
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 498cabba225..35edbe22e9a 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
buffer = kmalloc(size, gfp_mask);
if (!buffer) {
- _kfifo_init(fifo, 0, 0);
+ _kfifo_init(fifo, NULL, 0);
return -ENOMEM;
}
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
void kfifo_free(struct kfifo *fifo)
{
kfree(fifo->buffer);
+ _kfifo_init(fifo, NULL, 0);
}
EXPORT_SYMBOL(kfifo_free);
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index c7ade62e4ef..761fdd2b303 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -599,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
/* Signal the primary CPU that we are done: */
atomic_set(&cpu_in_kgdb[cpu], 0);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sync();
clocksource_touch_watchdog();
local_irq_restore(flags);
}
@@ -1453,7 +1453,7 @@ acquirelock:
(kgdb_info[cpu].task &&
kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
atomic_set(&kgdb_active, -1);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sync();
clocksource_touch_watchdog();
local_irq_restore(flags);
@@ -1553,7 +1553,7 @@ kgdb_restore:
}
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sync();
clocksource_touch_watchdog();
local_irq_restore(flags);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a020..ccec774c716 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
#include <linux/debugfs.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
+#include <linux/ftrace.h>
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
@@ -93,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
{"native_get_debugreg",},
{"irq_entries_start",},
{"common_interrupt",},
+ {"mcount",}, /* mcount can be called from everywhere */
{NULL} /* Terminator */
};
@@ -124,30 +126,6 @@ static LIST_HEAD(kprobe_insn_pages);
static int kprobe_garbage_slots;
static int collect_garbage_slots(void);
-static int __kprobes check_safety(void)
-{
- int ret = 0;
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
- ret = freeze_processes();
- if (ret == 0) {
- struct task_struct *p, *q;
- do_each_thread(p, q) {
- if (p != current && p->state == TASK_RUNNING &&
- p->pid != 0) {
- printk("Check failed: %s is running\n",p->comm);
- ret = -1;
- goto loop_end;
- }
- } while_each_thread(p, q);
- }
-loop_end:
- thaw_processes();
-#else
- synchronize_sched();
-#endif
- return ret;
-}
-
/**
* __get_insn_slot() - Find a slot on an executable page for an instruction.
* We allocate an executable page if there's no room on existing ones.
@@ -235,9 +213,8 @@ static int __kprobes collect_garbage_slots(void)
{
struct kprobe_insn_page *kip, *next;
- /* Ensure no-one is preepmted on the garbages */
- if (check_safety())
- return -EAGAIN;
+ /* Ensure no-one is interrupted on the garbages */
+ synchronize_sched();
list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
int i;
@@ -728,7 +705,8 @@ int __kprobes register_kprobe(struct kprobe *p)
preempt_disable();
if (!kernel_text_address((unsigned long) p->addr) ||
- in_kprobes_functions((unsigned long) p->addr)) {
+ in_kprobes_functions((unsigned long) p->addr) ||
+ ftrace_text_reserved(p->addr, p->addr)) {
preempt_enable();
return -EINVAL;
}
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2ae7409bf38..482d5e1d376 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly;
*/
int sysctl_perf_event_paranoid __read_mostly = 1;
-static inline bool perf_paranoid_tracepoint_raw(void)
-{
- return sysctl_perf_event_paranoid > -1;
-}
-
-static inline bool perf_paranoid_cpu(void)
-{
- return sysctl_perf_event_paranoid > 0;
-}
-
-static inline bool perf_paranoid_kernel(void)
-{
- return sysctl_perf_event_paranoid > 1;
-}
-
int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
/*
@@ -98,11 +83,12 @@ void __weak hw_perf_enable(void) { barrier(); }
void __weak hw_perf_event_setup(int cpu) { barrier(); }
void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
+void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
int __weak
hw_perf_group_sched_in(struct perf_event *group_leader,
struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx, int cpu)
+ struct perf_event_context *ctx)
{
return 0;
}
@@ -248,7 +234,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
static inline u64 perf_clock(void)
{
- return cpu_clock(smp_processor_id());
+ return cpu_clock(raw_smp_processor_id());
}
/*
@@ -289,6 +275,15 @@ static void update_event_times(struct perf_event *event)
event->total_time_running = run_end - event->tstamp_running;
}
+static struct list_head *
+ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+{
+ if (event->attr.pinned)
+ return &ctx->pinned_groups;
+ else
+ return &ctx->flexible_groups;
+}
+
/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +298,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
* add it straight to the context's event list, or to the group
* leader's sibling list:
*/
- if (group_leader == event)
- list_add_tail(&event->group_entry, &ctx->group_list);
- else {
+ if (group_leader == event) {
+ struct list_head *list;
+
+ if (is_software_event(event))
+ event->group_flags |= PERF_GROUP_SOFTWARE;
+
+ list = ctx_group_list(event, ctx);
+ list_add_tail(&event->group_entry, list);
+ } else {
+ if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+ !is_software_event(event))
+ group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
}
@@ -355,9 +360,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* to the context list directly:
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+ struct list_head *list;
- list_move_tail(&sibling->group_entry, &ctx->group_list);
+ list = ctx_group_list(event, ctx);
+ list_move_tail(&sibling->group_entry, list);
sibling->group_leader = sibling;
+
+ /* Inherit group flags from the previous leader */
+ sibling->group_flags = event->group_flags;
}
}
@@ -608,14 +618,13 @@ void perf_event_disable(struct perf_event *event)
static int
event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- int cpu)
+ struct perf_event_context *ctx)
{
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
+ event->oncpu = smp_processor_id();
/*
* The new state must be visible before we turn it on in the hardware:
*/
@@ -642,8 +651,7 @@ event_sched_in(struct perf_event *event,
static int
group_sched_in(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- int cpu)
+ struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group;
int ret;
@@ -651,18 +659,18 @@ group_sched_in(struct perf_event *group_event,
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
- ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+ ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
if (ret)
return ret < 0 ? ret : 0;
- if (event_sched_in(group_event, cpuctx, ctx, cpu))
+ if (event_sched_in(group_event, cpuctx, ctx))
return -EAGAIN;
/*
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- if (event_sched_in(event, cpuctx, ctx, cpu)) {
+ if (event_sched_in(event, cpuctx, ctx)) {
partial_group = event;
goto group_error;
}
@@ -686,24 +694,6 @@ group_error:
}
/*
- * Return 1 for a group consisting entirely of software events,
- * 0 if the group contains any hardware events.
- */
-static int is_software_only_group(struct perf_event *leader)
-{
- struct perf_event *event;
-
- if (!is_software_event(leader))
- return 0;
-
- list_for_each_entry(event, &leader->sibling_list, group_entry)
- if (!is_software_event(event))
- return 0;
-
- return 1;
-}
-
-/*
* Work out whether we can put this event group on the CPU now.
*/
static int group_can_go_on(struct perf_event *event,
@@ -713,7 +703,7 @@ static int group_can_go_on(struct perf_event *event,
/*
* Groups consisting entirely of software events can always go on.
*/
- if (is_software_only_group(event))
+ if (event->group_flags & PERF_GROUP_SOFTWARE)
return 1;
/*
* If an exclusive group is already on, no other hardware
@@ -754,7 +744,6 @@ static void __perf_install_in_context(void *info)
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_event *leader = event->group_leader;
- int cpu = smp_processor_id();
int err;
/*
@@ -801,7 +790,7 @@ static void __perf_install_in_context(void *info)
if (!group_can_go_on(event, cpuctx, 1))
err = -EEXIST;
else
- err = event_sched_in(event, cpuctx, ctx, cpu);
+ err = event_sched_in(event, cpuctx, ctx);
if (err) {
/*
@@ -943,11 +932,9 @@ static void __perf_event_enable(void *info)
} else {
perf_disable();
if (event == leader)
- err = group_sched_in(event, cpuctx, ctx,
- smp_processor_id());
+ err = group_sched_in(event, cpuctx, ctx);
else
- err = event_sched_in(event, cpuctx, ctx,
- smp_processor_id());
+ err = event_sched_in(event, cpuctx, ctx);
perf_enable();
}
@@ -1043,8 +1030,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
return 0;
}
-void __perf_event_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x1,
+ EVENT_PINNED = 0x2,
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
+static void ctx_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
{
struct perf_event *event;
@@ -1055,10 +1049,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
update_context_time(ctx);
perf_disable();
- if (ctx->nr_active) {
- list_for_each_entry(event, &ctx->group_list, group_entry)
+ if (!ctx->nr_active)
+ goto out_enable;
+
+ if (event_type & EVENT_PINNED)
+ list_for_each_entry(event, &ctx->pinned_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
- }
+
+ if (event_type & EVENT_FLEXIBLE)
+ list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+ group_sched_out(event, cpuctx, ctx);
+
+ out_enable:
perf_enable();
out:
raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1172,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
* not restart the event.
*/
void perf_event_task_sched_out(struct task_struct *task,
- struct task_struct *next, int cpu)
+ struct task_struct *next)
{
- struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent;
@@ -1220,15 +1222,13 @@ void perf_event_task_sched_out(struct task_struct *task,
rcu_read_unlock();
if (do_switch) {
- __perf_event_sched_out(ctx, cpuctx);
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
cpuctx->task_ctx = NULL;
}
}
-/*
- * Called with IRQs disabled
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
@@ -1238,47 +1238,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- __perf_event_sched_out(ctx, cpuctx);
+ ctx_sched_out(ctx, cpuctx, event_type);
cpuctx->task_ctx = NULL;
}
/*
* Called with IRQs disabled
*/
-static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
+static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+{
+ task_ctx_sched_out(ctx, EVENT_ALL);
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
{
- __perf_event_sched_out(&cpuctx->ctx, cpuctx);
+ ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
static void
-__perf_event_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx, int cpu)
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
{
struct perf_event *event;
- int can_add_hw = 1;
-
- raw_spin_lock(&ctx->lock);
- ctx->is_active = 1;
- if (likely(!ctx->nr_events))
- goto out;
-
- ctx->timestamp = perf_clock();
-
- perf_disable();
- /*
- * First go through the list and put on any pinned groups
- * in order to give them the best chance of going on.
- */
- list_for_each_entry(event, &ctx->group_list, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF ||
- !event->attr.pinned)
+ list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+ if (event->state <= PERF_EVENT_STATE_OFF)
continue;
- if (event->cpu != -1 && event->cpu != cpu)
+ if (event->cpu != -1 && event->cpu != smp_processor_id())
continue;
if (group_can_go_on(event, cpuctx, 1))
- group_sched_in(event, cpuctx, ctx, cpu);
+ group_sched_in(event, cpuctx, ctx);
/*
* If this pinned group hasn't been scheduled,
@@ -1289,32 +1283,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
event->state = PERF_EVENT_STATE_ERROR;
}
}
+}
- list_for_each_entry(event, &ctx->group_list, group_entry) {
- /*
- * Ignore events in OFF or ERROR state, and
- * ignore pinned events since we did them already.
- */
- if (event->state <= PERF_EVENT_STATE_OFF ||
- event->attr.pinned)
- continue;
+static void
+ctx_flexible_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_event *event;
+ int can_add_hw = 1;
+ list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+ /* Ignore events in OFF or ERROR state */
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ continue;
/*
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
- if (event->cpu != -1 && event->cpu != cpu)
+ if (event->cpu != -1 && event->cpu != smp_processor_id())
continue;
if (group_can_go_on(event, cpuctx, can_add_hw))
- if (group_sched_in(event, cpuctx, ctx, cpu))
+ if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
}
+}
+
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
+{
+ raw_spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ if (likely(!ctx->nr_events))
+ goto out;
+
+ ctx->timestamp = perf_clock();
+
+ perf_disable();
+
+ /*
+ * First go through the list and put on any pinned groups
+ * in order to give them the best chance of going on.
+ */
+ if (event_type & EVENT_PINNED)
+ ctx_pinned_sched_in(ctx, cpuctx);
+
+ /* Then walk through the lower prio flexible groups */
+ if (event_type & EVENT_FLEXIBLE)
+ ctx_flexible_sched_in(ctx, cpuctx);
+
perf_enable();
out:
raw_spin_unlock(&ctx->lock);
}
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
+{
+ struct perf_event_context *ctx = &cpuctx->ctx;
+
+ ctx_sched_in(ctx, cpuctx, event_type);
+}
+
+static void task_ctx_sched_in(struct task_struct *task,
+ enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_event_context *ctx = task->perf_event_ctxp;
+
+ if (likely(!ctx))
+ return;
+ if (cpuctx->task_ctx == ctx)
+ return;
+ ctx_sched_in(ctx, cpuctx, event_type);
+ cpuctx->task_ctx = ctx;
+}
/*
* Called from scheduler to add the events of the current task
* with interrupts disabled.
@@ -1326,38 +1371,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
* accessing the event control register. If a NMI hits, then it will
* keep the event running.
*/
-void perf_event_task_sched_in(struct task_struct *task, int cpu)
+void perf_event_task_sched_in(struct task_struct *task)
{
- struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_event_context *ctx = task->perf_event_ctxp;
if (likely(!ctx))
return;
+
if (cpuctx->task_ctx == ctx)
return;
- __perf_event_sched_in(ctx, cpuctx, cpu);
+
+ /*
+ * We want to keep the following priority order:
+ * cpu pinned (that don't need to move), task pinned,
+ * cpu flexible, task flexible.
+ */
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+
cpuctx->task_ctx = ctx;
}
-static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
+ u64 frequency = event->attr.sample_freq;
+ u64 sec = NSEC_PER_SEC;
+ u64 divisor, dividend;
+
+ int count_fls, nsec_fls, frequency_fls, sec_fls;
+
+ count_fls = fls64(count);
+ nsec_fls = fls64(nsec);
+ frequency_fls = fls64(frequency);
+ sec_fls = 30;
+
+ /*
+ * We got @count in @nsec, with a target of sample_freq HZ
+ * the target period becomes:
+ *
+ * @count * 10^9
+ * period = -------------------
+ * @nsec * sample_freq
+ *
+ */
+
+ /*
+ * Reduce accuracy by one bit such that @a and @b converge
+ * to a similar magnitude.
+ */
+#define REDUCE_FLS(a, b) \
+do { \
+ if (a##_fls > b##_fls) { \
+ a >>= 1; \
+ a##_fls--; \
+ } else { \
+ b >>= 1; \
+ b##_fls--; \
+ } \
+} while (0)
+
+ /*
+ * Reduce accuracy until either term fits in a u64, then proceed with
+ * the other, so that finally we can do a u64/u64 division.
+ */
+ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+ REDUCE_FLS(nsec, frequency);
+ REDUCE_FLS(sec, count);
+ }
+
+ if (count_fls + sec_fls > 64) {
+ divisor = nsec * frequency;
+
+ while (count_fls + sec_fls > 64) {
+ REDUCE_FLS(count, sec);
+ divisor >>= 1;
+ }
- __perf_event_sched_in(ctx, cpuctx, cpu);
+ dividend = count * sec;
+ } else {
+ dividend = count * sec;
+
+ while (nsec_fls + frequency_fls > 64) {
+ REDUCE_FLS(nsec, frequency);
+ dividend >>= 1;
+ }
+
+ divisor = nsec * frequency;
+ }
+
+ return div64_u64(dividend, divisor);
}
-#define MAX_INTERRUPTS (~0ULL)
+static void perf_event_stop(struct perf_event *event)
+{
+ if (!event->pmu->stop)
+ return event->pmu->disable(event);
-static void perf_log_throttle(struct perf_event *event, int enable);
+ return event->pmu->stop(event);
+}
+
+static int perf_event_start(struct perf_event *event)
+{
+ if (!event->pmu->start)
+ return event->pmu->enable(event);
-static void perf_adjust_period(struct perf_event *event, u64 events)
+ return event->pmu->start(event);
+}
+
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
{
struct hw_perf_event *hwc = &event->hw;
u64 period, sample_period;
s64 delta;
- events *= hwc->sample_period;
- period = div64_u64(events, event->attr.sample_freq);
+ period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period);
delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1503,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
sample_period = 1;
hwc->sample_period = sample_period;
+
+ if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+ perf_disable();
+ perf_event_stop(event);
+ atomic64_set(&hwc->period_left, 0);
+ perf_event_start(event);
+ perf_enable();
+ }
}
static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
{
struct perf_event *event;
struct hw_perf_event *hwc;
- u64 interrupts, freq;
+ u64 interrupts, now;
+ s64 delta;
raw_spin_lock(&ctx->lock);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1395,44 +1539,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
if (interrupts == MAX_INTERRUPTS) {
perf_log_throttle(event, 1);
event->pmu->unthrottle(event);
- interrupts = 2*sysctl_perf_event_sample_rate/HZ;
}
if (!event->attr.freq || !event->attr.sample_freq)
continue;
- /*
- * if the specified freq < HZ then we need to skip ticks
- */
- if (event->attr.sample_freq < HZ) {
- freq = event->attr.sample_freq;
-
- hwc->freq_count += freq;
- hwc->freq_interrupts += interrupts;
-
- if (hwc->freq_count < HZ)
- continue;
-
- interrupts = hwc->freq_interrupts;
- hwc->freq_interrupts = 0;
- hwc->freq_count -= HZ;
- } else
- freq = HZ;
-
- perf_adjust_period(event, freq * interrupts);
+ event->pmu->read(event);
+ now = atomic64_read(&event->count);
+ delta = now - hwc->freq_count_stamp;
+ hwc->freq_count_stamp = now;
- /*
- * In order to avoid being stalled by an (accidental) huge
- * sample period, force reset the sample period if we didn't
- * get any events in this freq period.
- */
- if (!interrupts) {
- perf_disable();
- event->pmu->disable(event);
- atomic64_set(&hwc->period_left, 0);
- event->pmu->enable(event);
- perf_enable();
- }
+ if (delta > 0)
+ perf_adjust_period(event, TICK_NSEC, delta);
}
raw_spin_unlock(&ctx->lock);
}
@@ -1442,26 +1560,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
*/
static void rotate_ctx(struct perf_event_context *ctx)
{
- struct perf_event *event;
-
if (!ctx->nr_events)
return;
raw_spin_lock(&ctx->lock);
- /*
- * Rotate the first entry last (works just fine for group events too):
- */
- perf_disable();
- list_for_each_entry(event, &ctx->group_list, group_entry) {
- list_move_tail(&event->group_entry, &ctx->group_list);
- break;
- }
- perf_enable();
+
+ /* Rotate the first entry last of non-pinned groups */
+ list_rotate_left(&ctx->flexible_groups);
raw_spin_unlock(&ctx->lock);
}
-void perf_event_task_tick(struct task_struct *curr, int cpu)
+void perf_event_task_tick(struct task_struct *curr)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
@@ -1469,24 +1579,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
if (!atomic_read(&nr_events))
return;
- cpuctx = &per_cpu(perf_cpu_context, cpu);
+ cpuctx = &__get_cpu_var(perf_cpu_context);
ctx = curr->perf_event_ctxp;
+ perf_disable();
+
perf_ctx_adjust_freq(&cpuctx->ctx);
if (ctx)
perf_ctx_adjust_freq(ctx);
- perf_event_cpu_sched_out(cpuctx);
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (ctx)
- __perf_event_task_sched_out(ctx);
+ task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx);
if (ctx)
rotate_ctx(ctx);
- perf_event_cpu_sched_in(cpuctx, cpu);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
if (ctx)
- perf_event_task_sched_in(curr, cpu);
+ task_ctx_sched_in(curr, EVENT_FLEXIBLE);
+
+ perf_enable();
+}
+
+static int event_enable_on_exec(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ if (!event->attr.enable_on_exec)
+ return 0;
+
+ event->attr.enable_on_exec = 0;
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ return 0;
+
+ __perf_event_mark_enabled(event, ctx);
+
+ return 1;
}
/*
@@ -1499,6 +1628,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
struct perf_event *event;
unsigned long flags;
int enabled = 0;
+ int ret;
local_irq_save(flags);
ctx = task->perf_event_ctxp;
@@ -1509,14 +1639,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
raw_spin_lock(&ctx->lock);
- list_for_each_entry(event, &ctx->group_list, group_entry) {
- if (!event->attr.enable_on_exec)
- continue;
- event->attr.enable_on_exec = 0;
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
- continue;
- __perf_event_mark_enabled(event, ctx);
- enabled = 1;
+ list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+ ret = event_enable_on_exec(event, ctx);
+ if (ret)
+ enabled = 1;
+ }
+
+ list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+ ret = event_enable_on_exec(event, ctx);
+