From 56962b4449af34070bb1994621ef4f0265eed4d8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 30 Jun 2010 23:03:51 +0200
Subject: perf: Generalize some arch callchain code

- Most archs use one callchain buffer per cpu, except x86 that needs
  to deal with NMIs. Provide a default perf_callchain_buffer()
  implementation that x86 overrides.

- Centralize all the kernel/user regs handling and invoke new arch
  handlers from there: perf_callchain_user() / perf_callchain_kernel()
  That avoid all the user_mode(), current->mm checks and so...

- Invert some parameters in perf_callchain_*() helpers: entry to the
  left, regs to the right, following the traditional (dst, src).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Tested-by: Will Deacon <will.deacon@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: David Miller <davem@davemloft.net>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Borislav Petkov <bp@amd64.org>
---
 kernel/perf_event.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index c772a3d4000..02efde6c879 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2937,13 +2937,49 @@ void perf_event_do_pending(void)
 	__perf_pending_run();
 }
 
+DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
+
 /*
  * Callchain support -- arch specific
  */
 
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+__weak struct perf_callchain_entry *perf_callchain_buffer(void)
 {
-	return NULL;
+	return &__get_cpu_var(perf_callchain_entry);
+}
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+				  struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+				struct pt_regs *regs)
+{
+}
+
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry;
+
+	entry = perf_callchain_buffer();
+	if (!entry)
+		return NULL;
+
+	entry->nr = 0;
+
+	if (!user_mode(regs)) {
+		perf_callchain_kernel(entry, regs);
+		if (current->mm)
+			regs = task_pt_regs(current);
+		else
+			regs = NULL;
+	}
+
+	if (regs)
+		perf_callchain_user(entry, regs);
+
+	return entry;
 }
 
 
-- 
cgit v1.2.3-70-g09d2


From f72c1a931e311bb7780fee19e41a89ac42cab50e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 1 Jul 2010 02:31:21 +0200
Subject: perf: Factorize callchain context handling

Store the kernel and user contexts from the generic layer instead
of archs, this gathers some repetitive code.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Tested-by: Will Deacon <will.deacon@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: David Miller <davem@davemloft.net>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Borislav Petkov <bp@amd64.org>
---
 arch/arm/kernel/perf_event.c         | 2 --
 arch/powerpc/kernel/perf_callchain.c | 3 ---
 arch/sh/kernel/perf_callchain.c      | 1 -
 arch/sparc/kernel/perf_event.c       | 3 ---
 arch/x86/kernel/cpu/perf_event.c     | 2 --
 kernel/perf_event.c                  | 5 ++++-
 6 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 0e3bbdb1592..64ca8c3ab94 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -3049,7 +3049,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
 	struct frame_tail *tail;
 
-	perf_callchain_store(entry, PERF_CONTEXT_USER);
 
 	tail = (struct frame_tail *)regs->ARM_fp - 1;
 
@@ -3076,7 +3075,6 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
 	struct stackframe fr;
 
-	perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
 	fr.fp = regs->ARM_fp;
 	fr.sp = regs->ARM_sp;
 	fr.lr = regs->ARM_lr;
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
index f7a85ede840..d05ae4204bb 100644
--- a/arch/powerpc/kernel/perf_callchain.c
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -57,7 +57,6 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	lr = regs->link;
 	sp = regs->gpr[1];
-	perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
 	perf_callchain_store(entry, regs->nip);
 
 	if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
@@ -234,7 +233,6 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	next_ip = regs->nip;
 	lr = regs->link;
 	sp = regs->gpr[1];
-	perf_callchain_store(entry, PERF_CONTEXT_USER);
 	perf_callchain_store(entry, next_ip);
 
 	for (;;) {
@@ -435,7 +433,6 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	next_ip = regs->nip;
 	lr = regs->link;
 	sp = regs->gpr[1];
-	perf_callchain_store(entry, PERF_CONTEXT_USER);
 	perf_callchain_store(entry, next_ip);
 
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c
index ef076a91292..d5ca1ef50fa 100644
--- a/arch/sh/kernel/perf_callchain.c
+++ b/arch/sh/kernel/perf_callchain.c
@@ -47,7 +47,6 @@ static const struct stacktrace_ops callchain_ops = {
 void
 perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-	perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
 	perf_callchain_store(entry, regs->pc);
 
 	unwind_stack(NULL, regs, NULL, &callchain_ops, entry);
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 460162d74ab..4bc40293857 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1293,7 +1293,6 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 
 	stack_trace_flush();
 
-	perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
 	perf_callchain_store(entry, regs->tpc);
 
 	ksp = regs->u_regs[UREG_I6];
@@ -1337,7 +1336,6 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 {
 	unsigned long ufp;
 
-	perf_callchain_store(entry, PERF_CONTEXT_USER);
 	perf_callchain_store(entry, regs->tpc);
 
 	ufp = regs->u_regs[UREG_I6] + STACK_BIAS;
@@ -1360,7 +1358,6 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 {
 	unsigned long ufp;
 
-	perf_callchain_store(entry, PERF_CONTEXT_USER);
 	perf_callchain_store(entry, regs->tpc);
 
 	ufp = regs->u_regs[UREG_I6] & 0xffffffffUL;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 39f8421b86e..a3c922288cc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1608,7 +1608,6 @@ static const struct stacktrace_ops backtrace_ops = {
 void
 perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-	perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
 	perf_callchain_store(entry, regs->ip);
 
 	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
@@ -1660,7 +1659,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	fp = (void __user *)regs->bp;
 
-	perf_callchain_store(entry, PERF_CONTEXT_USER);
 	perf_callchain_store(entry, regs->ip);
 
 	if (perf_callchain_user32(regs, entry))
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 02efde6c879..615d024894c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2969,6 +2969,7 @@ static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	entry->nr = 0;
 
 	if (!user_mode(regs)) {
+		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
 		perf_callchain_kernel(entry, regs);
 		if (current->mm)
 			regs = task_pt_regs(current);
@@ -2976,8 +2977,10 @@ static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 			regs = NULL;
 	}
 
-	if (regs)
+	if (regs) {
+		perf_callchain_store(entry, PERF_CONTEXT_USER);
 		perf_callchain_user(entry, regs);
+	}
 
 	return entry;
 }
-- 
cgit v1.2.3-70-g09d2


From 927c7a9e92c4f69097a6e9e086d11fc2f8a5b40b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 1 Jul 2010 16:20:36 +0200
Subject: perf: Fix race in callchains

Now that software events don't have interrupt disabled anymore in
the event path, callchains can nest on any context. So seperating
nmi and others contexts in two buffers has become racy.

Fix this by providing one buffer per nesting level. Given the size
of the callchain entries (2040 bytes * 4), we now need to allocate
them dynamically.

v2: Fixed put_callchain_entry call after recursion.
    Fix the type of the recursion, it must be an array.

v3: Use a manual pr cpu allocation (temporary solution until NMIs
    can safely access vmalloc'ed memory).
    Do a better separation between callchain reference tracking and
    allocation. Make the "put" path lockless for non-release cases.

v4: Protect the callchain buffers with rcu.

v5: Do the cpu buffers allocations node affine.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Tested-by: Will Deacon <will.deacon@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David Miller <davem@davemloft.net>
Cc: Borislav Petkov <bp@amd64.org>
---
 arch/x86/kernel/cpu/perf_event.c |  22 ++-
 include/linux/perf_event.h       |   1 -
 kernel/perf_event.c              | 298 ++++++++++++++++++++++++++++++---------
 3 files changed, 238 insertions(+), 83 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index a3c922288cc..8e91cf34a9c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1608,6 +1608,11 @@ static const struct stacktrace_ops backtrace_ops = {
 void
 perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* TODO: We don't support guest os callchain now */
+		return NULL;
+	}
+
 	perf_callchain_store(entry, regs->ip);
 
 	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
@@ -1656,6 +1661,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	struct stack_frame frame;
 	const void __user *fp;
 
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* TODO: We don't support guest os callchain now */
+		return NULL;
+	}
 
 	fp = (void __user *)regs->bp;
 
@@ -1681,19 +1690,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	}
 }
 
-struct perf_callchain_entry *perf_callchain_buffer(void)
-{
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		/* TODO: We don't support guest os callchain now */
-		return NULL;
-	}
-
-	if (in_nmi())
-		return &__get_cpu_var(perf_callchain_entry_nmi);
-
-	return &__get_cpu_var(perf_callchain_entry);
-}
-
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
 	unsigned long ip;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4db61dded38..d7e8ea69086 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -983,7 +983,6 @@ extern void perf_callchain_user(struct perf_callchain_entry *entry,
 				struct pt_regs *regs);
 extern void perf_callchain_kernel(struct perf_callchain_entry *entry,
 				  struct pt_regs *regs);
-extern struct perf_callchain_entry *perf_callchain_buffer(void);
 
 
 static inline void
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 615d024894c..75ab8a2df6b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1763,6 +1763,216 @@ static u64 perf_event_read(struct perf_event *event)
 	return perf_event_count(event);
 }
 
+/*
+ * Callchain support
+ */
+
+struct callchain_cpus_entries {
+	struct rcu_head			rcu_head;
+	struct perf_callchain_entry	*cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[4]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+				  struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+				struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+	struct callchain_cpus_entries *entries;
+	int cpu;
+
+	entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+	for_each_possible_cpu(cpu)
+		kfree(entries->cpu_entries[cpu]);
+
+	kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+	struct callchain_cpus_entries *entries;
+
+	entries = callchain_cpus_entries;
+	rcu_assign_pointer(callchain_cpus_entries, NULL);
+	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+	int cpu;
+	int size;
+	struct callchain_cpus_entries *entries;
+
+	/*
+	 * We can't use the percpu allocation API for data that can be
+	 * accessed from NMI. Use a temporary manual per cpu allocation
+	 * until that gets sorted out.
+	 */
+	size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
+		num_possible_cpus();
+
+	entries = kzalloc(size, GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	size = sizeof(struct perf_callchain_entry) * 4;
+
+	for_each_possible_cpu(cpu) {
+		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+							 cpu_to_node(cpu));
+		if (!entries->cpu_entries[cpu])
+			goto fail;
+	}
+
+	rcu_assign_pointer(callchain_cpus_entries, entries);
+
+	return 0;
+
+fail:
+	for_each_possible_cpu(cpu)
+		kfree(entries->cpu_entries[cpu]);
+	kfree(entries);
+
+	return -ENOMEM;
+}
+
+static int get_callchain_buffers(void)
+{
+	int err = 0;
+	int count;
+
+	mutex_lock(&callchain_mutex);
+
+	count = atomic_inc_return(&nr_callchain_events);
+	if (WARN_ON_ONCE(count < 1)) {
+		err = -EINVAL;
+		goto exit;
+	}
+
+	if (count > 1) {
+		/* If the allocation failed, give up */
+		if (!callchain_cpus_entries)
+			err = -ENOMEM;
+		goto exit;
+	}
+
+	err = alloc_callchain_buffers();
+	if (err)
+		release_callchain_buffers();
+exit:
+	mutex_unlock(&callchain_mutex);
+
+	return err;
+}
+
+static void put_callchain_buffers(void)
+{
+	if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+		release_callchain_buffers();
+		mutex_unlock(&callchain_mutex);
+	}
+}
+
+static int get_recursion_context(int *recursion)
+{
+	int rctx;
+
+	if (in_nmi())
+		rctx = 3;
+	else if (in_irq())
+		rctx = 2;
+	else if (in_softirq())
+		rctx = 1;
+	else
+		rctx = 0;
+
+	if (recursion[rctx])
+		return -1;
+
+	recursion[rctx]++;
+	barrier();
+
+	return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+	barrier();
+	recursion[rctx]--;
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+	int cpu;
+	struct callchain_cpus_entries *entries;
+
+	*rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+	if (*rctx == -1)
+		return NULL;
+
+	entries = rcu_dereference(callchain_cpus_entries);
+	if (!entries)
+		return NULL;
+
+	cpu = smp_processor_id();
+
+	return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	int rctx;
+	struct perf_callchain_entry *entry;
+
+
+	entry = get_callchain_entry(&rctx);
+	if (rctx == -1)
+		return NULL;
+
+	if (!entry)
+		goto exit_put;
+
+	entry->nr = 0;
+
+	if (!user_mode(regs)) {
+		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+		perf_callchain_kernel(entry, regs);
+		if (current->mm)
+			regs = task_pt_regs(current);
+		else
+			regs = NULL;
+	}
+
+	if (regs) {
+		perf_callchain_store(entry, PERF_CONTEXT_USER);
+		perf_callchain_user(entry, regs);
+	}
+
+exit_put:
+	put_callchain_entry(rctx);
+
+	return entry;
+}
+
 /*
  * Initialize the perf_event context in a task_struct:
  */
@@ -1895,6 +2105,8 @@ static void free_event(struct perf_event *event)
 			atomic_dec(&nr_comm_events);
 		if (event->attr.task)
 			atomic_dec(&nr_task_events);
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+			put_callchain_buffers();
 	}
 
 	if (event->buffer) {
@@ -2937,55 +3149,6 @@ void perf_event_do_pending(void)
 	__perf_pending_run();
 }
 
-DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
-
-/*
- * Callchain support -- arch specific
- */
-
-__weak struct perf_callchain_entry *perf_callchain_buffer(void)
-{
-	return &__get_cpu_var(perf_callchain_entry);
-}
-
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
-				  struct pt_regs *regs)
-{
-}
-
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
-				struct pt_regs *regs)
-{
-}
-
-static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-	struct perf_callchain_entry *entry;
-
-	entry = perf_callchain_buffer();
-	if (!entry)
-		return NULL;
-
-	entry->nr = 0;
-
-	if (!user_mode(regs)) {
-		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-		perf_callchain_kernel(entry, regs);
-		if (current->mm)
-			regs = task_pt_regs(current);
-		else
-			regs = NULL;
-	}
-
-	if (regs) {
-		perf_callchain_store(entry, PERF_CONTEXT_USER);
-		perf_callchain_user(entry, regs);
-	}
-
-	return entry;
-}
-
-
 /*
  * We assume there is only KVM supporting the callbacks.
  * Later on, we might change it to a list if there is
@@ -3480,14 +3643,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
 	struct perf_output_handle handle;
 	struct perf_event_header header;
 
+	/* protect the callchain buffers */
+	rcu_read_lock();
+
 	perf_prepare_sample(&header, data, event, regs);
 
 	if (perf_output_begin(&handle, event, header.size, nmi, 1))
-		return;
+		goto exit;
 
 	perf_output_sample(&handle, &header, data, event);
 
 	perf_output_end(&handle);
+
+exit:
+	rcu_read_unlock();
 }
 
 /*
@@ -4243,32 +4412,16 @@ end:
 int perf_swevent_get_recursion_context(void)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	int rctx;
 
-	if (in_nmi())
-		rctx = 3;
-	else if (in_irq())
-		rctx = 2;
-	else if (in_softirq())
-		rctx = 1;
-	else
-		rctx = 0;
-
-	if (cpuctx->recursion[rctx])
-		return -1;
-
-	cpuctx->recursion[rctx]++;
-	barrier();
-
-	return rctx;
+	return get_recursion_context(cpuctx->recursion);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
 void inline perf_swevent_put_recursion_context(int rctx)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	barrier();
-	cpuctx->recursion[rctx]--;
+
+	put_recursion_context(cpuctx->recursion, rctx);
 }
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4968,6 +5121,13 @@ done:
 			atomic_inc(&nr_comm_events);
 		if (event->attr.task)
 			atomic_inc(&nr_task_events);
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+			err = get_callchain_buffers();
+			if (err) {
+				free_event(event);
+				return ERR_PTR(err);
+			}
+		}
 	}
 
 	return event;
-- 
cgit v1.2.3-70-g09d2


From 7ae07ea3a48d30689ee037cb136bc21f0b37d8ae Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 14 Aug 2010 20:45:13 +0200
Subject: perf: Humanize the number of contexts

Instead of hardcoding the number of contexts for the recursions
barriers, define a cpp constant to make the code more
self-explanatory.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
---
 include/linux/perf_event.h      | 14 ++++++++------
 kernel/perf_event.c             |  4 ++--
 kernel/trace/trace_event_perf.c |  8 ++++----
 3 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d7e8ea69086..ae6fa605092 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -808,6 +808,12 @@ struct perf_event_context {
 	struct rcu_head			rcu_head;
 };
 
+/*
+ * Number of contexts where an event can trigger:
+ * 	task, softirq, hardirq, nmi.
+ */
+#define PERF_NR_CONTEXTS	4
+
 /**
  * struct perf_event_cpu_context - per cpu event context structure
  */
@@ -821,12 +827,8 @@ struct perf_cpu_context {
 	struct mutex			hlist_mutex;
 	int				hlist_refcount;
 
-	/*
-	 * Recursion avoidance:
-	 *
-	 * task, softirq, irq, nmi context
-	 */
-	int				recursion[4];
+	/* Recursion avoidance in each contexts */
+	int				recursion[PERF_NR_CONTEXTS];
 };
 
 struct perf_output_handle {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 75ab8a2df6b..f416aef242c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1772,7 +1772,7 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
-static DEFINE_PER_CPU(int, callchain_recursion[4]);
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
 struct callchain_cpus_entries *callchain_cpus_entries;
@@ -1828,7 +1828,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * 4;
+	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 000e6e85b44..db2eae2efcf 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
 #include <linux/kprobes.h>
 #include "trace.h"
 
-static char *perf_trace_buf[4];
+static char *perf_trace_buf[PERF_NR_CONTEXTS];
 
 /*
  * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -45,7 +45,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 		char *buf;
 		int i;
 
-		for (i = 0; i < 4; i++) {
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			buf = (char *)alloc_percpu(perf_trace_t);
 			if (!buf)
 				goto fail;
@@ -65,7 +65,7 @@ fail:
 	if (!total_ref_count) {
 		int i;
 
-		for (i = 0; i < 4; i++) {
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			free_percpu(perf_trace_buf[i]);
 			perf_trace_buf[i] = NULL;
 		}
@@ -140,7 +140,7 @@ void perf_trace_destroy(struct perf_event *p_event)
 	tp_event->perf_events = NULL;
 
 	if (!--total_ref_count) {
-		for (i = 0; i < 4; i++) {
+		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			free_percpu(perf_trace_buf[i]);
 			perf_trace_buf[i] = NULL;
 		}
-- 
cgit v1.2.3-70-g09d2


From fa66f07aa1f0950e1dc78b7ab39728b3f8aa77a1 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Thu, 26 Aug 2010 16:40:01 +0200
Subject: perf_events: Fix time tracking for events with pid != -1 and cpu !=
 -1

Per-thread events with a cpu filter, i.e., cpu != -1, were not
reporting correct timings when the thread never ran on the
monitored cpu. The time enabled was reported as a negative
value.

This patch fixes the problem by updating tstamp_stopped,
tstamp_running in event_sched_out() for events with filters and
which are marked as INACTIVE.

The function group_sched_out() is modified to systematically
call into event_sched_out() to avoid duplicating the timing
adjustment code twice.

With the patch, I now get:

$ task_cpu -i -e unhalted_core_cycles,unhalted_core_cycles
noploop 2 noploop for 2 seconds
CPU0 0		   unhalted_core_cycles (ena=1,991,136,594, run=0)
CPU0 0		   unhalted_core_cycles (ena=1,991,136,594, run=0)

CPU1 0		   unhalted_core_cycles (ena=1,991,136,594, run=0)
CPU1 0		   unhalted_core_cycles (ena=1,991,136,594, run=0)

CPU2 0		   unhalted_core_cycles (ena=1,991,136,594, run=0)
CPU2 0		   unhalted_core_cycles (ena=1,991,136,594, run=0)

CPU3 4,747,990,931 unhalted_core_cycles (ena=1,991,136,594, run=1,991,136,594)
CPU3 4,747,990,931 unhalted_core_cycles (ena=1,991,136,594, run=1,991,136,594)

Signed-off-by: Stephane Eranian <eranian@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org
Cc: davem@davemloft.net
Cc: fweisbec@gmail.com
Cc: perfmon2-devel@lists.sf.net
Cc: eranian@google.com
LKML-Reference: <4c76802d.aae9d80a.115d.70fe@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 403d1804b19..657555a5f30 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -402,11 +402,31 @@ static void perf_group_detach(struct perf_event *event)
 	}
 }
 
+static inline int
+event_filter_match(struct perf_event *event)
+{
+	return event->cpu == -1 || event->cpu == smp_processor_id();
+}
+
 static void
 event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
+	u64 delta;
+	/*
+	 * An event which could not be activated because of
+	 * filter mismatch still needs to have its timings
+	 * maintained, otherwise bogus information is return
+	 * via read() for time_enabled, time_running:
+	 */
+	if (event->state == PERF_EVENT_STATE_INACTIVE
+	    && !event_filter_match(event)) {
+		delta = ctx->time - event->tstamp_stopped;
+		event->tstamp_running += delta;
+		event->tstamp_stopped = ctx->time;
+	}
+
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return;
 
@@ -432,9 +452,7 @@ group_sched_out(struct perf_event *group_event,
 		struct perf_event_context *ctx)
 {
 	struct perf_event *event;
-
-	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
-		return;
+	int state = group_event->state;
 
 	event_sched_out(group_event, cpuctx, ctx);
 
@@ -444,7 +462,7 @@ group_sched_out(struct perf_event *group_event,
 	list_for_each_entry(event, &group_event->sibling_list, group_entry)
 		event_sched_out(event, cpuctx, ctx);
 
-	if (group_event->attr.exclusive)
+	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 5e11637e2c929e34dcc0fbbfb48bdb638937701a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Jun 2010 13:35:08 +0200
Subject: perf: Fix CPU hotplug

Since we have UP_PREPARE, we should also have UP_CANCELED.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 657555a5f30..db5b5606468 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5761,15 +5761,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
 
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
 		perf_event_init_cpu(cpu);
 		break;
 
+	case CPU_UP_CANCELED:
 	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
 		perf_event_exit_cpu(cpu);
 		break;
 
-- 
cgit v1.2.3-70-g09d2


From 51b0fe39549a04858001922919ab355dee9bdfcf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Jun 2010 13:35:57 +0200
Subject: perf: Deconstify struct pmu

sed -ie 's/const struct pmu\>/struct pmu/g' `git grep -l "const struct pmu\>"`

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/perf_event.c           |  4 ++--
 arch/arm/kernel/perf_event.c             |  2 +-
 arch/powerpc/kernel/perf_event.c         |  8 ++++----
 arch/powerpc/kernel/perf_event_fsl_emb.c |  2 +-
 arch/sh/kernel/perf_event.c              |  4 ++--
 arch/sparc/kernel/perf_event.c           | 10 +++++-----
 arch/x86/kernel/cpu/perf_event.c         | 14 +++++++-------
 include/linux/perf_event.h               | 10 +++++-----
 kernel/perf_event.c                      | 26 +++++++++++++-------------
 9 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 51c39fa4169..56fa4159038 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -642,7 +642,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	return 0;
 }
 
-static const struct pmu pmu = {
+static struct pmu pmu = {
 	.enable		= alpha_pmu_enable,
 	.disable	= alpha_pmu_disable,
 	.read		= alpha_pmu_read,
@@ -653,7 +653,7 @@ static const struct pmu pmu = {
 /*
  * Main entry point to initialise a HW performance event.
  */
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+struct pmu *hw_perf_event_init(struct perf_event *event)
 {
 	int err;
 
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 64ca8c3ab94..0671e92c511 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -491,7 +491,7 @@ __hw_perf_event_init(struct perf_event *event)
 	return err;
 }
 
-const struct pmu *
+struct pmu *
 hw_perf_event_init(struct perf_event *event)
 {
 	int err = 0;
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index d301a30445e..5f78681ad90 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -857,7 +857,7 @@ static void power_pmu_unthrottle(struct perf_event *event)
  * Set the flag to make pmu::enable() not perform the
  * schedulability test, it will be performed at commit time
  */
-void power_pmu_start_txn(const struct pmu *pmu)
+void power_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
@@ -870,7 +870,7 @@ void power_pmu_start_txn(const struct pmu *pmu)
  * Clear the flag and pmu::enable() will perform the
  * schedulability test.
  */
-void power_pmu_cancel_txn(const struct pmu *pmu)
+void power_pmu_cancel_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
@@ -882,7 +882,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu)
  * Perform the group schedulability test as a whole
  * Return 0 if success
  */
-int power_pmu_commit_txn(const struct pmu *pmu)
+int power_pmu_commit_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	long i, n;
@@ -1014,7 +1014,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
 	return 0;
 }
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+struct pmu *hw_perf_event_init(struct perf_event *event)
 {
 	u64 ev;
 	unsigned long flags;
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c
index 1ba45471ae4..d7619b5e7a6 100644
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -428,7 +428,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
 	return 0;
 }
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+struct pmu *hw_perf_event_init(struct perf_event *event)
 {
 	u64 ev;
 	struct perf_event *events[MAX_HWEVENTS];
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 7a3dc356725..395572c94c6 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -257,13 +257,13 @@ static void sh_pmu_read(struct perf_event *event)
 	sh_perf_event_update(event, &event->hw, event->hw.idx);
 }
 
-static const struct pmu pmu = {
+static struct pmu pmu = {
 	.enable		= sh_pmu_enable,
 	.disable	= sh_pmu_disable,
 	.read		= sh_pmu_read,
 };
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+struct pmu *hw_perf_event_init(struct perf_event *event)
 {
 	int err = __hw_perf_event_init(event);
 	if (unlikely(err)) {
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 4bc40293857..481b894a501 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1099,7 +1099,7 @@ static int __hw_perf_event_init(struct perf_event *event)
  * Set the flag to make pmu::enable() not perform the
  * schedulability test, it will be performed at commit time
  */
-static void sparc_pmu_start_txn(const struct pmu *pmu)
+static void sparc_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
@@ -1111,7 +1111,7 @@ static void sparc_pmu_start_txn(const struct pmu *pmu)
  * Clear the flag and pmu::enable() will perform the
  * schedulability test.
  */
-static void sparc_pmu_cancel_txn(const struct pmu *pmu)
+static void sparc_pmu_cancel_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
@@ -1123,7 +1123,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu)
  * Perform the group schedulability test as a whole
  * Return 0 if success
  */
-static int sparc_pmu_commit_txn(const struct pmu *pmu)
+static int sparc_pmu_commit_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int n;
@@ -1142,7 +1142,7 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu)
 	return 0;
 }
 
-static const struct pmu pmu = {
+static struct pmu pmu = {
 	.enable		= sparc_pmu_enable,
 	.disable	= sparc_pmu_disable,
 	.read		= sparc_pmu_read,
@@ -1152,7 +1152,7 @@ static const struct pmu pmu = {
 	.commit_txn	= sparc_pmu_commit_txn,
 };
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+struct pmu *hw_perf_event_init(struct perf_event *event)
 {
 	int err = __hw_perf_event_init(event);
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index de6569c04cd..fdd97f2e996 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -618,7 +618,7 @@ static void x86_pmu_enable_all(int added)
 	}
 }
 
-static const struct pmu pmu;
+static struct pmu pmu;
 
 static inline int is_x86_event(struct perf_event *event)
 {
@@ -1427,7 +1427,7 @@ static inline void x86_pmu_read(struct perf_event *event)
  * Set the flag to make pmu::enable() not perform the
  * schedulability test, it will be performed at commit time
  */
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -1440,7 +1440,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
  * Clear the flag and pmu::enable() will perform the
  * schedulability test.
  */
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -1457,7 +1457,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
  * Perform the group schedulability test as a whole
  * Return 0 if success
  */
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int assign[X86_PMC_IDX_MAX];
@@ -1483,7 +1483,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	return 0;
 }
 
-static const struct pmu pmu = {
+static struct pmu pmu = {
 	.enable		= x86_pmu_enable,
 	.disable	= x86_pmu_disable,
 	.start		= x86_pmu_start,
@@ -1569,9 +1569,9 @@ out:
 	return ret;
 }
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+struct pmu *hw_perf_event_init(struct perf_event *event)
 {
-	const struct pmu *tmp;
+	struct pmu *tmp;
 	int err;
 
 	err = __hw_perf_event_init(event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 000610c4de7..09d048b5211 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -578,19 +578,19 @@ struct pmu {
 	 * Start the transaction, after this ->enable() doesn't need
 	 * to do schedulability tests.
 	 */
-	void (*start_txn)	(const struct pmu *pmu);
+	void (*start_txn)	(struct pmu *pmu);
 	/*
 	 * If ->start_txn() disabled the ->enable() schedulability test
 	 * then ->commit_txn() is required to perform one. On success
 	 * the transaction is closed. On error the transaction is kept
 	 * open until ->cancel_txn() is called.
 	 */
-	int  (*commit_txn)	(const struct pmu *pmu);
+	int  (*commit_txn)	(struct pmu *pmu);
 	/*
 	 * Will cancel the transaction, assumes ->disable() is called for
 	 * each successfull ->enable() during the transaction.
 	 */
-	void (*cancel_txn)	(const struct pmu *pmu);
+	void (*cancel_txn)	(struct pmu *pmu);
 };
 
 /**
@@ -669,7 +669,7 @@ struct perf_event {
 	int				nr_siblings;
 	int				group_flags;
 	struct perf_event		*group_leader;
-	const struct pmu		*pmu;
+	struct pmu		*pmu;
 
 	enum perf_event_active_state	state;
 	unsigned int			attach_state;
@@ -849,7 +849,7 @@ struct perf_output_handle {
  */
 extern int perf_max_events;
 
-extern const struct pmu *hw_perf_event_init(struct perf_event *event);
+extern struct pmu *hw_perf_event_init(struct perf_event *event);
 
 extern void perf_event_task_sched_in(struct task_struct *task);
 extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2d74f31220a..fb46fd13f31 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -75,7 +75,7 @@ static DEFINE_SPINLOCK(perf_resource_lock);
 /*
  * Architecture provided APIs - weak aliases:
  */
-extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+extern __weak struct pmu *hw_perf_event_init(struct perf_event *event)
 {
 	return NULL;
 }
@@ -691,7 +691,7 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group = NULL;
-	const struct pmu *pmu = group_event->pmu;
+	struct pmu *pmu = group_event->pmu;
 	bool txn = false;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
@@ -4501,7 +4501,7 @@ static int perf_swevent_int(struct perf_event *event)
 	return 0;
 }
 
-static const struct pmu perf_ops_generic = {
+static struct pmu perf_ops_generic = {
 	.enable		= perf_swevent_enable,
 	.disable	= perf_swevent_disable,
 	.start		= perf_swevent_int,
@@ -4614,7 +4614,7 @@ static void cpu_clock_perf_event_read(struct perf_event *event)
 	cpu_clock_perf_event_update(event);
 }
 
-static const struct pmu perf_ops_cpu_clock = {
+static struct pmu perf_ops_cpu_clock = {
 	.enable		= cpu_clock_perf_event_enable,
 	.disable	= cpu_clock_perf_event_disable,
 	.read		= cpu_clock_perf_event_read,
@@ -4671,7 +4671,7 @@ static void task_clock_perf_event_read(struct perf_event *event)
 	task_clock_perf_event_update(event, time);
 }
 
-static const struct pmu perf_ops_task_clock = {
+static struct pmu perf_ops_task_clock = {
 	.enable		= task_clock_perf_event_enable,
 	.disable	= task_clock_perf_event_disable,
 	.read		= task_clock_perf_event_read,
@@ -4785,7 +4785,7 @@ static int swevent_hlist_get(struct perf_event *event)
 
 #ifdef CONFIG_EVENT_TRACING
 
-static const struct pmu perf_ops_tracepoint = {
+static struct pmu perf_ops_tracepoint = {
 	.enable		= perf_trace_enable,
 	.disable	= perf_trace_disable,
 	.start		= perf_swevent_int,
@@ -4849,7 +4849,7 @@ static void tp_perf_event_destroy(struct perf_event *event)
 	perf_trace_destroy(event);
 }
 
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static struct pmu *tp_perf_event_init(struct perf_event *event)
 {
 	int err;
 
@@ -4896,7 +4896,7 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static struct pmu *tp_perf_event_init(struct perf_event *event)
 {
 	return NULL;
 }
@@ -4918,7 +4918,7 @@ static void bp_perf_event_destroy(struct perf_event *event)
 	release_bp_slot(event);
 }
 
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+static struct pmu *bp_perf_event_init(struct perf_event *bp)
 {
 	int err;
 
@@ -4942,7 +4942,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
 		perf_swevent_add(bp, 1, 1, &sample, regs);
 }
 #else
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+static struct pmu *bp_perf_event_init(struct perf_event *bp)
 {
 	return NULL;
 }
@@ -4964,9 +4964,9 @@ static void sw_perf_event_destroy(struct perf_event *event)
 	swevent_hlist_put(event);
 }
 
-static const struct pmu *sw_perf_event_init(struct perf_event *event)
+static struct pmu *sw_perf_event_init(struct perf_event *event)
 {
-	const struct pmu *pmu = NULL;
+	struct pmu *pmu = NULL;
 	u64 event_id = event->attr.config;
 
 	/*
@@ -5028,7 +5028,7 @@ perf_event_alloc(struct perf_event_attr *attr,
 		   perf_overflow_handler_t overflow_handler,
 		   gfp_t gfpflags)
 {
-	const struct pmu *pmu;
+	struct pmu *pmu;
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
 	long err;
-- 
cgit v1.2.3-70-g09d2


From b0a873ebbf87bf38bf70b5e39a7cadc96099fa13 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Jun 2010 13:35:08 +0200
Subject: perf: Register PMU implementations

Simple registration interface for struct pmu, this provides the
infrastructure for removing all the weak functions.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/perf_event.c           |  37 +-
 arch/arm/kernel/perf_event.c             |  38 +-
 arch/powerpc/kernel/perf_event.c         |  46 +--
 arch/powerpc/kernel/perf_event_fsl_emb.c |  37 +-
 arch/sh/kernel/perf_event.c              |  35 +-
 arch/sparc/kernel/perf_event.c           |  29 +-
 arch/x86/kernel/cpu/perf_event.c         |  45 ++-
 include/linux/perf_event.h               |  10 +-
 kernel/hw_breakpoint.c                   |  35 +-
 kernel/perf_event.c                      | 588 +++++++++++++++----------------
 10 files changed, 488 insertions(+), 412 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 56fa4159038..19660b5c298 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -642,34 +642,39 @@ static int __hw_perf_event_init(struct perf_event *event)
 	return 0;
 }
 
-static struct pmu pmu = {
-	.enable		= alpha_pmu_enable,
-	.disable	= alpha_pmu_disable,
-	.read		= alpha_pmu_read,
-	.unthrottle	= alpha_pmu_unthrottle,
-};
-
-
 /*
  * Main entry point to initialise a HW performance event.
  */
-struct pmu *hw_perf_event_init(struct perf_event *event)
+static int alpha_pmu_event_init(struct perf_event *event)
 {
 	int err;
 
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
 	if (!alpha_pmu)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	/* Do the real initialisation work. */
 	err = __hw_perf_event_init(event);
 
-	if (err)
-		return ERR_PTR(err);
-
-	return &pmu;
+	return err;
 }
 
-
+static struct pmu pmu = {
+	.event_init	= alpha_pmu_event_init,
+	.enable		= alpha_pmu_enable,
+	.disable	= alpha_pmu_disable,
+	.read		= alpha_pmu_read,
+	.unthrottle	= alpha_pmu_unthrottle,
+};
 
 /*
  * Main entry point - enable HW performance counters.
@@ -838,5 +843,7 @@ void __init init_hw_perf_events(void)
 	/* And set up PMU specification */
 	alpha_pmu = &ev67_pmu;
 	perf_max_events = alpha_pmu->num_pmcs;
+
+	perf_pmu_register(&pmu);
 }
 
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 0671e92c511..f62f9db35db 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -306,12 +306,7 @@ out:
 	return err;
 }
 
-static struct pmu pmu = {
-	.enable	    = armpmu_enable,
-	.disable    = armpmu_disable,
-	.unthrottle = armpmu_unthrottle,
-	.read	    = armpmu_read,
-};
+static struct pmu pmu;
 
 static int
 validate_event(struct cpu_hw_events *cpuc,
@@ -491,20 +486,29 @@ __hw_perf_event_init(struct perf_event *event)
 	return err;
 }
 
-struct pmu *
-hw_perf_event_init(struct perf_event *event)
+static int armpmu_event_init(struct perf_event *event)
 {
 	int err = 0;
 
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
 	if (!armpmu)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
 	event->destroy = hw_perf_event_destroy;
 
 	if (!atomic_inc_not_zero(&active_events)) {
 		if (atomic_read(&active_events) > perf_max_events) {
 			atomic_dec(&active_events);
-			return ERR_PTR(-ENOSPC);
+			return -ENOSPC;
 		}
 
 		mutex_lock(&pmu_reserve_mutex);
@@ -518,15 +522,23 @@ hw_perf_event_init(struct perf_event *event)
 	}
 
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	err = __hw_perf_event_init(event);
 	if (err)
 		hw_perf_event_destroy(event);
 
-	return err ? ERR_PTR(err) : &pmu;
+	return err;
 }
 
+static struct pmu pmu = {
+	.event_init = armpmu_event_init,
+	.enable	    = armpmu_enable,
+	.disable    = armpmu_disable,
+	.unthrottle = armpmu_unthrottle,
+	.read	    = armpmu_read,
+};
+
 void
 hw_perf_enable(void)
 {
@@ -2994,6 +3006,8 @@ init_hw_perf_events(void)
 		perf_max_events = -1;
 	}
 
+	perf_pmu_register(&pmu);
+
 	return 0;
 }
 arch_initcall(init_hw_perf_events);
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 5f78681ad90..19131b2614b 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -904,16 +904,6 @@ int power_pmu_commit_txn(struct pmu *pmu)
 	return 0;
 }
 
-struct pmu power_pmu = {
-	.enable		= power_pmu_enable,
-	.disable	= power_pmu_disable,
-	.read		= power_pmu_read,
-	.unthrottle	= power_pmu_unthrottle,
-	.start_txn	= power_pmu_start_txn,
-	.cancel_txn	= power_pmu_cancel_txn,
-	.commit_txn	= power_pmu_commit_txn,
-};
-
 /*
  * Return 1 if we might be able to put event on a limited PMC,
  * or 0 if not.
@@ -1014,7 +1004,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
 	return 0;
 }
 
-struct pmu *hw_perf_event_init(struct perf_event *event)
+static int power_pmu_event_init(struct perf_event *event)
 {
 	u64 ev;
 	unsigned long flags;
@@ -1026,25 +1016,27 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 	struct cpu_hw_events *cpuhw;
 
 	if (!ppmu)
-		return ERR_PTR(-ENXIO);
+		return -ENOENT;
+
 	switch (event->attr.type) {
 	case PERF_TYPE_HARDWARE:
 		ev = event->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-			return ERR_PTR(-EOPNOTSUPP);
+			return -EOPNOTSUPP;
 		ev = ppmu->generic_events[ev];
 		break;
 	case PERF_TYPE_HW_CACHE:
 		err = hw_perf_cache_event(event->attr.config, &ev);
 		if (err)
-			return ERR_PTR(err);
+			return err;
 		break;
 	case PERF_TYPE_RAW:
 		ev = event->attr.config;
 		break;
 	default:
-		return ERR_PTR(-EINVAL);
+		return -ENOENT;
 	}
+
 	event->hw.config_base = ev;
 	event->hw.idx = 0;
 
@@ -1081,7 +1073,7 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 			 */
 			ev = normal_pmc_alternative(ev, flags);
 			if (!ev)
-				return ERR_PTR(-EINVAL);
+				return -EINVAL;
 		}
 	}
 
@@ -1095,19 +1087,19 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 		n = collect_events(event->group_leader, ppmu->n_counter - 1,
 				   ctrs, events, cflags);
 		if (n < 0)
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 	}
 	events[n] = ev;
 	ctrs[n] = event;
 	cflags[n] = flags;
 	if (check_excludes(ctrs, cflags, n, 1))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	cpuhw = &get_cpu_var(cpu_hw_events);
 	err = power_check_constraints(cpuhw, events, cflags, n + 1);
 	put_cpu_var(cpu_hw_events);
 	if (err)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	event->hw.config = events[n];
 	event->hw.event_base = cflags[n];
@@ -1132,11 +1124,20 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 	}
 	event->destroy = hw_perf_event_destroy;
 
-	if (err)
-		return ERR_PTR(err);
-	return &power_pmu;
+	return err;
 }
 
+struct pmu power_pmu = {
+	.event_init	= power_pmu_event_init,
+	.enable		= power_pmu_enable,
+	.disable	= power_pmu_disable,
+	.read		= power_pmu_read,
+	.unthrottle	= power_pmu_unthrottle,
+	.start_txn	= power_pmu_start_txn,
+	.cancel_txn	= power_pmu_cancel_txn,
+	.commit_txn	= power_pmu_commit_txn,
+};
+
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -1342,6 +1343,7 @@ int register_power_pmu(struct power_pmu *pmu)
 		freeze_events_kernel = MMCR0_FCHV;
 #endif /* CONFIG_PPC64 */
 
+	perf_pmu_register(&power_pmu);
 	perf_cpu_notifier(power_pmu_notifier);
 
 	return 0;
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c
index d7619b5e7a6..ea6a804e43f 100644
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -378,13 +378,6 @@ static void fsl_emb_pmu_unthrottle(struct perf_event *event)
 	local_irq_restore(flags);
 }
 
-static struct pmu fsl_emb_pmu = {
-	.enable		= fsl_emb_pmu_enable,
-	.disable	= fsl_emb_pmu_disable,
-	.read		= fsl_emb_pmu_read,
-	.unthrottle	= fsl_emb_pmu_unthrottle,
-};
-
 /*
  * Release the PMU if this is the last perf_event.
  */
@@ -428,7 +421,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
 	return 0;
 }
 
-struct pmu *hw_perf_event_init(struct perf_event *event)
+static int fsl_emb_pmu_event_init(struct perf_event *event)
 {
 	u64 ev;
 	struct perf_event *events[MAX_HWEVENTS];
@@ -441,14 +434,14 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 	case PERF_TYPE_HARDWARE:
 		ev = event->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-			return ERR_PTR(-EOPNOTSUPP);
+			return -EOPNOTSUPP;
 		ev = ppmu->generic_events[ev];
 		break;
 
 	case PERF_TYPE_HW_CACHE:
 		err = hw_perf_cache_event(event->attr.config, &ev);
 		if (err)
-			return ERR_PTR(err);
+			return err;
 		break;
 
 	case PERF_TYPE_RAW:
@@ -456,12 +449,12 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 		break;
 
 	default:
-		return ERR_PTR(-EINVAL);
+		return -ENOENT;
 	}
 
 	event->hw.config = ppmu->xlate_event(ev);
 	if (!(event->hw.config & FSL_EMB_EVENT_VALID))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	/*
 	 * If this is in a group, check if it can go on with all the
@@ -473,7 +466,7 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 		n = collect_events(event->group_leader,
 		                   ppmu->n_counter - 1, events);
 		if (n < 0)
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 	}
 
 	if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) {
@@ -484,7 +477,7 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 		}
 
 		if (num_restricted >= ppmu->n_restricted)
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 	}
 
 	event->hw.idx = -1;
@@ -497,7 +490,7 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 	if (event->attr.exclude_kernel)
 		event->hw.config_base |= PMLCA_FCS;
 	if (event->attr.exclude_idle)
-		return ERR_PTR(-ENOTSUPP);
+		return -ENOTSUPP;
 
 	event->hw.last_period = event->hw.sample_period;
 	local64_set(&event->hw.period_left, event->hw.last_period);
@@ -523,11 +516,17 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 	}
 	event->destroy = hw_perf_event_destroy;
 
-	if (err)
-		return ERR_PTR(err);
-	return &fsl_emb_pmu;
+	return err;
 }
 
+static struct pmu fsl_emb_pmu = {
+	.event_init	= fsl_emb_pmu_event_init,
+	.enable		= fsl_emb_pmu_enable,
+	.disable	= fsl_emb_pmu_disable,
+	.read		= fsl_emb_pmu_read,
+	.unthrottle	= fsl_emb_pmu_unthrottle,
+};
+
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -651,5 +650,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
 	pr_info("%s performance monitor hardware support registered\n",
 		pmu->name);
 
+	perf_pmu_register(&fsl_emb_pmu);
+
 	return 0;
 }
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 395572c94c6..8cb206597e0 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -257,26 +257,38 @@ static void sh_pmu_read(struct perf_event *event)
 	sh_perf_event_update(event, &event->hw, event->hw.idx);
 }
 
-static struct pmu pmu = {
-	.enable		= sh_pmu_enable,
-	.disable	= sh_pmu_disable,
-	.read		= sh_pmu_read,
-};
-
-struct pmu *hw_perf_event_init(struct perf_event *event)
+static int sh_pmu_event_init(struct perf_event *event)
 {
-	int err = __hw_perf_event_init(event);
+	int err;
+
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HW_CACHE:
+	case PERF_TYPE_HARDWARE:
+		err = __hw_perf_event_init(event);
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
 	if (unlikely(err)) {
 		if (event->destroy)
 			event->destroy(event);
-		return ERR_PTR(err);
 	}
 
-	return &pmu;
+	return err;
 }
 
+static struct pmu pmu = {
+	.event_init	= sh_pmu_event_init,
+	.enable		= sh_pmu_enable,
+	.disable	= sh_pmu_disable,
+	.read		= sh_pmu_read,
+};
+
 static void sh_pmu_setup(int cpu)
-{
+
 	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
 
 	memset(cpuhw, 0, sizeof(struct cpu_hw_events));
@@ -325,6 +337,7 @@ int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
 
 	WARN_ON(pmu->num_events > MAX_HWEVENTS);
 
+	perf_pmu_register(&pmu);
 	perf_cpu_notifier(sh_pmu_notifier);
 	return 0;
 }
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 481b894a501..bed4327f5a7 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1025,7 +1025,7 @@ out:
 	return ret;
 }
 
-static int __hw_perf_event_init(struct perf_event *event)
+static int sparc_pmu_event_init(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
 	struct perf_event *evts[MAX_HWEVENTS];
@@ -1038,17 +1038,27 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if (atomic_read(&nmi_active) < 0)
 		return -ENODEV;
 
-	if (attr->type == PERF_TYPE_HARDWARE) {
+	switch (attr->type) {
+	case PERF_TYPE_HARDWARE:
 		if (attr->config >= sparc_pmu->max_events)
 			return -EINVAL;
 		pmap = sparc_pmu->event_map(attr->config);
-	} else if (attr->type == PERF_TYPE_HW_CACHE) {
+		break;
+
+	case PERF_TYPE_HW_CACHE:
 		pmap = sparc_map_cache_event(attr->config);
 		if (IS_ERR(pmap))
 			return PTR_ERR(pmap);
-	} else
+		break;
+
+	case PERF_TYPE_RAW:
 		return -EOPNOTSUPP;
 
+	default:
+		return -ENOENT;
+
+	}
+
 	/* We save the enable bits in the config_base.  */
 	hwc->config_base = sparc_pmu->irq_bit;
 	if (!attr->exclude_user)
@@ -1143,6 +1153,7 @@ static int sparc_pmu_commit_txn(struct pmu *pmu)
 }
 
 static struct pmu pmu = {
+	.event_init	= sparc_pmu_event_init,
 	.enable		= sparc_pmu_enable,
 	.disable	= sparc_pmu_disable,
 	.read		= sparc_pmu_read,
@@ -1152,15 +1163,6 @@ static struct pmu pmu = {
 	.commit_txn	= sparc_pmu_commit_txn,
 };
 
-struct pmu *hw_perf_event_init(struct perf_event *event)
-{
-	int err = __hw_perf_event_init(event);
-
-	if (err)
-		return ERR_PTR(err);
-	return &pmu;
-}
-
 void perf_event_print_debug(void)
 {
 	unsigned long flags;
@@ -1280,6 +1282,7 @@ void __init init_hw_perf_events(void)
 	/* All sparc64 PMUs currently have 2 events.  */
 	perf_max_events = 2;
 
+	perf_pmu_register(&pmu);
 	register_die_notifier(&perf_event_nmi_notifier);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index fdd97f2e996..2c89264ee79 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -530,7 +530,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
 /*
  * Setup the hardware configuration for a given attr_type
  */
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
 {
 	int err;
 
@@ -1414,6 +1414,7 @@ void __init init_hw_perf_events(void)
 	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
 	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
+	perf_pmu_register(&pmu);
 	perf_cpu_notifier(x86_pmu_notifier);
 }
 
@@ -1483,18 +1484,6 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
 	return 0;
 }
 
-static struct pmu pmu = {
-	.enable		= x86_pmu_enable,
-	.disable	= x86_pmu_disable,
-	.start		= x86_pmu_start,
-	.stop		= x86_pmu_stop,
-	.read		= x86_pmu_read,
-	.unthrottle	= x86_pmu_unthrottle,
-	.start_txn	= x86_pmu_start_txn,
-	.cancel_txn	= x86_pmu_cancel_txn,
-	.commit_txn	= x86_pmu_commit_txn,
-};
-
 /*
  * validate that we can schedule this event
  */
@@ -1569,12 +1558,22 @@ out:
 	return ret;
 }
 
-struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
 {
 	struct pmu *tmp;
 	int err;
 
-	err = __hw_perf_event_init(event);
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
+	err = __x86_pmu_event_init(event);
 	if (!err) {
 		/*
 		 * we temporarily connect event to its pmu
@@ -1594,12 +1593,24 @@ struct pmu *hw_perf_event_init(struct perf_event *event)
 	if (err) {
 		if (event->destroy)
 			event->destroy(event);
-		return ERR_PTR(err);
 	}
 
-	return &pmu;
+	return err;
 }
 
+static struct pmu pmu = {
+	.event_init	= x86_pmu_event_init,
+	.enable		= x86_pmu_enable,
+	.disable	= x86_pmu_disable,
+	.start		= x86_pmu_start,
+	.stop		= x86_pmu_stop,
+	.read		= x86_pmu_read,
+	.unthrottle	= x86_pmu_unthrottle,
+	.start_txn	= x86_pmu_start_txn,
+	.cancel_txn	= x86_pmu_cancel_txn,
+	.commit_txn	= x86_pmu_commit_txn,
+};
+
 /*
  * callchain support
  */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 09d048b5211..ab72f56eb37 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -561,6 +561,13 @@ struct perf_event;
  * struct pmu - generic performance monitoring unit
  */
 struct pmu {
+	struct list_head		entry;
+
+	/*
+	 * Should return -ENOENT when the @event doesn't match this pmu
+	 */
+	int (*event_init)		(struct perf_event *event);
+
 	int (*enable)			(struct perf_event *event);
 	void (*disable)			(struct perf_event *event);
 	int (*start)			(struct perf_event *event);
@@ -849,7 +856,8 @@ struct perf_output_handle {
  */
 extern int perf_max_events;
 
-extern struct pmu *hw_perf_event_init(struct perf_event *event);
+extern int perf_pmu_register(struct pmu *pmu);
+extern void perf_pmu_unregister(struct pmu *pmu);
 
 extern void perf_event_task_sched_in(struct task_struct *task);
 extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index d71a987fd2b..e9c5cfa1fd2 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -565,6 +565,34 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
 	.priority = 0x7fffffff
 };
 
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+	release_bp_slot(event);
+}
+
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+	int err;
+
+	if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+		return -ENOENT;
+
+	err = register_perf_hw_breakpoint(bp);
+	if (err)
+		return err;
+
+	bp->destroy = bp_perf_event_destroy;
+
+	return 0;
+}
+
+static struct pmu perf_breakpoint = {
+	.event_init	= hw_breakpoint_event_init,
+	.enable		= arch_install_hw_breakpoint,
+	.disable	= arch_uninstall_hw_breakpoint,
+	.read		= hw_breakpoint_pmu_read,
+};
+
 static int __init init_hw_breakpoint(void)
 {
 	unsigned int **task_bp_pinned;
@@ -586,6 +614,8 @@ static int __init init_hw_breakpoint(void)
 
 	constraints_initialized = 1;
 
+	perf_pmu_register(&perf_breakpoint);
+
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
 
  err_alloc:
@@ -601,8 +631,3 @@ static int __init init_hw_breakpoint(void)
 core_initcall(init_hw_breakpoint);
 
 
-struct pmu perf_ops_bp = {
-	.enable		= arch_install_hw_breakpoint,
-	.disable	= arch_uninstall_hw_breakpoint,
-	.read		= hw_breakpoint_pmu_read,
-};
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fb46fd13f31..288ce43de57 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,7 +31,6 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
@@ -72,14 +71,6 @@ static atomic64_t perf_event_id;
  */
 static DEFINE_SPINLOCK(perf_resource_lock);
 
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak struct pmu *hw_perf_event_init(struct perf_event *event)
-{
-	return NULL;
-}
-
 void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
@@ -4501,182 +4492,6 @@ static int perf_swevent_int(struct perf_event *event)
 	return 0;
 }
 
-static struct pmu perf_ops_generic = {
-	.enable		= perf_swevent_enable,
-	.disable	= perf_swevent_disable,
-	.start		= perf_swevent_int,
-	.stop		= perf_swevent_void,
-	.read		= perf_swevent_read,
-	.unthrottle	= perf_swevent_void, /* hwc->interrupts already reset */
-};
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
-{
-	enum hrtimer_restart ret = HRTIMER_RESTART;
-	struct perf_sample_data data;
-	struct pt_regs *regs;
-	struct perf_event *event;
-	u64 period;
-
-	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-	event->pmu->read(event);
-
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
-	regs = get_irq_regs();
-
-	if (regs && !perf_exclude_event(event, regs)) {
-		if (!(event->attr.exclude_idle && current->pid == 0))
-			if (perf_event_overflow(event, 0, &data, regs))
-				ret = HRTIMER_NORESTART;
-	}
-
-	period = max_t(u64, 10000, event->hw.sample_period);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
-	return ret;
-}
-
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-	if (hwc->sample_period) {
-		u64 period;
-
-		if (hwc->remaining) {
-			if (hwc->remaining < 0)
-				period = 10000;
-			else
-				period = hwc->remaining;
-			hwc->remaining = 0;
-		} else {
-			period = max_t(u64, 10000, hwc->sample_period);
-		}
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
-	}
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (hwc->sample_period) {
-		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
-		hwc->remaining = ktime_to_ns(remaining);
-
-		hrtimer_cancel(&hwc->hrtimer);
-	}
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_perf_event_update(struct perf_event *event)
-{
-	int cpu = raw_smp_processor_id();
-	s64 prev;
-	u64 now;
-
-	now = cpu_clock(cpu);
-	prev = local64_xchg(&event->hw.prev_count, now);
-	local64_add(now - prev, &event->count);
-}
-
-static int cpu_clock_perf_event_enable(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	int cpu = raw_smp_processor_id();
-
-	local64_set(&hwc->prev_count, cpu_clock(cpu));
-	perf_swevent_start_hrtimer(event);
-
-	return 0;
-}
-
-static void cpu_clock_perf_event_disable(struct perf_event *event)
-{
-	perf_swevent_cancel_hrtimer(event);
-	cpu_clock_perf_event_update(event);
-}
-
-static void cpu_clock_perf_event_read(struct perf_event *event)
-{
-	cpu_clock_perf_event_update(event);
-}
-
-static struct pmu perf_ops_cpu_clock = {
-	.enable		= cpu_clock_perf_event_enable,
-	.disable	= cpu_clock_perf_event_disable,
-	.read		= cpu_clock_perf_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_perf_event_update(struct perf_event *event, u64 now)
-{
-	u64 prev;
-	s64 delta;
-
-	prev = local64_xchg(&event->hw.prev_count, now);
-	delta = now - prev;
-	local64_add(delta, &event->count);
-}
-
-static int task_clock_perf_event_enable(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	u64 now;
-
-	now = event->ctx->time;
-
-	local64_set(&hwc->prev_count, now);
-
-	perf_swevent_start_hrtimer(event);
-
-	return 0;
-}
-
-static void task_clock_perf_event_disable(struct perf_event *event)
-{
-	perf_swevent_cancel_hrtimer(event);
-	task_clock_perf_event_update(event, event->ctx->time);
-
-}
-
-static void task_clock_perf_event_read(struct perf_event *event)
-{
-	u64 time;
-
-	if (!in_nmi()) {
-		update_context_time(event->ctx);
-		time = event->ctx->time;
-	} else {
-		u64 now = perf_clock();
-		u64 delta = now - event->ctx->timestamp;
-		time = event->ctx->time + delta;
-	}
-
-	task_clock_perf_event_update(event, time);
-}
-
-static struct pmu perf_ops_task_clock = {
-	.enable		= task_clock_perf_event_enable,
-	.disable	= task_clock_perf_event_disable,
-	.read		= task_clock_perf_event_read,
-};
-
 /* Deref the hlist from the update side */
 static inline struct swevent_hlist *
 swevent_hlist_deref(struct perf_cpu_context *cpuctx)
@@ -4783,17 +4598,63 @@ static int swevent_hlist_get(struct perf_event *event)
 	return err;
 }
 
-#ifdef CONFIG_EVENT_TRACING
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
-static struct pmu perf_ops_tracepoint = {
-	.enable		= perf_trace_enable,
-	.disable	= perf_trace_disable,
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+	u64 event_id = event->attr.config;
+
+	WARN_ON(event->parent);
+
+	atomic_dec(&perf_swevent_enabled[event_id]);
+	swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+	int event_id = event->attr.config;
+
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	switch (event_id) {
+	case PERF_COUNT_SW_CPU_CLOCK:
+	case PERF_COUNT_SW_TASK_CLOCK:
+		return -ENOENT;
+
+	default:
+		break;
+	}
+
+	if (event_id > PERF_COUNT_SW_MAX)
+		return -ENOENT;
+
+	if (!event->parent) {
+		int err;
+
+		err = swevent_hlist_get(event);
+		if (err)
+			return err;
+
+		atomic_inc(&perf_swevent_enabled[event_id]);
+		event->destroy = sw_perf_event_destroy;
+	}
+
+	return 0;
+}
+
+static struct pmu perf_swevent = {
+	.event_init	= perf_swevent_init,
+	.enable		= perf_swevent_enable,
+	.disable	= perf_swevent_disable,
 	.start		= perf_swevent_int,
 	.stop		= perf_swevent_void,
 	.read		= perf_swevent_read,
-	.unthrottle	= perf_swevent_void,
+	.unthrottle	= perf_swevent_void, /* hwc->interrupts already reset */
 };
 
+#ifdef CONFIG_EVENT_TRACING
+
 static int perf_tp_filter_match(struct perf_event *event,
 				struct perf_sample_data *data)
 {
@@ -4849,10 +4710,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
 	perf_trace_destroy(event);
 }
 
-static struct pmu *tp_perf_event_init(struct perf_event *event)
+static int perf_tp_event_init(struct perf_event *event)
 {
 	int err;
 
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -ENOENT;
+
 	/*
 	 * Raw tracepoint data is a severe data leak, only allow root to
 	 * have these.
@@ -4860,15 +4724,30 @@ static struct pmu *tp_perf_event_init(struct perf_event *event)
 	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
 			perf_paranoid_tracepoint_raw() &&
 			!capable(CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
+		return -EPERM;
 
 	err = perf_trace_init(event);
 	if (err)
-		return NULL;
+		return err;
 
 	event->destroy = tp_perf_event_destroy;
 
-	return &perf_ops_tracepoint;
+	return 0;
+}
+
+static struct pmu perf_tracepoint = {
+	.event_init	= perf_tp_event_init,
+	.enable		= perf_trace_enable,
+	.disable	= perf_trace_disable,
+	.start		= perf_swevent_int,
+	.stop		= perf_swevent_void,
+	.read		= perf_swevent_read,
+	.unthrottle	= perf_swevent_void,
+};
+
+static inline void perf_tp_register(void)
+{
+	perf_pmu_register(&perf_tracepoint);
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4896,9 +4775,8 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static struct pmu *tp_perf_event_init(struct perf_event *event)
+static inline void perf_tp_register(void)
 {
-	return NULL;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4913,105 +4791,247 @@ static void perf_event_free_filter(struct perf_event *event)
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-static void bp_perf_event_destroy(struct perf_event *event)
+void perf_bp_event(struct perf_event *bp, void *data)
 {
-	release_bp_slot(event);
+	struct perf_sample_data sample;
+	struct pt_regs *regs = data;
+
+	perf_sample_data_init(&sample, bp->attr.bp_addr);
+
+	if (!perf_exclude_event(bp, regs))
+		perf_swevent_add(bp, 1, 1, &sample, regs);
 }
+#endif
+
+/*
+ * hrtimer based swevent callback
+ */
 
-static struct pmu *bp_perf_event_init(struct perf_event *bp)
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 {
-	int err;
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct pt_regs *regs;
+	struct perf_event *event;
+	u64 period;
 
-	err = register_perf_hw_breakpoint(bp);
-	if (err)
-		return ERR_PTR(err);
+	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+	event->pmu->read(event);
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+	regs = get_irq_regs();
+
+	if (regs && !perf_exclude_event(event, regs)) {
+		if (!(event->attr.exclude_idle && current->pid == 0))
+			if (perf_event_overflow(event, 0, &data, regs))
+				ret = HRTIMER_NORESTART;
+	}
 
-	bp->destroy = bp_perf_event_destroy;
+	period = max_t(u64, 10000, event->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 
-	return &perf_ops_bp;
+	return ret;
 }
 
-void perf_bp_event(struct perf_event *bp, void *data)
+static void perf_swevent_start_hrtimer(struct perf_event *event)
 {
-	struct perf_sample_data sample;
-	struct pt_regs *regs = data;
+	struct hw_perf_event *hwc = &event->hw;
 
-	perf_sample_data_init(&sample, bp->attr.bp_addr);
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+	if (hwc->sample_period) {
+		u64 period;
 
-	if (!perf_exclude_event(bp, regs))
-		perf_swevent_add(bp, 1, 1, &sample, regs);
+		if (hwc->remaining) {
+			if (hwc->remaining < 0)
+				period = 10000;
+			else
+				period = hwc->remaining;
+			hwc->remaining = 0;
+		} else {
+			period = max_t(u64, 10000, hwc->sample_period);
+		}
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
 }
-#else
-static struct pmu *bp_perf_event_init(struct perf_event *bp)
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 {
-	return NULL;
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->sample_period) {
+		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+		hwc->remaining = ktime_to_ns(remaining);
+
+		hrtimer_cancel(&hwc->hrtimer);
+	}
 }
 
-void perf_bp_event(struct perf_event *bp, void *regs)
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
 {
+	int cpu = raw_smp_processor_id();
+	s64 prev;
+	u64 now;
+
+	now = cpu_clock(cpu);
+	prev = local64_xchg(&event->hw.prev_count, now);
+	local64_add(now - prev, &event->count);
 }
-#endif
 
-atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+static int cpu_clock_event_enable(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int cpu = raw_smp_processor_id();
 
-static void sw_perf_event_destroy(struct perf_event *event)
+	local64_set(&hwc->prev_count, cpu_clock(cpu));
+	perf_swevent_start_hrtimer(event);
+
+	return 0;
+}
+
+static void cpu_clock_event_disable(struct perf_event *event)
 {
-	u64 event_id = event->attr.config;
+	perf_swevent_cancel_hrtimer(event);
+	cpu_clock_event_update(event);
+}
 
-	WARN_ON(event->parent);
+static void cpu_clock_event_read(struct perf_event *event)
+{
+	cpu_clock_event_update(event);
+}
 
-	atomic_dec(&perf_swevent_enabled[event_id]);
-	swevent_hlist_put(event);
+static int cpu_clock_event_init(struct perf_event *event)
+{
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+		return -ENOENT;
+
+	return 0;
 }
 
-static struct pmu *sw_perf_event_init(struct perf_event *event)
+static struct pmu perf_cpu_clock = {
+	.event_init	= cpu_clock_event_init,
+	.enable		= cpu_clock_event_enable,
+	.disable	= cpu_clock_event_disable,
+	.read		= cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
 {
-	struct pmu *pmu = NULL;
-	u64 event_id = event->attr.config;
+	u64 prev;
+	s64 delta;
 
-	/*
-	 * Software events (currently) can't in general distinguish
-	 * between user, kernel and hypervisor events.
-	 * However, context switches and cpu migrations are considered
-	 * to be kernel events, and page faults are never hypervisor
-	 * events.
-	 */
-	switch (event_id) {
-	case PERF_COUNT_SW_CPU_CLOCK:
-		pmu = &perf_ops_cpu_clock;
+	prev = local64_xchg(&event->hw.prev_count, now);
+	delta = now - prev;
+	local64_add(delta, &event->count);
+}
 
-		break;
-	case PERF_COUNT_SW_TASK_CLOCK:
-		/*
-		 * If the user instantiates this as a per-cpu event,
-		 * use the cpu_clock event instead.
-		 */
-		if (event->ctx->task)
-			pmu = &perf_ops_task_clock;
-		else
-			pmu = &perf_ops_cpu_clock;
+static int task_clock_event_enable(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 now;
 
-		break;
-	case PERF_COUNT_SW_PAGE_FAULTS:
-	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
-	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
-	case PERF_COUNT_SW_CONTEXT_SWITCHES:
-	case PERF_COUNT_SW_CPU_MIGRATIONS:
-	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
-	case PERF_COUNT_SW_EMULATION_FAULTS:
-		if (!event->parent) {
-			int err;
-
-			err = swevent_hlist_get(event);
-			if (err)
-				return ERR_PTR(err);
+	now = event->ctx->time;
 
-			atomic_inc(&perf_swevent_enabled[event_id]);
-			event->destroy = sw_perf_event_destroy;
+	local64_set(&hwc->prev_count, now);
+
+	perf_swevent_start_hrtimer(event);
+
+	return 0;
+}
+
+static void task_clock_event_disable(struct perf_event *event)
+{
+	perf_swevent_cancel_hrtimer(event);
+	task_clock_event_update(event, event->ctx->time);
+
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+	u64 time;
+
+	if (!in_nmi()) {
+		update_context_time(event->ctx);
+		time = event->ctx->time;
+	} else {
+		u64 now = perf_clock();
+		u64 delta = now - event->ctx->timestamp;
+		time = event->ctx->time + delta;
+	}
+
+	task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+		return -ENOENT;
+
+	return 0;
+}
+
+static struct pmu perf_task_clock = {
+	.event_init	= task_clock_event_init,
+	.enable		= task_clock_event_enable,
+	.disable	= task_clock_event_disable,
+	.read		= task_clock_event_read,
+};
+
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
+int perf_pmu_register(struct pmu *pmu)
+{
+	mutex_lock(&pmus_lock);
+	list_add_rcu(&pmu->entry, &pmus);
+	mutex_unlock(&pmus_lock);
+
+	return 0;
+}
+
+void perf_pmu_unregister(struct pmu *pmu)
+{
+	mutex_lock(&pmus_lock);
+	list_del_rcu(&pmu->entry);
+	mutex_unlock(&pmus_lock);
+
+	synchronize_srcu(&pmus_srcu);
+}
+
+struct pmu *perf_init_event(struct perf_event *event)
+{
+	struct pmu *pmu = NULL;
+	int idx;
+
+	idx = srcu_read_lock(&pmus_srcu);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		int ret = pmu->event_init(event);
+		if (!ret)
+			break;
+		if (ret != -ENOENT) {
+			pmu = ERR_PTR(ret);
+			break;
 		}
-		pmu = &perf_ops_generic;
-		break;
 	}
+	srcu_read_unlock(&pmus_srcu, idx);
 
 	return pmu;
 }
@@ -5092,29 +5112,8 @@ perf_event_alloc(struct perf_event_attr *attr,
 	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto done;
 
-	switch (attr->type) {
-	case PERF_TYPE_RAW:
-	case PERF_TYPE_HARDWARE:
-	case PERF_TYPE_HW_CACHE:
-		pmu = hw_perf_event_init(event);
-		break;
-
-	case PERF_TYPE_SOFTWARE:
-		pmu = sw_perf_event_init(event);
-		break;
-
-	case PERF_TYPE_TRACEPOINT:
-		pmu = tp_perf_event_init(event);
-		break;
+	pmu = perf_init_event(event);
 
-	case PERF_TYPE_BREAKPOINT:
-		pmu = bp_perf_event_init(event);
-		break;
-
-
-	default:
-		break;
-	}
 done:
 	err = 0;
 	if (!pmu)
@@ -5979,22 +5978,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
-	.notifier_call		= perf_cpu_notify,
-	.priority		= 20,
-};
-
 void __init perf_event_init(void)
 {
 	perf_event_init_all_cpus();
-	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
-	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
-			(void *)(long)smp_processor_id());
-	register_cpu_notifier(&perf_cpu_nb);
+	init_srcu_struct(&pmus_srcu);
+	perf_pmu_register(&perf_swevent);
+	perf_pmu_register(&perf_cpu_clock);
+	perf_pmu_register(&perf_task_clock);
+	perf_tp_register();
+	perf_cpu_notifier(perf_cpu_notify);
 }
 
 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
-- 
cgit v1.2.3-70-g09d2


From 9ed6060d286b1eb55974d09080f442f809408c42 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Jun 2010 17:36:35 +0200
Subject: perf: Unindent labels

Fixup random annoying style bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 288ce43de57..149ca18371b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -147,7 +147,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 	struct perf_event_context *ctx;
 
 	rcu_read_lock();
- retry:
+retry:
 	ctx = rcu_dereference(task->perf_event_ctxp);
 	if (ctx) {
 		/*
@@ -619,7 +619,7 @@ void perf_event_disable(struct perf_event *event)
 		return;
 	}
 
- retry:
+retry:
 	task_oncpu_function_call(task, __perf_event_disable, event);
 
 	raw_spin_lock_irq(&ctx->lock);
@@ -849,7 +849,7 @@ static void __perf_install_in_context(void *info)
 	if (!err && !ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
- unlock:
+unlock:
 	perf_enable();
 
 	raw_spin_unlock(&ctx->lock);
@@ -922,10 +922,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	event->tstamp_enabled = ctx->time - event->total_time_enabled;
-	list_for_each_entry(sub, &event->sibling_list, group_entry)
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 			sub->tstamp_enabled =
 				ctx->time - sub->total_time_enabled;
+		}
+	}
 }
 
 /*
@@ -991,7 +993,7 @@ static void __perf_event_enable(void *info)
 		}
 	}
 
- unlock:
+unlock:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1032,7 +1034,7 @@ void perf_event_enable(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_ERROR)
 		event->state = PERF_EVENT_STATE_OFF;
 
- retry:
+retry:
 	raw_spin_unlock_irq(&ctx->lock);
 	task_oncpu_function_call(task, __perf_event_enable, event);
 
@@ -1052,7 +1054,7 @@ void perf_event_enable(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_OFF)
 		__perf_event_mark_enabled(event, ctx);
 
- out:
+out:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -1092,17 +1094,19 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	if (!ctx->nr_active)
 		goto out_enable;
 
-	if (event_type & EVENT_PINNED)
+	if (event_type & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
+	}
 
-	if (event_type & EVENT_FLEXIBLE)
+	if (event_type & EVENT_FLEXIBLE) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
+	}
 
  out_enable:
 	perf_enable();
- out:
+out:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1341,9 +1345,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
-		if (group_can_go_on(event, cpuctx, can_add_hw))
+		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
+		}
 	}
 }
 
@@ -1373,7 +1378,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 		ctx_flexible_sched_in(ctx, cpuctx);
 
 	perf_enable();
- out:
+out:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1714,7 +1719,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 	raw_spin_unlock(&ctx->lock);
 
 	perf_event_task_sched_in(task);
- out:
+out:
 	local_irq_restore(flags);
 }
 
@@ -2053,7 +2058,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto errout;
 
- retry:
+retry:
 	ctx = perf_lock_task_context(task, &flags);
 	if (ctx) {
 		unclone_ctx(ctx);
@@ -2081,7 +2086,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
 	put_task_struct(task);
 	return ctx;
 
- errout:
+errout:
 	put_task_struct(task);
 	return ERR_PTR(err);
 }
@@ -3264,7 +3269,7 @@ again:
 	if (handle->wakeup != local_read(&buffer->wakeup))
 		perf_output_wakeup(handle);
 
- out:
+out:
 	preempt_enable();
 }
 
@@ -4562,7 +4567,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
 	}
 	cpuctx->hlist_refcount++;
- exit:
+exit:
 	mutex_unlock(&cpuctx->hlist_mutex);
 
 	return err;
@@ -4587,7 +4592,7 @@ static int swevent_hlist_get(struct perf_event *event)
 	put_online_cpus();
 
 	return 0;
- fail:
+fail:
 	for_each_possible_cpu(cpu) {
 		if (cpu == failed_cpu)
 			break;
-- 
cgit v1.2.3-70-g09d2


From 24cd7f54a0d47e1d5b3de29e2456bfbd2d8447b7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Jun 2010 17:32:03 +0200
Subject: perf: Reduce perf_disable() usage

Since the current perf_disable() usage is only an optimization,
remove it for now. This eases the removal of the __weak
hw_perf_enable() interface.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/kernel/perf_event.c             |  3 +++
 arch/powerpc/kernel/perf_event.c         |  3 +++
 arch/powerpc/kernel/perf_event_fsl_emb.c |  8 +++++--
 arch/sh/kernel/perf_event.c              | 11 +++++++---
 arch/sparc/kernel/perf_event.c           |  3 +++
 arch/x86/kernel/cpu/perf_event.c         | 22 ++++++++++++-------
 include/linux/perf_event.h               | 20 ++++++++---------
 kernel/perf_event.c                      | 37 +-------------------------------
 8 files changed, 48 insertions(+), 59 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index f62f9db35db..afc92c580d1 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -277,6 +277,8 @@ armpmu_enable(struct perf_event *event)
 	int idx;
 	int err = 0;
 
+	perf_disable();
+
 	/* If we don't have a space for the counter then finish early. */
 	idx = armpmu->get_event_idx(cpuc, hwc);
 	if (idx < 0) {
@@ -303,6 +305,7 @@ armpmu_enable(struct perf_event *event)
 	perf_event_update_userpage(event);
 
 out:
+	perf_enable();
 	return err;
 }
 
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 19131b2614b..c1408821dbc 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -861,6 +861,7 @@ void power_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
+	perf_disable();
 	cpuhw->group_flag |= PERF_EVENT_TXN;
 	cpuhw->n_txn_start = cpuhw->n_events;
 }
@@ -875,6 +876,7 @@ void power_pmu_cancel_txn(struct pmu *pmu)
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
+	perf_enable();
 }
 
 /*
@@ -901,6 +903,7 @@ int power_pmu_commit_txn(struct pmu *pmu)
 		cpuhw->event[i]->hw.config = cpuhw->events[i];
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
+	perf_enable();
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c
index ea6a804e43f..9bc84a7fd90 100644
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -262,7 +262,7 @@ static int collect_events(struct perf_event *group, int max_count,
 	return n;
 }
 
-/* perf must be disabled, context locked on entry */
+/* context locked on entry */
 static int fsl_emb_pmu_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuhw;
@@ -271,6 +271,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 	u64 val;
 	int i;
 
+	perf_disable();
 	cpuhw = &get_cpu_var(cpu_hw_events);
 
 	if (event->hw.config & FSL_EMB_EVENT_RESTRICTED)
@@ -310,15 +311,17 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 	ret = 0;
  out:
 	put_cpu_var(cpu_hw_events);
+	perf_enable();
 	return ret;
 }
 
-/* perf must be disabled, context locked on entry */
+/* context locked on entry */
 static void fsl_emb_pmu_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuhw;
 	int i = event->hw.idx;
 
+	perf_disable();
 	if (i < 0)
 		goto out;
 
@@ -346,6 +349,7 @@ static void fsl_emb_pmu_disable(struct perf_event *event)
 	cpuhw->n_events--;
 
  out:
+	perf_enable();
 	put_cpu_var(cpu_hw_events);
 }
 
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 8cb206597e0..d042989ceb4 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -230,11 +230,14 @@ static int sh_pmu_enable(struct perf_event *event)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
+	int ret = -EAGAIN;
+
+	perf_disable();
 
 	if (test_and_set_bit(idx, cpuc->used_mask)) {
 		idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events);
 		if (idx == sh_pmu->num_events)
-			return -EAGAIN;
+			goto out;
 
 		set_bit(idx, cpuc->used_mask);
 		hwc->idx = idx;
@@ -248,8 +251,10 @@ static int sh_pmu_enable(struct perf_event *event)
 	sh_pmu->enable(hwc, idx);
 
 	perf_event_update_userpage(event);
-
-	return 0;
+	ret = 0;
+out:
+	perf_enable();
+	return ret;
 }
 
 static void sh_pmu_read(struct perf_event *event)
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index bed4327f5a7..d0131deeeaf 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1113,6 +1113,7 @@ static void sparc_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
+	perf_disable();
 	cpuhw->group_flag |= PERF_EVENT_TXN;
 }
 
@@ -1126,6 +1127,7 @@ static void sparc_pmu_cancel_txn(struct pmu *pmu)
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
+	perf_enable();
 }
 
 /*
@@ -1149,6 +1151,7 @@ static int sparc_pmu_commit_txn(struct pmu *pmu)
 		return -EAGAIN;
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN;
+	perf_enable();
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2c89264ee79..846070ce49c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -969,10 +969,11 @@ static int x86_pmu_enable(struct perf_event *event)
 
 	hwc = &event->hw;
 
+	perf_disable();
 	n0 = cpuc->n_events;
-	n = collect_events(cpuc, event, false);
-	if (n < 0)
-		return n;
+	ret = n = collect_events(cpuc, event, false);
+	if (ret < 0)
+		goto out;
 
 	/*
 	 * If group events scheduling transaction was started,
@@ -980,23 +981,26 @@ static int x86_pmu_enable(struct perf_event *event)
 	 * at commit time(->commit_txn) as a whole
 	 */
 	if (cpuc->group_flag & PERF_EVENT_TXN)
-		goto out;
+		goto done_collect;
 
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
 	if (ret)
-		return ret;
+		goto out;
 	/*
 	 * copy new assignment, now we know it is possible
 	 * will be used by hw_perf_enable()
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
-out:
+done_collect:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
 	cpuc->n_txn += n - n0;
 
-	return 0;
+	ret = 0;
+out:
+	perf_enable();
+	return ret;
 }
 
 static int x86_pmu_start(struct perf_event *event)
@@ -1432,6 +1436,7 @@ static void x86_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
+	perf_disable();
 	cpuc->group_flag |= PERF_EVENT_TXN;
 	cpuc->n_txn = 0;
 }
@@ -1451,6 +1456,7 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
 	 */
 	cpuc->n_added -= cpuc->n_txn;
 	cpuc->n_events -= cpuc->n_txn;
+	perf_enable();
 }
 
 /*
@@ -1480,7 +1486,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN;
-
+	perf_enable();
 	return 0;
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ab72f56eb37..243286a8ded 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -564,26 +564,26 @@ struct pmu {
 	struct list_head		entry;
 
 	/*
-	 * Should return -ENOENT when the @event doesn't match this pmu
+	 * Should return -ENOENT when the @event doesn't match this PMU.
 	 */
 	int (*event_init)		(struct perf_event *event);
 
-	int (*enable)			(struct perf_event *event);
+	int  (*enable)			(struct perf_event *event);
 	void (*disable)			(struct perf_event *event);
-	int (*start)			(struct perf_event *event);
+	int  (*start)			(struct perf_event *event);
 	void (*stop)			(struct perf_event *event);
 	void (*read)			(struct perf_event *event);
 	void (*unthrottle)		(struct perf_event *event);
 
 	/*
-	 * Group events scheduling is treated as a transaction, add group
-	 * events as a whole and perform one schedulability test. If the test
-	 * fails, roll back the whole group
+	 * Group events scheduling is treated as a transaction, add
+	 * group events as a whole and perform one schedulability test.
+	 * If the test fails, roll back the whole group
 	 */
 
 	/*
-	 * Start the transaction, after this ->enable() doesn't need
-	 * to do schedulability tests.
+	 * Start the transaction, after this ->enable() doesn't need to
+	 * do schedulability tests.
 	 */
 	void (*start_txn)	(struct pmu *pmu);
 	/*
@@ -594,8 +594,8 @@ struct pmu {
 	 */
 	int  (*commit_txn)	(struct pmu *pmu);
 	/*
-	 * Will cancel the transaction, assumes ->disable() is called for
-	 * each successfull ->enable() during the transaction.
+	 * Will cancel the transaction, assumes ->disable() is called
+	 * for each successfull ->enable() during the transaction.
 	 */
 	void (*cancel_txn)	(struct pmu *pmu);
 };
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 149ca18371b..9a98ce95356 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -478,11 +478,6 @@ static void __perf_event_remove_from_context(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * events on a global level.
-	 */
-	perf_disable();
 
 	event_sched_out(event, cpuctx, ctx);
 
@@ -498,7 +493,6 @@ static void __perf_event_remove_from_context(void *info)
 			    perf_max_events - perf_reserved_percpu);
 	}
 
-	perf_enable();
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -803,12 +797,6 @@ static void __perf_install_in_context(void *info)
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * events on a global level. NOP for non NMI based events.
-	 */
-	perf_disable();
-
 	add_event_to_ctx(event, ctx);
 
 	if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -850,8 +838,6 @@ static void __perf_install_in_context(void *info)
 		cpuctx->max_pertask--;
 
 unlock:
-	perf_enable();
-
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -972,12 +958,10 @@ static void __perf_event_enable(void *info)
 	if (!group_can_go_on(event, cpuctx, 1)) {
 		err = -EEXIST;
 	} else {
-		perf_disable();
 		if (event == leader)
 			err = group_sched_in(event, cpuctx, ctx);
 		else
 			err = event_sched_in(event, cpuctx, ctx);
-		perf_enable();
 	}
 
 	if (err) {
@@ -1090,9 +1074,8 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 		goto out;
 	update_context_time(ctx);
 
-	perf_disable();
 	if (!ctx->nr_active)
-		goto out_enable;
+		goto out;
 
 	if (event_type & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
@@ -1103,9 +1086,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
-
- out_enable:
-	perf_enable();
 out:
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1364,8 +1344,6 @@ ctx_sched_in(struct perf_event_context *ctx,
 
 	ctx->timestamp = perf_clock();
 
-	perf_disable();
-
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -1377,7 +1355,6 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (event_type & EVENT_FLEXIBLE)
 		ctx_flexible_sched_in(ctx, cpuctx);
 
-	perf_enable();
 out:
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1425,8 +1402,6 @@ void perf_event_task_sched_in(struct task_struct *task)
 	if (cpuctx->task_ctx == ctx)
 		return;
 
-	perf_disable();
-
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -1439,8 +1414,6 @@ void perf_event_task_sched_in(struct task_struct *task)
 	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 
 	cpuctx->task_ctx = ctx;
-
-	perf_enable();
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1555,11 +1528,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 	hwc->sample_period = sample_period;
 
 	if (local64_read(&hwc->period_left) > 8*sample_period) {
-		perf_disable();
 		perf_event_stop(event);
 		local64_set(&hwc->period_left, 0);
 		perf_event_start(event);
-		perf_enable();
 	}
 }
 
@@ -1588,15 +1559,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		 */
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
-			perf_disable();
 			event->pmu->unthrottle(event);
-			perf_enable();
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
-		perf_disable();
 		event->pmu->read(event);
 		now = local64_read(&event->count);
 		delta = now - hwc->freq_count_stamp;
@@ -1604,7 +1572,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 
 		if (delta > 0)
 			perf_adjust_period(event, TICK_NSEC, delta);
-		perf_enable();
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1647,7 +1614,6 @@ void perf_event_task_tick(struct task_struct *curr)
 	if (!rotate)
 		return;
 
-	perf_disable();
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
 		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1659,7 +1625,6 @@ void perf_event_task_tick(struct task_struct *curr)
 	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
 		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
-	perf_enable();
 }
 
 static int event_enable_on_exec(struct perf_event *event,
-- 
cgit v1.2.3-70-g09d2


From 33696fc0d141bbbcb12f75b69608ea83282e3117 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 14 Jun 2010 08:49:00 +0200
Subject: perf: Per PMU disable

Changes perf_disable() into perf_pmu_disable().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/perf_event.c           | 30 +++++++++++++------------
 arch/arm/kernel/perf_event.c             | 28 +++++++++++------------
 arch/powerpc/kernel/perf_event.c         | 24 +++++++++++---------
 arch/powerpc/kernel/perf_event_fsl_emb.c | 18 ++++++++-------
 arch/sh/kernel/perf_event.c              | 38 +++++++++++++++++---------------
 arch/sparc/kernel/perf_event.c           | 20 +++++++++--------
 arch/x86/kernel/cpu/perf_event.c         | 16 ++++++++------
 include/linux/perf_event.h               | 13 ++++++-----
 kernel/perf_event.c                      | 31 ++++++++++++++++----------
 9 files changed, 119 insertions(+), 99 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 19660b5c298..3e260731f8e 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -435,7 +435,7 @@ static int alpha_pmu_enable(struct perf_event *event)
 	 * nevertheless we disable the PMCs first to enable a potential
 	 * final PMI to occur before we disable interrupts.
 	 */
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 	local_irq_save(flags);
 
 	/* Default to error to be returned */
@@ -456,7 +456,7 @@ static int alpha_pmu_enable(struct perf_event *event)
 	}
 
 	local_irq_restore(flags);
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 
 	return ret;
 }
@@ -474,7 +474,7 @@ static void alpha_pmu_disable(struct perf_event *event)
 	unsigned long flags;
 	int j;
 
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 	local_irq_save(flags);
 
 	for (j = 0; j < cpuc->n_events; j++) {
@@ -502,7 +502,7 @@ static void alpha_pmu_disable(struct perf_event *event)
 	}
 
 	local_irq_restore(flags);
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 }
 
 
@@ -668,18 +668,10 @@ static int alpha_pmu_event_init(struct perf_event *event)
 	return err;
 }
 
-static struct pmu pmu = {
-	.event_init	= alpha_pmu_event_init,
-	.enable		= alpha_pmu_enable,
-	.disable	= alpha_pmu_disable,
-	.read		= alpha_pmu_read,
-	.unthrottle	= alpha_pmu_unthrottle,
-};
-
 /*
  * Main entry point - enable HW performance counters.
  */
-void hw_perf_enable(void)
+static void alpha_pmu_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -705,7 +697,7 @@ void hw_perf_enable(void)
  * Main entry point - disable HW performance counters.
  */
 
-void hw_perf_disable(void)
+static void alpha_pmu_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -718,6 +710,16 @@ void hw_perf_disable(void)
 	wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
 }
 
+static struct pmu pmu = {
+	.pmu_enable	= alpha_pmu_pmu_enable,
+	.pmu_disable	= alpha_pmu_pmu_disable,
+	.event_init	= alpha_pmu_event_init,
+	.enable		= alpha_pmu_enable,
+	.disable	= alpha_pmu_disable,
+	.read		= alpha_pmu_read,
+	.unthrottle	= alpha_pmu_unthrottle,
+};
+
 
 /*
  * Main entry point - don't know when this is called but it
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index afc92c580d1..3343f3f4b97 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -277,7 +277,7 @@ armpmu_enable(struct perf_event *event)
 	int idx;
 	int err = 0;
 
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	/* If we don't have a space for the counter then finish early. */
 	idx = armpmu->get_event_idx(cpuc, hwc);
@@ -305,7 +305,7 @@ armpmu_enable(struct perf_event *event)
 	perf_event_update_userpage(event);
 
 out:
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	return err;
 }
 
@@ -534,16 +534,7 @@ static int armpmu_event_init(struct perf_event *event)
 	return err;
 }
 
-static struct pmu pmu = {
-	.event_init = armpmu_event_init,
-	.enable	    = armpmu_enable,
-	.disable    = armpmu_disable,
-	.unthrottle = armpmu_unthrottle,
-	.read	    = armpmu_read,
-};
-
-void
-hw_perf_enable(void)
+static void armpmu_pmu_enable(struct pmu *pmu)
 {
 	/* Enable all of the perf events on hardware. */
 	int idx;
@@ -564,13 +555,22 @@ hw_perf_enable(void)
 	armpmu->start();
 }
 
-void
-hw_perf_disable(void)
+static void armpmu_pmu_disable(struct pmu *pmu)
 {
 	if (armpmu)
 		armpmu->stop();
 }
 
+static struct pmu pmu = {
+	.pmu_enable = armpmu_pmu_enable,
+	.pmu_disable= armpmu_pmu_disable,
+	.event_init = armpmu_event_init,
+	.enable	    = armpmu_enable,
+	.disable    = armpmu_disable,
+	.unthrottle = armpmu_unthrottle,
+	.read	    = armpmu_read,
+};
+
 /*
  * ARMv6 Performance counter handling code.
  *
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index c1408821dbc..deb84bbcb0e 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -517,7 +517,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
  * Disable all events to prevent PMU interrupts and to allow
  * events to be added or removed.
  */
-void hw_perf_disable(void)
+static void power_pmu_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -565,7 +565,7 @@ void hw_perf_disable(void)
  * If we were previously disabled and events were added, then
  * put the new config on the PMU.
  */
-void hw_perf_enable(void)
+static void power_pmu_pmu_enable(struct pmu *pmu)
 {
 	struct perf_event *event;
 	struct cpu_hw_events *cpuhw;
@@ -735,7 +735,7 @@ static int power_pmu_enable(struct perf_event *event)
 	int ret = -EAGAIN;
 
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	/*
 	 * Add the event to the list (if there is room)
@@ -769,7 +769,7 @@ nocheck:
 
 	ret = 0;
  out:
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 	return ret;
 }
@@ -784,7 +784,7 @@ static void power_pmu_disable(struct perf_event *event)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	power_pmu_read(event);
 
@@ -821,7 +821,7 @@ static void power_pmu_disable(struct perf_event *event)
 		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
 	}
 
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 }
 
@@ -837,7 +837,7 @@ static void power_pmu_unthrottle(struct perf_event *event)
 	if (!event->hw.idx || !event->hw.sample_period)
 		return;
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 	power_pmu_read(event);
 	left = event->hw.sample_period;
 	event->hw.last_period = left;
@@ -848,7 +848,7 @@ static void power_pmu_unthrottle(struct perf_event *event)
 	local64_set(&event->hw.prev_count, val);
 	local64_set(&event->hw.period_left, left);
 	perf_event_update_userpage(event);
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 }
 
@@ -861,7 +861,7 @@ void power_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	perf_disable();
+	perf_pmu_disable(pmu);
 	cpuhw->group_flag |= PERF_EVENT_TXN;
 	cpuhw->n_txn_start = cpuhw->n_events;
 }
@@ -876,7 +876,7 @@ void power_pmu_cancel_txn(struct pmu *pmu)
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
-	perf_enable();
+	perf_pmu_enable(pmu);
 }
 
 /*
@@ -903,7 +903,7 @@ int power_pmu_commit_txn(struct pmu *pmu)
 		cpuhw->event[i]->hw.config = cpuhw->events[i];
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
-	perf_enable();
+	perf_pmu_enable(pmu);
 	return 0;
 }
 
@@ -1131,6 +1131,8 @@ static int power_pmu_event_init(struct perf_event *event)
 }
 
 struct pmu power_pmu = {
+	.pmu_enable	= power_pmu_pmu_enable,
+	.pmu_disable	= power_pmu_pmu_disable,
 	.event_init	= power_pmu_event_init,
 	.enable		= power_pmu_enable,
 	.disable	= power_pmu_disable,
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c
index 9bc84a7fd90..84b1974c628 100644
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -177,7 +177,7 @@ static void fsl_emb_pmu_read(struct perf_event *event)
  * Disable all events to prevent PMU interrupts and to allow
  * events to be added or removed.
  */
-void hw_perf_disable(void)
+static void fsl_emb_pmu_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -216,7 +216,7 @@ void hw_perf_disable(void)
  * If we were previously disabled and events were added, then
  * put the new config on the PMU.
  */
-void hw_perf_enable(void)
+static void fsl_emb_pmu_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -271,7 +271,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 	u64 val;
 	int i;
 
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 	cpuhw = &get_cpu_var(cpu_hw_events);
 
 	if (event->hw.config & FSL_EMB_EVENT_RESTRICTED)
@@ -311,7 +311,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 	ret = 0;
  out:
 	put_cpu_var(cpu_hw_events);
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	return ret;
 }
 
@@ -321,7 +321,7 @@ static void fsl_emb_pmu_disable(struct perf_event *event)
 	struct cpu_hw_events *cpuhw;
 	int i = event->hw.idx;
 
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 	if (i < 0)
 		goto out;
 
@@ -349,7 +349,7 @@ static void fsl_emb_pmu_disable(struct perf_event *event)
 	cpuhw->n_events--;
 
  out:
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	put_cpu_var(cpu_hw_events);
 }
 
@@ -367,7 +367,7 @@ static void fsl_emb_pmu_unthrottle(struct perf_event *event)
 	if (event->hw.idx < 0 || !event->hw.sample_period)
 		return;
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 	fsl_emb_pmu_read(event);
 	left = event->hw.sample_period;
 	event->hw.last_period = left;
@@ -378,7 +378,7 @@ static void fsl_emb_pmu_unthrottle(struct perf_event *event)
 	local64_set(&event->hw.prev_count, val);
 	local64_set(&event->hw.period_left, left);
 	perf_event_update_userpage(event);
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 }
 
@@ -524,6 +524,8 @@ static int fsl_emb_pmu_event_init(struct perf_event *event)
 }
 
 static struct pmu fsl_emb_pmu = {
+	.pmu_enable	= fsl_emb_pmu_pmu_enable,
+	.pmu_disable	= fsl_emb_pmu_pmu_disable,
 	.event_init	= fsl_emb_pmu_event_init,
 	.enable		= fsl_emb_pmu_enable,
 	.disable	= fsl_emb_pmu_disable,
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index d042989ceb4..4bbe19058a5 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -232,7 +232,7 @@ static int sh_pmu_enable(struct perf_event *event)
 	int idx = hwc->idx;
 	int ret = -EAGAIN;
 
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	if (test_and_set_bit(idx, cpuc->used_mask)) {
 		idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events);
@@ -253,7 +253,7 @@ static int sh_pmu_enable(struct perf_event *event)
 	perf_event_update_userpage(event);
 	ret = 0;
 out:
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	return ret;
 }
 
@@ -285,7 +285,25 @@ static int sh_pmu_event_init(struct perf_event *event)
 	return err;
 }
 
+static void sh_pmu_pmu_enable(struct pmu *pmu)
+{
+	if (!sh_pmu_initialized())
+		return;
+
+	sh_pmu->enable_all();
+}
+
+static void sh_pmu_pmu_disable(struct pmu *pmu)
+{
+	if (!sh_pmu_initialized())
+		return;
+
+	sh_pmu->disable_all();
+}
+
 static struct pmu pmu = {
+	.pmu_enable	= sh_pmu_pmu_enable,
+	.pmu_disable	= sh_pmu_pmu_disable,
 	.event_init	= sh_pmu_event_init,
 	.enable		= sh_pmu_enable,
 	.disable	= sh_pmu_disable,
@@ -316,22 +334,6 @@ sh_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-void hw_perf_enable(void)
-{
-	if (!sh_pmu_initialized())
-		return;
-
-	sh_pmu->enable_all();
-}
-
-void hw_perf_disable(void)
-{
-	if (!sh_pmu_initialized())
-		return;
-
-	sh_pmu->disable_all();
-}
-
 int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
 {
 	if (sh_pmu)
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index d0131deeeaf..37cae676536 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -664,7 +664,7 @@ out:
 	return pcr;
 }
 
-void hw_perf_enable(void)
+static void sparc_pmu_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	u64 pcr;
@@ -691,7 +691,7 @@ void hw_perf_enable(void)
 	pcr_ops->write(cpuc->pcr);
 }
 
-void hw_perf_disable(void)
+static void sparc_pmu_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	u64 val;
@@ -718,7 +718,7 @@ static void sparc_pmu_disable(struct perf_event *event)
 	int i;
 
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	for (i = 0; i < cpuc->n_events; i++) {
 		if (event == cpuc->event[i]) {
@@ -748,7 +748,7 @@ static void sparc_pmu_disable(struct perf_event *event)
 		}
 	}
 
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 }
 
@@ -991,7 +991,7 @@ static int sparc_pmu_enable(struct perf_event *event)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 
 	n0 = cpuc->n_events;
 	if (n0 >= perf_max_events)
@@ -1020,7 +1020,7 @@ nocheck:
 
 	ret = 0;
 out:
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
 	return ret;
 }
@@ -1113,7 +1113,7 @@ static void sparc_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
-	perf_disable();
+	perf_pmu_disable(pmu);
 	cpuhw->group_flag |= PERF_EVENT_TXN;
 }
 
@@ -1127,7 +1127,7 @@ static void sparc_pmu_cancel_txn(struct pmu *pmu)
 	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
 
 	cpuhw->group_flag &= ~PERF_EVENT_TXN;
-	perf_enable();
+	perf_pmu_enable(pmu);
 }
 
 /*
@@ -1151,11 +1151,13 @@ static int sparc_pmu_commit_txn(struct pmu *pmu)
 		return -EAGAIN;
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN;
-	perf_enable();
+	perf_pmu_enable(pmu);
 	return 0;
 }
 
 static struct pmu pmu = {
+	.pmu_enable	= sparc_pmu_pmu_enable,
+	.pmu_disable	= sparc_pmu_pmu_disable,
 	.event_init	= sparc_pmu_event_init,
 	.enable		= sparc_pmu_enable,
 	.disable	= sparc_pmu_disable,
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 846070ce49c..79705ac4501 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -583,7 +583,7 @@ static void x86_pmu_disable_all(void)
 	}
 }
 
-void hw_perf_disable(void)
+static void x86_pmu_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -803,7 +803,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
 static int x86_pmu_start(struct perf_event *event);
 static void x86_pmu_stop(struct perf_event *event);
 
-void hw_perf_enable(void)
+static void x86_pmu_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct perf_event *event;
@@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event)
 
 	hwc = &event->hw;
 
-	perf_disable();
+	perf_pmu_disable(event->pmu);
 	n0 = cpuc->n_events;
 	ret = n = collect_events(cpuc, event, false);
 	if (ret < 0)
@@ -999,7 +999,7 @@ done_collect:
 
 	ret = 0;
 out:
-	perf_enable();
+	perf_pmu_enable(event->pmu);
 	return ret;
 }
 
@@ -1436,7 +1436,7 @@ static void x86_pmu_start_txn(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
-	perf_disable();
+	perf_pmu_disable(pmu);
 	cpuc->group_flag |= PERF_EVENT_TXN;
 	cpuc->n_txn = 0;
 }
@@ -1456,7 +1456,7 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
 	 */
 	cpuc->n_added -= cpuc->n_txn;
 	cpuc->n_events -= cpuc->n_txn;
-	perf_enable();
+	perf_pmu_enable(pmu);
 }
 
 /*
@@ -1486,7 +1486,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN;
-	perf_enable();
+	perf_pmu_enable(pmu);
 	return 0;
 }
 
@@ -1605,6 +1605,8 @@ int x86_pmu_event_init(struct perf_event *event)
 }
 
 static struct pmu pmu = {
+	.pmu_enable	= x86_pmu_pmu_enable,
+	.pmu_disable	= x86_pmu_pmu_disable,
 	.event_init	= x86_pmu_event_init,
 	.enable		= x86_pmu_enable,
 	.disable	= x86_pmu_disable,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 243286a8ded..6abf103fb7f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -563,6 +563,11 @@ struct perf_event;
 struct pmu {
 	struct list_head		entry;
 
+	int				*pmu_disable_count;
+
+	void (*pmu_enable)		(struct pmu *pmu);
+	void (*pmu_disable)		(struct pmu *pmu);
+
 	/*
 	 * Should return -ENOENT when the @event doesn't match this PMU.
 	 */
@@ -868,10 +873,8 @@ extern void perf_event_free_task(struct task_struct *task);
 extern void set_perf_event_pending(void);
 extern void perf_event_do_pending(void);
 extern void perf_event_print_debug(void);
-extern void __perf_disable(void);
-extern bool __perf_enable(void);
-extern void perf_disable(void);
-extern void perf_enable(void);
+extern void perf_pmu_disable(struct pmu *pmu);
+extern void perf_pmu_enable(struct pmu *pmu);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 extern void perf_event_update_userpage(struct perf_event *event);
@@ -1056,8 +1059,6 @@ static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_do_pending(void)				{ }
 static inline void perf_event_print_debug(void)				{ }
-static inline void perf_disable(void)					{ }
-static inline void perf_enable(void)					{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9a98ce95356..5ed0c06765b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -71,23 +71,20 @@ static atomic64_t perf_event_id;
  */
 static DEFINE_SPINLOCK(perf_resource_lock);
 
-void __weak hw_perf_disable(void)		{ barrier(); }
-void __weak hw_perf_enable(void)		{ barrier(); }
-
 void __weak perf_event_print_debug(void)	{ }
 
-static DEFINE_PER_CPU(int, perf_disable_count);
-
-void perf_disable(void)
+void perf_pmu_disable(struct pmu *pmu)
 {
-	if (!__get_cpu_var(perf_disable_count)++)
-		hw_perf_disable();
+	int *count = this_cpu_ptr(pmu->pmu_disable_count);
+	if (!(*count)++)
+		pmu->pmu_disable(pmu);
 }
 
-void perf_enable(void)
+void perf_pmu_enable(struct pmu *pmu)
 {
-	if (!--__get_cpu_var(perf_disable_count))
-		hw_perf_enable();
+	int *count = this_cpu_ptr(pmu->pmu_disable_count);
+	if (!--(*count))
+		pmu->pmu_enable(pmu);
 }
 
 static void get_ctx(struct perf_event_context *ctx)
@@ -4970,11 +4967,19 @@ static struct srcu_struct pmus_srcu;
 
 int perf_pmu_register(struct pmu *pmu)
 {
+	int ret;
+
 	mutex_lock(&pmus_lock);
+	ret = -ENOMEM;
+	pmu->pmu_disable_count = alloc_percpu(int);
+	if (!pmu->pmu_disable_count)
+		goto unlock;
 	list_add_rcu(&pmu->entry, &pmus);
+	ret = 0;
+unlock:
 	mutex_unlock(&pmus_lock);
 
-	return 0;
+	return ret;
 }
 
 void perf_pmu_unregister(struct pmu *pmu)
@@ -4984,6 +4989,8 @@ void perf_pmu_unregister(struct pmu *pmu)
 	mutex_unlock(&pmus_lock);
 
 	synchronize_srcu(&pmus_srcu);
+
+	free_percpu(pmu->pmu_disable_count);
 }
 
 struct pmu *perf_init_event(struct perf_event *event)
-- 
cgit v1.2.3-70-g09d2


From ad5133b7030d04ce7701aa7cbe98f561347c79c2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Jun 2010 12:22:39 +0200
Subject: perf: Default PMU ops

Provide default implementations for the pmu txn methods, this
allows us to remove some conditional code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 10 ++++----
 kernel/perf_event.c        | 64 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 57 insertions(+), 17 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6abf103fb7f..bf85733597e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -565,8 +565,8 @@ struct pmu {
 
 	int				*pmu_disable_count;
 
-	void (*pmu_enable)		(struct pmu *pmu);
-	void (*pmu_disable)		(struct pmu *pmu);
+	void (*pmu_enable)		(struct pmu *pmu); /* optional */
+	void (*pmu_disable)		(struct pmu *pmu); /* optional */
 
 	/*
 	 * Should return -ENOENT when the @event doesn't match this PMU.
@@ -590,19 +590,19 @@ struct pmu {
 	 * Start the transaction, after this ->enable() doesn't need to
 	 * do schedulability tests.
 	 */
-	void (*start_txn)	(struct pmu *pmu);
+	void (*start_txn)	(struct pmu *pmu); /* optional */
 	/*
 	 * If ->start_txn() disabled the ->enable() schedulability test
 	 * then ->commit_txn() is required to perform one. On success
 	 * the transaction is closed. On error the transaction is kept
 	 * open until ->cancel_txn() is called.
 	 */
-	int  (*commit_txn)	(struct pmu *pmu);
+	int  (*commit_txn)	(struct pmu *pmu); /* optional */
 	/*
 	 * Will cancel the transaction, assumes ->disable() is called
 	 * for each successfull ->enable() during the transaction.
 	 */
-	void (*cancel_txn)	(struct pmu *pmu);
+	void (*cancel_txn)	(struct pmu *pmu); /* optional */
 };
 
 /**
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 5ed0c06765b..8ef4ba3bcb1 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -674,21 +674,14 @@ group_sched_in(struct perf_event *group_event,
 {
 	struct perf_event *event, *partial_group = NULL;
 	struct pmu *pmu = group_event->pmu;
-	bool txn = false;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	/* Check if group transaction availabe */
-	if (pmu->start_txn)
-		txn = true;
-
-	if (txn)
-		pmu->start_txn(pmu);
+	pmu->start_txn(pmu);
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
-		if (txn)
-			pmu->cancel_txn(pmu);
+		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
 
@@ -702,7 +695,7 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!txn || !pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu))
 		return 0;
 
 group_error:
@@ -717,8 +710,7 @@ group_error:
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
-	if (txn)
-		pmu->cancel_txn(pmu);
+	pmu->cancel_txn(pmu);
 
 	return -EAGAIN;
 }
@@ -4965,6 +4957,31 @@ static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
 static struct srcu_struct pmus_srcu;
 
+static void perf_pmu_nop_void(struct pmu *pmu)
+{
+}
+
+static int perf_pmu_nop_int(struct pmu *pmu)
+{
+	return 0;
+}
+
+static void perf_pmu_start_txn(struct pmu *pmu)
+{
+	perf_pmu_disable(pmu);
+}
+
+static int perf_pmu_commit_txn(struct pmu *pmu)
+{
+	perf_pmu_enable(pmu);
+	return 0;
+}
+
+static void perf_pmu_cancel_txn(struct pmu *pmu)
+{
+	perf_pmu_enable(pmu);
+}
+
 int perf_pmu_register(struct pmu *pmu)
 {
 	int ret;
@@ -4974,6 +4991,29 @@ int perf_pmu_register(struct pmu *pmu)
 	pmu->pmu_disable_count = alloc_percpu(int);
 	if (!pmu->pmu_disable_count)
 		goto unlock;
+
+	if (!pmu->start_txn) {
+		if (pmu->pmu_enable) {
+			/*
+			 * If we have pmu_enable/pmu_disable calls, install
+			 * transaction stubs that use that to try and batch
+			 * hardware accesses.
+			 */
+			pmu->start_txn  = perf_pmu_start_txn;
+			pmu->commit_txn = perf_pmu_commit_txn;
+			pmu->cancel_txn = perf_pmu_cancel_txn;
+		} else {
+			pmu->start_txn  = perf_pmu_nop_void;
+			pmu->commit_txn = perf_pmu_nop_int;
+			pmu->cancel_txn = perf_pmu_nop_void;
+		}
+	}
+
+	if (!pmu->pmu_enable) {
+		pmu->pmu_enable  = perf_pmu_nop_void;
+		pmu->pmu_disable = perf_pmu_nop_void;
+	}
+
 	list_add_rcu(&pmu->entry, &pmus);
 	ret = 0;
 unlock:
-- 
cgit v1.2.3-70-g09d2


From fa407f35e0298d841e4088f95a7f9cf6e725c6d5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 24 Jun 2010 12:35:12 +0200
Subject: perf: Shrink hw_perf_event

Use hw_perf_event::period_left instead of hw_perf_event::remaining
and win back 8 bytes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  1 -
 kernel/perf_event.c        | 13 ++++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index bf85733597e..8cafa15af60 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -529,7 +529,6 @@ struct hw_perf_event {
 			int		last_cpu;
 		};
 		struct { /* software */
-			s64		remaining;
 			struct hrtimer	hrtimer;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8ef4ba3bcb1..1a6cdbf0d09 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4800,14 +4800,13 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swevent_hrtimer;
 	if (hwc->sample_period) {
-		u64 period;
+		s64 period = local64_read(&hwc->period_left);
 
-		if (hwc->remaining) {
-			if (hwc->remaining < 0)
+		if (period) {
+			if (period < 0)
 				period = 10000;
-			else
-				period = hwc->remaining;
-			hwc->remaining = 0;
+
+			local64_set(&hwc->period_left, 0);
 		} else {
 			period = max_t(u64, 10000, hwc->sample_period);
 		}
@@ -4823,7 +4822,7 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 
 	if (hwc->sample_period) {
 		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
-		hwc->remaining = ktime_to_ns(remaining);
+		local64_set(&hwc->period_left, ktime_to_ns(remaining));
 
 		hrtimer_cancel(&hwc->hrtimer);
 	}
-- 
cgit v1.2.3-70-g09d2


From a4eaf7f14675cb512d69f0c928055e73d0c6d252 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Jun 2010 14:37:10 +0200
Subject: perf: Rework the PMU methods

Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.

The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.

This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).

It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).

The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:

 1) We disable the counter:
    a) the pmu has per-counter enable bits, we flip that
    b) we program a NOP event, preserving the counter state

 2) We store the counter state and ignore all read/overflow events

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/perf_event.c            |  71 +++++++++++----
 arch/arm/kernel/perf_event.c              |  96 ++++++++++++--------
 arch/powerpc/kernel/perf_event.c          | 105 ++++++++++++++--------
 arch/powerpc/kernel/perf_event_fsl_emb.c  | 107 ++++++++++++++---------
 arch/sh/kernel/perf_event.c               |  75 +++++++++++-----
 arch/sparc/kernel/perf_event.c            | 109 ++++++++++++++---------
 arch/x86/kernel/cpu/perf_event.c          | 106 ++++++++++++----------
 arch/x86/kernel/cpu/perf_event_intel.c    |   2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c |   2 +-
 include/linux/ftrace_event.h              |   4 +-
 include/linux/perf_event.h                |  54 +++++++++---
 kernel/hw_breakpoint.c                    |  29 ++++++-
 kernel/perf_event.c                       | 140 +++++++++++++++---------------
 kernel/trace/trace_event_perf.c           |   7 +-
 14 files changed, 576 insertions(+), 331 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 3e260731f8e..380ef02d557 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -307,7 +307,7 @@ again:
 			     new_raw_count) != prev_raw_count)
 		goto again;
 
-	delta = (new_raw_count  - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
+	delta = (new_raw_count - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
 
 	/* It is possible on very rare occasions that the PMC has overflowed
 	 * but the interrupt is yet to come.  Detect and fix this situation.
@@ -402,14 +402,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
 		struct hw_perf_event *hwc = &pe->hw;
 		int idx = hwc->idx;
 
-		if (cpuc->current_idx[j] != PMC_NO_INDEX) {
-			cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
-			continue;
+		if (cpuc->current_idx[j] == PMC_NO_INDEX) {
+			alpha_perf_event_set_period(pe, hwc, idx);
+			cpuc->current_idx[j] = idx;
 		}
 
-		alpha_perf_event_set_period(pe, hwc, idx);
-		cpuc->current_idx[j] = idx;
-		cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
+		if (!(hwc->state & PERF_HES_STOPPED))
+			cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
 	}
 	cpuc->config = cpuc->event[0]->hw.config_base;
 }
@@ -420,7 +419,7 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
  *  - this function is called from outside this module via the pmu struct
  *    returned from perf event initialisation.
  */
-static int alpha_pmu_enable(struct perf_event *event)
+static int alpha_pmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int n0;
@@ -455,6 +454,10 @@ static int alpha_pmu_enable(struct perf_event *event)
 		}
 	}
 
+	hwc->state = PERF_HES_UPTODATE;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_STOPPED;
+
 	local_irq_restore(flags);
 	perf_pmu_enable(event->pmu);
 
@@ -467,7 +470,7 @@ static int alpha_pmu_enable(struct perf_event *event)
  *  - this function is called from outside this module via the pmu struct
  *    returned from perf event initialisation.
  */
-static void alpha_pmu_disable(struct perf_event *event)
+static void alpha_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
@@ -514,13 +517,44 @@ static void alpha_pmu_read(struct perf_event *event)
 }
 
 
-static void alpha_pmu_unthrottle(struct perf_event *event)
+static void alpha_pmu_stop(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		cpuc->idx_mask &= !(1UL<<hwc->idx);
+		hwc->state |= PERF_HES_STOPPED;
+	}
+
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		alpha_perf_event_update(event, hwc, hwc->idx, 0);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+
+	if (cpuc->enabled)
+		wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
+}
+
+
+static void alpha_pmu_start(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	if (flags & PERF_EF_RELOAD) {
+		WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+		alpha_perf_event_set_period(event, hwc, hwc->idx);
+	}
+
+	hwc->state = 0;
+
 	cpuc->idx_mask |= 1UL<<hwc->idx;
-	wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
+	if (cpuc->enabled)
+		wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
 }
 
 
@@ -671,7 +705,7 @@ static int alpha_pmu_event_init(struct perf_event *event)
 /*
  * Main entry point - enable HW performance counters.
  */
-static void alpha_pmu_pmu_enable(struct pmu *pmu)
+static void alpha_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -697,7 +731,7 @@ static void alpha_pmu_pmu_enable(struct pmu *pmu)
  * Main entry point - disable HW performance counters.
  */
 
-static void alpha_pmu_pmu_disable(struct pmu *pmu)
+static void alpha_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -711,13 +745,14 @@ static void alpha_pmu_pmu_disable(struct pmu *pmu)
 }
 
 static struct pmu pmu = {
-	.pmu_enable	= alpha_pmu_pmu_enable,
-	.pmu_disable	= alpha_pmu_pmu_disable,
+	.pmu_enable	= alpha_pmu_enable,
+	.pmu_disable	= alpha_pmu_disable,
 	.event_init	= alpha_pmu_event_init,
-	.enable		= alpha_pmu_enable,
-	.disable	= alpha_pmu_disable,
+	.add		= alpha_pmu_add,
+	.del		= alpha_pmu_del,
+	.start		= alpha_pmu_start,
+	.stop		= alpha_pmu_stop,
 	.read		= alpha_pmu_read,
-	.unthrottle	= alpha_pmu_unthrottle,
 };
 
 
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 3343f3f4b97..448cfa6b3ef 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -221,46 +221,56 @@ again:
 }
 
 static void
-armpmu_disable(struct perf_event *event)
+armpmu_read(struct perf_event *event)
 {
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-
-	WARN_ON(idx < 0);
-
-	clear_bit(idx, cpuc->active_mask);
-	armpmu->disable(hwc, idx);
-
-	barrier();
 
-	armpmu_event_update(event, hwc, idx);
-	cpuc->events[idx] = NULL;
-	clear_bit(idx, cpuc->used_mask);
+	/* Don't read disabled counters! */
+	if (hwc->idx < 0)
+		return;
 
-	perf_event_update_userpage(event);
+	armpmu_event_update(event, hwc, hwc->idx);
 }
 
 static void
-armpmu_read(struct perf_event *event)
+armpmu_stop(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	/* Don't read disabled counters! */
-	if (hwc->idx < 0)
+	if (!armpmu)
 		return;
 
-	armpmu_event_update(event, hwc, hwc->idx);
+	/*
+	 * ARM pmu always has to update the counter, so ignore
+	 * PERF_EF_UPDATE, see comments in armpmu_start().
+	 */
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		armpmu->disable(hwc, hwc->idx);
+		barrier(); /* why? */
+		armpmu_event_update(event, hwc, hwc->idx);
+		hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	}
 }
 
 static void
-armpmu_unthrottle(struct perf_event *event)
+armpmu_start(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
+	if (!armpmu)
+		return;
+
+	/*
+	 * ARM pmu always has to reprogram the period, so ignore
+	 * PERF_EF_RELOAD, see the comment below.
+	 */
+	if (flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+
+	hwc->state = 0;
 	/*
 	 * Set the period again. Some counters can't be stopped, so when we
-	 * were throttled we simply disabled the IRQ source and the counter
+	 * were stopped we simply disabled the IRQ source and the counter
 	 * may have been left counting. If we don't do this step then we may
 	 * get an interrupt too soon or *way* too late if the overflow has
 	 * happened since disabling.
@@ -269,8 +279,25 @@ armpmu_unthrottle(struct perf_event *event)
 	armpmu->enable(hwc, hwc->idx);
 }
 
+static void
+armpmu_del(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	WARN_ON(idx < 0);
+
+	clear_bit(idx, cpuc->active_mask);
+	armpmu_stop(event, PERF_EF_UPDATE);
+	cpuc->events[idx] = NULL;
+	clear_bit(idx, cpuc->used_mask);
+
+	perf_event_update_userpage(event);
+}
+
 static int
-armpmu_enable(struct perf_event *event)
+armpmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
@@ -295,11 +322,9 @@ armpmu_enable(struct perf_event *event)
 	cpuc->events[idx] = event;
 	set_bit(idx, cpuc->active_mask);
 
-	/* Set the period for the event. */
-	armpmu_event_set_period(event, hwc, idx);
-
-	/* Enable the event. */
-	armpmu->enable(hwc, idx);
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	if (flags & PERF_EF_START)
+		armpmu_start(event, PERF_EF_RELOAD);
 
 	/* Propagate our changes to the userspace mapping. */
 	perf_event_update_userpage(event);
@@ -534,7 +559,7 @@ static int armpmu_event_init(struct perf_event *event)
 	return err;
 }
 
-static void armpmu_pmu_enable(struct pmu *pmu)
+static void armpmu_enable(struct pmu *pmu)
 {
 	/* Enable all of the perf events on hardware. */
 	int idx;
@@ -555,20 +580,21 @@ static void armpmu_pmu_enable(struct pmu *pmu)
 	armpmu->start();
 }
 
-static void armpmu_pmu_disable(struct pmu *pmu)
+static void armpmu_disable(struct pmu *pmu)
 {
 	if (armpmu)
 		armpmu->stop();
 }
 
 static struct pmu pmu = {
-	.pmu_enable = armpmu_pmu_enable,
-	.pmu_disable= armpmu_pmu_disable,
-	.event_init = armpmu_event_init,
-	.enable	    = armpmu_enable,
-	.disable    = armpmu_disable,
-	.unthrottle = armpmu_unthrottle,
-	.read	    = armpmu_read,
+	.pmu_enable	= armpmu_enable,
+	.pmu_disable	= armpmu_disable,
+	.event_init	= armpmu_event_init,
+	.add		= armpmu_add,
+	.del		= armpmu_del,
+	.start		= armpmu_start,
+	.stop		= armpmu_stop,
+	.read		= armpmu_read,
 };
 
 /*
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index deb84bbcb0e..9cb4924b6c0 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -402,6 +402,9 @@ static void power_pmu_read(struct perf_event *event)
 {
 	s64 val, delta, prev;
 
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
 	if (!event->hw.idx)
 		return;
 	/*
@@ -517,7 +520,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
  * Disable all events to prevent PMU interrupts and to allow
  * events to be added or removed.
  */
-static void power_pmu_pmu_disable(struct pmu *pmu)
+static void power_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -565,7 +568,7 @@ static void power_pmu_pmu_disable(struct pmu *pmu)
  * If we were previously disabled and events were added, then
  * put the new config on the PMU.
  */
-static void power_pmu_pmu_enable(struct pmu *pmu)
+static void power_pmu_enable(struct pmu *pmu)
 {
 	struct perf_event *event;
 	struct cpu_hw_events *cpuhw;
@@ -672,6 +675,8 @@ static void power_pmu_pmu_enable(struct pmu *pmu)
 		}
 		local64_set(&event->hw.prev_count, val);
 		event->hw.idx = idx;
+		if (event->hw.state & PERF_HES_STOPPED)
+			val = 0;
 		write_pmc(idx, val);
 		perf_event_update_userpage(event);
 	}
@@ -727,7 +732,7 @@ static int collect_events(struct perf_event *group, int max_count,
  * re-enable the PMU in order to get hw_perf_enable to do the
  * actual work of reconfiguring the PMU.
  */
-static int power_pmu_enable(struct perf_event *event)
+static int power_pmu_add(struct perf_event *event, int ef_flags)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -749,6 +754,9 @@ static int power_pmu_enable(struct perf_event *event)
 	cpuhw->events[n0] = event->hw.config;
 	cpuhw->flags[n0] = event->hw.event_base;
 
+	if (!(ef_flags & PERF_EF_START))
+		event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
 	/*
 	 * If group events scheduling transaction was started,
 	 * skip the schedulability test here, it will be peformed
@@ -777,7 +785,7 @@ nocheck:
 /*
  * Remove a event from the PMU.
  */
-static void power_pmu_disable(struct perf_event *event)
+static void power_pmu_del(struct perf_event *event, int ef_flags)
 {
 	struct cpu_hw_events *cpuhw;
 	long i;
@@ -826,27 +834,53 @@ static void power_pmu_disable(struct perf_event *event)
 }
 
 /*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
+ * POWER-PMU does not support disabling individual counters, hence
+ * program their cycle counter to their max value and ignore the interrupts.
  */
-static void power_pmu_unthrottle(struct perf_event *event)
+
+static void power_pmu_start(struct perf_event *event, int ef_flags)
 {
-	s64 val, left;
 	unsigned long flags;
+	s64 left;
 
 	if (!event->hw.idx || !event->hw.sample_period)
 		return;
+
+	if (!(event->hw.state & PERF_HES_STOPPED))
+		return;
+
+	if (ef_flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+	local_irq_save(flags);
+	perf_pmu_disable(event->pmu);
+
+	event->hw.state = 0;
+	left = local64_read(&event->hw.period_left);
+	write_pmc(event->hw.idx, left);
+
+	perf_event_update_userpage(event);
+	perf_pmu_enable(event->pmu);
+	local_irq_restore(flags);
+}
+
+static void power_pmu_stop(struct perf_event *event, int ef_flags)
+{
+	unsigned long flags;
+
+	if (!event->hw.idx || !event->hw.sample_period)
+		return;
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
 	local_irq_save(flags);
 	perf_pmu_disable(event->pmu);
+
 	power_pmu_read(event);
-	left = event->hw.sample_period;
-	event->hw.last_period = left;
-	val = 0;
-	if (left < 0x80000000L)
-		val = 0x80000000L - left;
-	write_pmc(event->hw.idx, val);
-	local64_set(&event->hw.prev_count, val);
-	local64_set(&event->hw.period_left, left);
+	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	write_pmc(event->hw.idx, 0);
+
 	perf_event_update_userpage(event);
 	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
@@ -1131,13 +1165,14 @@ static int power_pmu_event_init(struct perf_event *event)
 }
 
 struct pmu power_pmu = {
-	.pmu_enable	= power_pmu_pmu_enable,
-	.pmu_disable	= power_pmu_pmu_disable,
+	.pmu_enable	= power_pmu_enable,
+	.pmu_disable	= power_pmu_disable,
 	.event_init	= power_pmu_event_init,
-	.enable		= power_pmu_enable,
-	.disable	= power_pmu_disable,
+	.add		= power_pmu_add,
+	.del		= power_pmu_del,
+	.start		= power_pmu_start,
+	.stop		= power_pmu_stop,
 	.read		= power_pmu_read,
-	.unthrottle	= power_pmu_unthrottle,
 	.start_txn	= power_pmu_start_txn,
 	.cancel_txn	= power_pmu_cancel_txn,
 	.commit_txn	= power_pmu_commit_txn,
@@ -1155,6 +1190,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	s64 prev, delta, left;
 	int record = 0;
 
+	if (event->hw.state & PERF_HES_STOPPED) {
+		write_pmc(event->hw.idx, 0);
+		return;
+	}
+
 	/* we don't have to worry about interrupts here */
 	prev = local64_read(&event->hw.prev_count);
 	delta = (val - prev) & 0xfffffffful;
@@ -1177,6 +1217,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 			val = 0x80000000LL - left;
 	}
 
+	write_pmc(event->hw.idx, val);
+	local64_set(&event->hw.prev_count, val);
+	local64_set(&event->hw.period_left, left);
+	perf_event_update_userpage(event);
+
 	/*
 	 * Finally record data if requested.
 	 */
@@ -1189,23 +1234,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
 			perf_get_data_addr(regs, &data.addr);
 
-		if (perf_event_overflow(event, nmi, &data, regs)) {
-			/*
-			 * Interrupts are coming too fast - throttle them
-			 * by setting the event to 0, so it will be
-			 * at least 2^30 cycles until the next interrupt
-			 * (assuming each event counts at most 2 counts
-			 * per cycle).
-			 */
-			val = 0;
-			left = ~0ULL >> 1;
-		}
+		if (perf_event_overflow(event, nmi, &data, regs))
+			power_pmu_stop(event, 0);
 	}
-
-	write_pmc(event->hw.idx, val);
-	local64_set(&event->hw.prev_count, val);
-	local64_set(&event->hw.period_left, left);
-	perf_event_update_userpage(event);
 }
 
 /*
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c
index 84b1974c628..7ecca59ddf7 100644
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -156,6 +156,9 @@ static void fsl_emb_pmu_read(struct perf_event *event)
 {
 	s64 val, delta, prev;
 
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
 	/*
 	 * Performance monitor interrupts come even when interrupts
 	 * are soft-disabled, as long as interrupts are hard-enabled.
@@ -177,7 +180,7 @@ static void fsl_emb_pmu_read(struct perf_event *event)
  * Disable all events to prevent PMU interrupts and to allow
  * events to be added or removed.
  */
-static void fsl_emb_pmu_pmu_disable(struct pmu *pmu)
+static void fsl_emb_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -216,7 +219,7 @@ static void fsl_emb_pmu_pmu_disable(struct pmu *pmu)
  * If we were previously disabled and events were added, then
  * put the new config on the PMU.
  */
-static void fsl_emb_pmu_pmu_enable(struct pmu *pmu)
+static void fsl_emb_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuhw;
 	unsigned long flags;
@@ -263,7 +266,7 @@ static int collect_events(struct perf_event *group, int max_count,
 }
 
 /* context locked on entry */
-static int fsl_emb_pmu_enable(struct perf_event *event)
+static int fsl_emb_pmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuhw;
 	int ret = -EAGAIN;
@@ -302,6 +305,12 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 			val = 0x80000000L - left;
 	}
 	local64_set(&event->hw.prev_count, val);
+
+	if (!(flags & PERF_EF_START)) {
+		event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+		val = 0;
+	}
+
 	write_pmc(i, val);
 	perf_event_update_userpage(event);
 
@@ -316,7 +325,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
 }
 
 /* context locked on entry */
-static void fsl_emb_pmu_disable(struct perf_event *event)
+static void fsl_emb_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuhw;
 	int i = event->hw.idx;
@@ -353,30 +362,49 @@ static void fsl_emb_pmu_disable(struct perf_event *event)
 	put_cpu_var(cpu_hw_events);
 }
 
-/*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
- *
- * Context is locked on entry, but perf is not disabled.
- */
-static void fsl_emb_pmu_unthrottle(struct perf_event *event)
+static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags)
+{
+	unsigned long flags;
+	s64 left;
+
+	if (event->hw.idx < 0 || !event->hw.sample_period)
+		return;
+
+	if (!(event->hw.state & PERF_HES_STOPPED))
+		return;
+
+	if (ef_flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+	local_irq_save(flags);
+	perf_pmu_disable(event->pmu);
+
+	event->hw.state = 0;
+	left = local64_read(&event->hw.period_left);
+	write_pmc(event->hw.idx, left);
+
+	perf_event_update_userpage(event);
+	perf_pmu_enable(event->pmu);
+	local_irq_restore(flags);
+}
+
+static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags)
 {
-	s64 val, left;
 	unsigned long flags;
 
 	if (event->hw.idx < 0 || !event->hw.sample_period)
 		return;
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
 	local_irq_save(flags);
 	perf_pmu_disable(event->pmu);
+
 	fsl_emb_pmu_read(event);
-	left = event->hw.sample_period;
-	event->hw.last_period = left;
-	val = 0;
-	if (left < 0x80000000L)
-		val = 0x80000000L - left;
-	write_pmc(event->hw.idx, val);
-	local64_set(&event->hw.prev_count, val);
-	local64_set(&event->hw.period_left, left);
+	event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	write_pmc(event->hw.idx, 0);
+
 	perf_event_update_userpage(event);
 	perf_pmu_enable(event->pmu);
 	local_irq_restore(flags);
@@ -524,13 +552,14 @@ static int fsl_emb_pmu_event_init(struct perf_event *event)
 }
 
 static struct pmu fsl_emb_pmu = {
-	.pmu_enable	= fsl_emb_pmu_pmu_enable,
-	.pmu_disable	= fsl_emb_pmu_pmu_disable,
+	.pmu_enable	= fsl_emb_pmu_enable,
+	.pmu_disable	= fsl_emb_pmu_disable,
 	.event_init	= fsl_emb_pmu_event_init,
-	.enable		= fsl_emb_pmu_enable,
-	.disable	= fsl_emb_pmu_disable,
+	.add		= fsl_emb_pmu_add,
+	.del		= fsl_emb_pmu_del,
+	.start		= fsl_emb_pmu_start,
+	.stop		= fsl_emb_pmu_stop,
 	.read		= fsl_emb_pmu_read,
-	.unthrottle	= fsl_emb_pmu_unthrottle,
 };
 
 /*
@@ -545,6 +574,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 	s64 prev, delta, left;
 	int record = 0;
 
+	if (event->hw.state & PERF_HES_STOPPED) {
+		write_pmc(event->hw.idx, 0);
+		return;
+	}
+
 	/* we don't have to worry about interrupts here */
 	prev = local64_read(&event->hw.prev_count);
 	delta = (val - prev) & 0xfffffffful;
@@ -567,6 +601,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 			val = 0x80000000LL - left;
 	}
 
+	write_pmc(event->hw.idx, val);
+	local64_set(&event->hw.prev_count, val);
+	local64_set(&event->hw.period_left, left);
+	perf_event_update_userpage(event);
+
 	/*
 	 * Finally record data if requested.
 	 */
@@ -576,23 +615,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
-		if (perf_event_overflow(event, nmi, &data, regs)) {
-			/*
-			 * Interrupts are coming too fast - throttle them
-			 * by setting the event to 0, so it will be
-			 * at least 2^30 cycles until the next interrupt
-			 * (assuming each event counts at most 2 counts
-			 * per cycle).
-			 */
-			val = 0;
-			left = ~0ULL >> 1;
-		}
+		if (perf_event_overflow(event, nmi, &data, regs))
+			fsl_emb_pmu_stop(event, 0);
 	}
-
-	write_pmc(event->hw.idx, val);
-	local64_set(&event->hw.prev_count, val);
-	local64_set(&event->hw.period_left, left);
-	perf_event_update_userpage(event);
 }
 
 static void perf_event_interrupt(struct pt_regs *regs)
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 4bbe19058a5..cf39c487346 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -206,26 +206,52 @@ again:
 	local64_add(delta, &event->count);
 }
 
-static void sh_pmu_disable(struct perf_event *event)
+static void sh_pmu_stop(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
 
-	clear_bit(idx, cpuc->active_mask);
-	sh_pmu->disable(hwc, idx);
+	if (!(event->hw.state & PERF_HES_STOPPED)) {
+		sh_pmu->disable(hwc, idx);
+		cpuc->events[idx] = NULL;
+		event->hw.state |= PERF_HES_STOPPED;
+	}
 
-	barrier();
+	if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
+		sh_perf_event_update(event, &event->hw, idx);
+		event->hw.state |= PERF_HES_UPTODATE;
+	}
+}
 
-	sh_perf_event_update(event, &event->hw, idx);
+static void sh_pmu_start(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
 
-	cpuc->events[idx] = NULL;
-	clear_bit(idx, cpuc->used_mask);
+	if (WARN_ON_ONCE(idx == -1))
+		return;
+
+	if (flags & PERF_EF_RELOAD)
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+	cpuc->events[idx] = event;
+	event->hw.state = 0;
+	sh_pmu->enable(hwc, idx);
+}
+
+static void sh_pmu_del(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	sh_pmu_stop(event, PERF_EF_UPDATE);
+	__clear_bit(event->hw.idx, cpuc->used_mask);
 
 	perf_event_update_userpage(event);
 }
 
-static int sh_pmu_enable(struct perf_event *event)
+static int sh_pmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
@@ -234,21 +260,20 @@ static int sh_pmu_enable(struct perf_event *event)
 
 	perf_pmu_disable(event->pmu);
 
-	if (test_and_set_bit(idx, cpuc->used_mask)) {
+	if (__test_and_set_bit(idx, cpuc->used_mask)) {
 		idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events);
 		if (idx == sh_pmu->num_events)
 			goto out;
 
-		set_bit(idx, cpuc->used_mask);
+		__set_bit(idx, cpuc->used_mask);
 		hwc->idx = idx;
 	}
 
 	sh_pmu->disable(hwc, idx);
 
-	cpuc->events[idx] = event;
-	set_bit(idx, cpuc->active_mask);
-
-	sh_pmu->enable(hwc, idx);
+	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (flags & PERF_EF_START)
+		sh_pmu_start(event, PERF_EF_RELOAD);
 
 	perf_event_update_userpage(event);
 	ret = 0;
@@ -285,7 +310,7 @@ static int sh_pmu_event_init(struct perf_event *event)
 	return err;
 }
 
-static void sh_pmu_pmu_enable(struct pmu *pmu)
+static void sh_pmu_enable(struct pmu *pmu)
 {
 	if (!sh_pmu_initialized())
 		return;
@@ -293,7 +318,7 @@ static void sh_pmu_pmu_enable(struct pmu *pmu)
 	sh_pmu->enable_all();
 }
 
-static void sh_pmu_pmu_disable(struct pmu *pmu)
+static void sh_pmu_disable(struct pmu *pmu)
 {
 	if (!sh_pmu_initialized())
 		return;
@@ -302,11 +327,13 @@ static void sh_pmu_pmu_disable(struct pmu *pmu)
 }
 
 static struct pmu pmu = {
-	.pmu_enable	= sh_pmu_pmu_enable,
-	.pmu_disable	= sh_pmu_pmu_disable,
+	.pmu_enable	= sh_pmu_enable,
+	.pmu_disable	= sh_pmu_disable,
 	.event_init	= sh_pmu_event_init,
-	.enable		= sh_pmu_enable,
-	.disable	= sh_pmu_disable,
+	.add		= sh_pmu_add,
+	.del		= sh_pmu_del,
+	.start		= sh_pmu_start,
+	.stop		= sh_pmu_stop,
 	.read		= sh_pmu_read,
 };
 
@@ -334,15 +361,15 @@ sh_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
+int __cpuinit register_sh_pmu(struct sh_pmu *_pmu)
 {
 	if (sh_pmu)
 		return -EBUSY;
-	sh_pmu = pmu;
+	sh_pmu = _pmu;
 
-	pr_info("Performance Events: %s support registered\n", pmu->name);
+	pr_info("Performance Events: %s support registered\n", _pmu->name);
 
-	WARN_ON(pmu->num_events > MAX_HWEVENTS);
+	WARN_ON(_pmu->num_events > MAX_HWEVENTS);
 
 	perf_pmu_register(&pmu);
 	perf_cpu_notifier(sh_pmu_notifier);
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 37cae676536..516be2314b5 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -658,13 +658,16 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
 
 		enc = perf_event_get_enc(cpuc->events[i]);
 		pcr &= ~mask_for_index(idx);
-		pcr |= event_encoding(enc, idx);
+		if (hwc->state & PERF_HES_STOPPED)
+			pcr |= nop_for_index(idx);
+		else
+			pcr |= event_encoding(enc, idx);
 	}
 out:
 	return pcr;
 }
 
-static void sparc_pmu_pmu_enable(struct pmu *pmu)
+static void sparc_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	u64 pcr;
@@ -691,7 +694,7 @@ static void sparc_pmu_pmu_enable(struct pmu *pmu)
 	pcr_ops->write(cpuc->pcr);
 }
 
-static void sparc_pmu_pmu_disable(struct pmu *pmu)
+static void sparc_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	u64 val;
@@ -710,10 +713,53 @@ static void sparc_pmu_pmu_disable(struct pmu *pmu)
 	pcr_ops->write(cpuc->pcr);
 }
 
-static void sparc_pmu_disable(struct perf_event *event)
+static int active_event_index(struct cpu_hw_events *cpuc,
+			      struct perf_event *event)
+{
+	int i;
+
+	for (i = 0; i < cpuc->n_events; i++) {
+		if (cpuc->event[i] == event)
+			break;
+	}
+	BUG_ON(i == cpuc->n_events);
+	return cpuc->current_idx[i];
+}
+
+static void sparc_pmu_start(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx = active_event_index(cpuc, event);
+
+	if (flags & PERF_EF_RELOAD) {
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+		sparc_perf_event_set_period(event, &event->hw, idx);
+	}
+
+	event->hw.state = 0;
+
+	sparc_pmu_enable_event(cpuc, &event->hw, idx);
+}
+
+static void sparc_pmu_stop(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx = active_event_index(cpuc, event);
+
+	if (!(event->hw.state & PERF_HES_STOPPED)) {
+		sparc_pmu_disable_event(cpuc, &event->hw, idx);
+		event->hw.state |= PERF_HES_STOPPED;
+	}
+
+	if (!(event->hw.state & PERF_HES_UPTODATE) && (flags & PERF_EF_UPDATE)) {
+		sparc_perf_event_update(event, &event->hw, idx);
+		event->hw.state |= PERF_HES_UPTODATE;
+	}
+}
+
+static void sparc_pmu_del(struct perf_event *event, int _flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct hw_perf_event *hwc = &event->hw;
 	unsigned long flags;
 	int i;
 
@@ -722,7 +768,10 @@ static void sparc_pmu_disable(struct perf_event *event)
 
 	for (i = 0; i < cpuc->n_events; i++) {
 		if (event == cpuc->event[i]) {
-			int idx = cpuc->current_idx[i];
+			/* Absorb the final count and turn off the
+			 * event.
+			 */
+			sparc_pmu_stop(event, PERF_EF_UPDATE);
 
 			/* Shift remaining entries down into
 			 * the existing slot.
@@ -734,13 +783,6 @@ static void sparc_pmu_disable(struct perf_event *event)
 					cpuc->current_idx[i];
 			}
 
-			/* Absorb the final count and turn off the
-			 * event.
-			 */
-			sparc_pmu_disable_event(cpuc, hwc, idx);
-			barrier();
-			sparc_perf_event_update(event, hwc, idx);
-
 			perf_event_update_userpage(event);
 
 			cpuc->n_events--;
@@ -752,19 +794,6 @@ static void sparc_pmu_disable(struct perf_event *event)
 	local_irq_restore(flags);
 }
 
-static int active_event_index(struct cpu_hw_events *cpuc,
-			      struct perf_event *event)
-{
-	int i;
-
-	for (i = 0; i < cpuc->n_events; i++) {
-		if (cpuc->event[i] == event)
-			break;
-	}
-	BUG_ON(i == cpuc->n_events);
-	return cpuc->current_idx[i];
-}
-
 static void sparc_pmu_read(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -774,15 +803,6 @@ static void sparc_pmu_read(struct perf_event *event)
 	sparc_perf_event_update(event, hwc, idx);
 }
 
-static void sparc_pmu_unthrottle(struct perf_event *event)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	int idx = active_event_index(cpuc, event);
-	struct hw_perf_event *hwc = &event->hw;
-
-	sparc_pmu_enable_event(cpuc, hwc, idx);
-}
-
 static atomic_t active_events = ATOMIC_INIT(0);
 static DEFINE_MUTEX(pmc_grab_mutex);
 
@@ -984,7 +1004,7 @@ static int collect_events(struct perf_event *group, int max_count,
 	return n;
 }
 
-static int sparc_pmu_enable(struct perf_event *event)
+static int sparc_pmu_add(struct perf_event *event, int ef_flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int n0, ret = -EAGAIN;
@@ -1001,6 +1021,10 @@ static int sparc_pmu_enable(struct perf_event *event)
 	cpuc->events[n0] = event->hw.event_base;
 	cpuc->current_idx[n0] = PIC_NO_INDEX;
 
+	event->hw.state = PERF_HES_UPTODATE;
+	if (!(ef_flags & PERF_EF_START))
+		event->hw.state |= PERF_HES_STOPPED;
+
 	/*
 	 * If group events scheduling transaction was started,
 	 * skip the schedulability test here, it will be peformed
@@ -1156,13 +1180,14 @@ static int sparc_pmu_commit_txn(struct pmu *pmu)
 }
 
 static struct pmu pmu = {
-	.pmu_enable	= sparc_pmu_pmu_enable,
-	.pmu_disable	= sparc_pmu_pmu_disable,
+	.pmu_enable	= sparc_pmu_enable,
+	.pmu_disable	= sparc_pmu_disable,
 	.event_init	= sparc_pmu_event_init,
-	.enable		= sparc_pmu_enable,
-	.disable	= sparc_pmu_disable,
+	.add		= sparc_pmu_add,
+	.del		= sparc_pmu_del,
+	.start		= sparc_pmu_start,
+	.stop		= sparc_pmu_stop,
 	.read		= sparc_pmu_read,
-	.unthrottle	= sparc_pmu_unthrottle,
 	.start_txn	= sparc_pmu_start_txn,
 	.cancel_txn	= sparc_pmu_cancel_txn,
 	.commit_txn	= sparc_pmu_commit_txn,
@@ -1243,7 +1268,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 			continue;
 
 		if (perf_event_overflow(event, 1, &data, regs))
-			sparc_pmu_disable_event(cpuc, hwc, idx);
+			sparc_pmu_stop(event, 0);
 	}
 
 	return NOTIFY_STOP;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 79705ac4501..dd6fec71067 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -583,7 +583,7 @@ static void x86_pmu_disable_all(void)
 	}
 }
 
-static void x86_pmu_pmu_disable(struct pmu *pmu)
+static void x86_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -800,10 +800,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
 		hwc->last_tag == cpuc->tags[i];
 }
 
-static int x86_pmu_start(struct perf_event *event);
-static void x86_pmu_stop(struct perf_event *event);
+static void x86_pmu_start(struct perf_event *event, int flags);
+static void x86_pmu_stop(struct perf_event *event, int flags);
 
-static void x86_pmu_pmu_enable(struct pmu *pmu)
+static void x86_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct perf_event *event;
@@ -839,7 +839,14 @@ static void x86_pmu_pmu_enable(struct pmu *pmu)
 			    match_prev_assignment(hwc, cpuc, i))
 				continue;
 
-			x86_pmu_stop(event);
+			/*
+			 * Ensure we don't accidentally enable a stopped
+			 * counter simply because we rescheduled.
+			 */
+			if (hwc->state & PERF_HES_STOPPED)
+				hwc->state |= PERF_HES_ARCH;
+
+			x86_pmu_stop(event, PERF_EF_UPDATE);
 		}
 
 		for (i = 0; i < cpuc->n_events; i++) {
@@ -851,7 +858,10 @@ static void x86_pmu_pmu_enable(struct pmu *pmu)
 			else if (i < n_running)
 				continue;
 
-			x86_pmu_start(event);
+			if (hwc->state & PERF_HES_ARCH)
+				continue;
+
+			x86_pmu_start(event, PERF_EF_RELOAD);
 		}
 		cpuc->n_added = 0;
 		perf_events_lapic_init();
@@ -952,15 +962,12 @@ static void x86_pmu_enable_event(struct perf_event *event)
 }
 
 /*
- * activate a single event
+ * Add a single event to the PMU.
  *
  * The event is added to the group of enabled events
  * but only if it can be scehduled with existing events.
- *
- * Called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
  */
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_add(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc;
@@ -975,10 +982,14 @@ static int x86_pmu_enable(struct perf_event *event)
 	if (ret < 0)
 		goto out;
 
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_ARCH;
+
 	/*
 	 * If group events scheduling transaction was started,
 	 * skip the schedulability test here, it will be peformed
-	 * at commit time(->commit_txn) as a whole
+	 * at commit time (->commit_txn) as a whole
 	 */
 	if (cpuc->group_flag & PERF_EVENT_TXN)
 		goto done_collect;
@@ -1003,27 +1014,28 @@ out:
 	return ret;
 }
 
-static int x86_pmu_start(struct perf_event *event)
+static void x86_pmu_start(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx = event->hw.idx;
 
-	if (idx == -1)
-		return -EAGAIN;
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	if (WARN_ON_ONCE(idx == -1))
+		return;
+
+	if (flags & PERF_EF_RELOAD) {
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+		x86_perf_event_set_period(event);
+	}
+
+	event->hw.state = 0;
 
-	x86_perf_event_set_period(event);
 	cpuc->events[idx] = event;
 	__set_bit(idx, cpuc->active_mask);
 	x86_pmu.enable(event);
 	perf_event_update_userpage(event);
-
-	return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_event *event)
-{
-	int ret = x86_pmu_start(event);
-	WARN_ON_ONCE(ret);
 }
 
 void perf_event_print_debug(void)
@@ -1080,27 +1092,29 @@ void perf_event_print_debug(void)
 	local_irq_restore(flags);
 }
 
-static void x86_pmu_stop(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-
-	if (!__test_and_clear_bit(idx, cpuc->active_mask))
-		return;
-
-	x86_pmu.disable(event);
 
-	/*
-	 * Drain the remaining delta count out of a event
-	 * that we are disabling:
-	 */
-	x86_perf_event_update(event);
+	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+		x86_pmu.disable(event);
+		cpuc->events[hwc->idx] = NULL;
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+	}
 
-	cpuc->events[idx] = NULL;
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of a event
+		 * that we are disabling:
+		 */
+		x86_perf_event_update(event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
 }
 
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_del(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int i;
@@ -1113,7 +1127,7 @@ static void x86_pmu_disable(struct perf_event *event)
 	if (cpuc->group_flag & PERF_EVENT_TXN)
 		return;
 
-	x86_pmu_stop(event);
+	x86_pmu_stop(event, PERF_EF_UPDATE);
 
 	for (i = 0; i < cpuc->n_events; i++) {
 		if (event == cpuc->event_list[i]) {
@@ -1165,7 +1179,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 			continue;
 
 		if (perf_event_overflow(event, 1, &data, regs))
-			x86_pmu_stop(event);
+			x86_pmu_stop(event, 0);
 	}
 
 	if (handled)
@@ -1605,15 +1619,17 @@ int x86_pmu_event_init(struct perf_event *event)
 }
 
 static struct pmu pmu = {
-	.pmu_enable	= x86_pmu_pmu_enable,
-	.pmu_disable	= x86_pmu_pmu_disable,
+	.pmu_enable	= x86_pmu_enable,
+	.pmu_disable	= x86_pmu_disable,
+
 	.event_init	= x86_pmu_event_init,
-	.enable		= x86_pmu_enable,
-	.disable	= x86_pmu_disable,
+
+	.add		= x86_pmu_add,
+	.del		= x86_pmu_del,
 	.start		= x86_pmu_start,
 	.stop		= x86_pmu_stop,
 	.read		= x86_pmu_read,
-	.unthrottle	= x86_pmu_unthrottle,
+
 	.start_txn	= x86_pmu_start_txn,
 	.cancel_txn	= x86_pmu_cancel_txn,
 	.commit_txn	= x86_pmu_commit_txn,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ee05c90012d..82395f2378e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -763,7 +763,7 @@ again:
 		data.period = event->hw.last_period;
 
 		if (perf_event_overflow(event, 1, &data, regs))
-			x86_pmu_stop(event);
+			x86_pmu_stop(event, 0);
 	}
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311c..9893a2f77b7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -491,7 +491,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
 	if (perf_event_overflow(event, 1, &data, &regs))
-		x86_pmu_stop(event);
+		x86_pmu_stop(event, 0);
 }
 
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 5f8ad7bec63..8beabb958f6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -252,8 +252,8 @@ DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
-extern int  perf_trace_enable(struct perf_event *event);
-extern void perf_trace_disable(struct perf_event *event);
+extern int  perf_trace_add(struct perf_event *event, int flags);
+extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8cafa15af60..402073c6166 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -538,6 +538,7 @@ struct hw_perf_event {
 		};
 #endif
 	};
+	int				state;
 	local64_t			prev_count;
 	u64				sample_period;
 	u64				last_period;
@@ -549,6 +550,13 @@ struct hw_perf_event {
 #endif
 };
 
+/*
+ * hw_perf_event::state flags
+ */
+#define PERF_HES_STOPPED	0x01 /* the counter is stopped */
+#define PERF_HES_UPTODATE	0x02 /* event->count up-to-date */
+#define PERF_HES_ARCH		0x04
+
 struct perf_event;
 
 /*
@@ -564,42 +572,62 @@ struct pmu {
 
 	int				*pmu_disable_count;
 
+	/*
+	 * Fully disable/enable this PMU, can be used to protect from the PMI
+	 * as well as for lazy/batch writing of the MSRs.
+	 */
 	void (*pmu_enable)		(struct pmu *pmu); /* optional */
 	void (*pmu_disable)		(struct pmu *pmu); /* optional */
 
 	/*
+	 * Try and initialize the event for this PMU.
 	 * Should return -ENOENT when the @event doesn't match this PMU.
 	 */
 	int (*event_init)		(struct perf_event *event);
 
-	int  (*enable)			(struct perf_event *event);
-	void (*disable)			(struct perf_event *event);
-	int  (*start)			(struct perf_event *event);
-	void (*stop)			(struct perf_event *event);
+#define PERF_EF_START	0x01		/* start the counter when adding    */
+#define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
+#define PERF_EF_UPDATE	0x04		/* update the counter when stopping */
+
+	/*
+	 * Adds/Removes a counter to/from the PMU, can be done inside
+	 * a transaction, see the ->*_txn() methods.
+	 */
+	int  (*add)			(struct perf_event *event, int flags);
+	void (*del)			(struct perf_event *event, int flags);
+
+	/*
+	 * Starts/Stops a counter present on the PMU. The PMI handler
+	 * should stop the counter when perf_event_overflow() returns
+	 * !0. ->start() will be used to continue.
+	 */
+	void (*start)			(struct perf_event *event, int flags);
+	void (*stop)			(struct perf_event *event, int flags);
+
+	/*
+	 * Updates the counter value of the event.
+	 */
 	void (*read)			(struct perf_event *event);
-	void (*unthrottle)		(struct perf_event *event);
 
 	/*
 	 * Group events scheduling is treated as a transaction, add
 	 * group events as a whole and perform one schedulability test.
 	 * If the test fails, roll back the whole group
-	 */
-
-	/*
-	 * Start the transaction, after this ->enable() doesn't need to
+	 *
+	 * Start the transaction, after this ->add() doesn't need to
 	 * do schedulability tests.
 	 */
 	void (*start_txn)	(struct pmu *pmu); /* optional */
 	/*
-	 * If ->start_txn() disabled the ->enable() schedulability test
+	 * If ->start_txn() disabled the ->add() schedulability test
 	 * then ->commit_txn() is required to perform one. On success
 	 * the transaction is closed. On error the transaction is kept
 	 * open until ->cancel_txn() is called.
 	 */
 	int  (*commit_txn)	(struct pmu *pmu); /* optional */
 	/*
-	 * Will cancel the transaction, assumes ->disable() is called
-	 * for each successfull ->enable() during the transaction.
+	 * Will cancel the transaction, assumes ->del() is called
+	 * for each successfull ->add() during the transaction.
 	 */
 	void (*cancel_txn)	(struct pmu *pmu); /* optional */
 };
@@ -680,7 +708,7 @@ struct perf_event {
 	int				nr_siblings;
 	int				group_flags;
 	struct perf_event		*group_leader;
-	struct pmu		*pmu;
+	struct pmu			*pmu;
 
 	enum perf_event_active_state	state;
 	unsigned int			attach_state;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index e9c5cfa1fd2..6f150095caf 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -586,10 +586,35 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
 	return 0;
 }
 
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+	if (!(flags & PERF_EF_START))
+		bp->hw.state = PERF_HES_STOPPED;
+
+	return arch_install_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+	arch_uninstall_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+	bp->hw.state = 0;
+}
+
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+	bp->hw.state = PERF_HES_STOPPED;
+}
+
 static struct pmu perf_breakpoint = {
 	.event_init	= hw_breakpoint_event_init,
-	.enable		= arch_install_hw_breakpoint,
-	.disable	= arch_uninstall_hw_breakpoint,
+	.add		= hw_breakpoint_add,
+	.del		= hw_breakpoint_del,
+	.start		= hw_breakpoint_start,
+	.stop		= hw_breakpoint_stop,
 	.read		= hw_breakpoint_pmu_read,
 };
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 1a6cdbf0d09..3bace4fd035 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -424,7 +424,7 @@ event_sched_out(struct perf_event *event,
 		event->state = PERF_EVENT_STATE_OFF;
 	}
 	event->tstamp_stopped = ctx->time;
-	event->pmu->disable(event);
+	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
 	if (!is_software_event(event))
@@ -649,7 +649,7 @@ event_sched_in(struct perf_event *event,
 	 */
 	smp_wmb();
 
-	if (event->pmu->enable(event)) {
+	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
 		return -EAGAIN;
@@ -1482,22 +1482,6 @@ do {					\
 	return div64_u64(dividend, divisor);
 }
 
-static void perf_event_stop(struct perf_event *event)
-{
-	if (!event->pmu->stop)
-		return event->pmu->disable(event);
-
-	return event->pmu->stop(event);
-}
-
-static int perf_event_start(struct perf_event *event)
-{
-	if (!event->pmu->start)
-		return event->pmu->enable(event);
-
-	return event->pmu->start(event);
-}
-
 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -1517,9 +1501,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 	hwc->sample_period = sample_period;
 
 	if (local64_read(&hwc->period_left) > 8*sample_period) {
-		perf_event_stop(event);
+		event->pmu->stop(event, PERF_EF_UPDATE);
 		local64_set(&hwc->period_left, 0);
-		perf_event_start(event);
+		event->pmu->start(event, PERF_EF_RELOAD);
 	}
 }
 
@@ -1548,7 +1532,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		 */
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
-			event->pmu->unthrottle(event);
+			event->pmu->start(event, 0);
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
@@ -2506,6 +2490,9 @@ int perf_event_task_disable(void)
 
 static int perf_event_index(struct perf_event *event)
 {
+	if (event->hw.state & PERF_HES_STOPPED)
+		return 0;
+
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return 0;
 
@@ -4120,8 +4107,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	struct hw_perf_event *hwc = &event->hw;
 	int ret = 0;
 
-	throttle = (throttle && event->pmu->unthrottle != NULL);
-
 	if (!throttle) {
 		hwc->interrupts++;
 	} else {
@@ -4246,7 +4231,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 	}
 }
 
-static void perf_swevent_add(struct perf_event *event, u64 nr,
+static void perf_swevent_event(struct perf_event *event, u64 nr,
 			       int nmi, struct perf_sample_data *data,
 			       struct pt_regs *regs)
 {
@@ -4272,6 +4257,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 static int perf_exclude_event(struct perf_event *event,
 			      struct pt_regs *regs)
 {
+	if (event->hw.state & PERF_HES_STOPPED)
+		return 0;
+
 	if (regs) {
 		if (event->attr.exclude_user && user_mode(regs))
 			return 1;
@@ -4371,7 +4359,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
-			perf_swevent_add(event, nr, nmi, data, regs);
+			perf_swevent_event(event, nr, nmi, data, regs);
 	}
 end:
 	rcu_read_unlock();
@@ -4415,7 +4403,7 @@ static void perf_swevent_read(struct perf_event *event)
 {
 }
 
-static int perf_swevent_enable(struct perf_event *event)
+static int perf_swevent_add(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_cpu_context *cpuctx;
@@ -4428,6 +4416,8 @@ static int perf_swevent_enable(struct perf_event *event)
 		perf_swevent_set_period(event);
 	}
 
+	hwc->state = !(flags & PERF_EF_START);
+
 	head = find_swevent_head(cpuctx, event);
 	if (WARN_ON_ONCE(!head))
 		return -EINVAL;
@@ -4437,18 +4427,19 @@ static int perf_swevent_enable(struct perf_event *event)
 	return 0;
 }
 
-static void perf_swevent_disable(struct perf_event *event)
+static void perf_swevent_del(struct perf_event *event, int flags)
 {
 	hlist_del_rcu(&event->hlist_entry);
 }
 
-static void perf_swevent_void(struct perf_event *event)
+static void perf_swevent_start(struct perf_event *event, int flags)
 {
+	event->hw.state = 0;
 }
 
-static int perf_swevent_int(struct perf_event *event)
+static void perf_swevent_stop(struct perf_event *event, int flags)
 {
-	return 0;
+	event->hw.state = PERF_HES_STOPPED;
 }
 
 /* Deref the hlist from the update side */
@@ -4604,12 +4595,11 @@ static int perf_swevent_init(struct perf_event *event)
 
 static struct pmu perf_swevent = {
 	.event_init	= perf_swevent_init,
-	.enable		= perf_swevent_enable,
-	.disable	= perf_swevent_disable,
-	.start		= perf_swevent_int,
-	.stop		= perf_swevent_void,
+	.add		= perf_swevent_add,
+	.del		= perf_swevent_del,
+	.start		= perf_swevent_start,
+	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
-	.unthrottle	= perf_swevent_void, /* hwc->interrupts already reset */
 };
 
 #ifdef CONFIG_EVENT_TRACING
@@ -4657,7 +4647,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 
 	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
-			perf_swevent_add(event, count, 1, &data, regs);
+			perf_swevent_event(event, count, 1, &data, regs);
 	}
 
 	perf_swevent_put_recursion_context(rctx);
@@ -4696,12 +4686,11 @@ static int perf_tp_event_init(struct perf_event *event)
 
 static struct pmu perf_tracepoint = {
 	.event_init	= perf_tp_event_init,
-	.enable		= perf_trace_enable,
-	.disable	= perf_trace_disable,
-	.start		= perf_swevent_int,
-	.stop		= perf_swevent_void,
+	.add		= perf_trace_add,
+	.del		= perf_trace_del,
+	.start		= perf_swevent_start,
+	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
-	.unthrottle	= perf_swevent_void,
 };
 
 static inline void perf_tp_register(void)
@@ -4757,8 +4746,8 @@ void perf_bp_event(struct perf_event *bp, void *data)
 
 	perf_sample_data_init(&sample, bp->attr.bp_addr);
 
-	if (!perf_exclude_event(bp, regs))
-		perf_swevent_add(bp, 1, 1, &sample, regs);
+	if (!bp->hw.state && !perf_exclude_event(bp, regs))
+		perf_swevent_event(bp, 1, 1, &sample, regs);
 }
 #endif
 
@@ -4834,32 +4823,39 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 
 static void cpu_clock_event_update(struct perf_event *event)
 {
-	int cpu = raw_smp_processor_id();
 	s64 prev;
 	u64 now;
 
-	now = cpu_clock(cpu);
+	now = local_clock();
 	prev = local64_xchg(&event->hw.prev_count, now);
 	local64_add(now - prev, &event->count);
 }
 
-static int cpu_clock_event_enable(struct perf_event *event)
+static void cpu_clock_event_start(struct perf_event *event, int flags)
 {
-	struct hw_perf_event *hwc = &event->hw;
-	int cpu = raw_smp_processor_id();
-
-	local64_set(&hwc->prev_count, cpu_clock(cpu));
+	local64_set(&event->hw.prev_count, local_clock());
 	perf_swevent_start_hrtimer(event);
-
-	return 0;
 }
 
-static void cpu_clock_event_disable(struct perf_event *event)
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
 {
 	perf_swevent_cancel_hrtimer(event);
 	cpu_clock_event_update(event);
 }
 
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		cpu_clock_event_start(event, flags);
+
+	return 0;
+}
+
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+	cpu_clock_event_stop(event, flags);
+}
+
 static void cpu_clock_event_read(struct perf_event *event)
 {
 	cpu_clock_event_update(event);
@@ -4878,8 +4874,10 @@ static int cpu_clock_event_init(struct perf_event *event)
 
 static struct pmu perf_cpu_clock = {
 	.event_init	= cpu_clock_event_init,
-	.enable		= cpu_clock_event_enable,
-	.disable	= cpu_clock_event_disable,
+	.add		= cpu_clock_event_add,
+	.del		= cpu_clock_event_del,
+	.start		= cpu_clock_event_start,
+	.stop		= cpu_clock_event_stop,
 	.read		= cpu_clock_event_read,
 };
 
@@ -4897,25 +4895,29 @@ static void task_clock_event_update(struct perf_event *event, u64 now)
 	local64_add(delta, &event->count);
 }
 
-static int task_clock_event_enable(struct perf_event *event)
+static void task_clock_event_start(struct perf_event *event, int flags)
 {
-	struct hw_perf_event *hwc = &event->hw;
-	u64 now;
-
-	now = event->ctx->time;
-
-	local64_set(&hwc->prev_count, now);
-
+	local64_set(&event->hw.prev_count, event->ctx->time);
 	perf_swevent_start_hrtimer(event);
-
-	return 0;
 }
 
-static void task_clock_event_disable(struct perf_event *event)
+static void task_clock_event_stop(struct perf_event *event, int flags)
 {
 	perf_swevent_cancel_hrtimer(event);
 	task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		task_clock_event_start(event, flags);
 
+	return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+	task_clock_event_stop(event, PERF_EF_UPDATE);
 }
 
 static void task_clock_event_read(struct perf_event *event)
@@ -4947,8 +4949,10 @@ static int task_clock_event_init(struct perf_event *event)
 
 static struct pmu perf_task_clock = {
 	.event_init	= task_clock_event_init,
-	.enable		= task_clock_event_enable,
-	.disable	= task_clock_event_disable,
+	.add		= task_clock_event_add,
+	.del		= task_clock_event_del,
+	.start		= task_clock_event_start,
+	.stop		= task_clock_event_stop,
 	.read		= task_clock_event_read,
 };
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index f3bbcd1c90c..39c059ca670 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -101,7 +101,7 @@ int perf_trace_init(struct perf_event *p_event)
 	return ret;
 }
 
-int perf_trace_enable(struct perf_event *p_event)
+int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct ftrace_event_call *tp_event = p_event->tp_event;
 	struct hlist_head __percpu *pcpu_list;
@@ -111,13 +111,16 @@ int perf_trace_enable(struct perf_event *p_event)
 	if (WARN_ON_ONCE(!pcpu_list))
 		return -EINVAL;
 
+	if (!(flags & PERF_EF_START))
+		p_event->hw.state = PERF_HES_STOPPED;
+
 	list = this_cpu_ptr(pcpu_list);
 	hlist_add_head_rcu(&p_event->hlist_entry, list);
 
 	return 0;
 }
 
-void perf_trace_disable(struct perf_event *p_event)
+void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	hlist_del_rcu(&p_event->hlist_entry);
 }
-- 
cgit v1.2.3-70-g09d2


From 15ac9a395a753cb28c674e7ea80386ffdff21785 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Sep 2010 15:51:45 +0200
Subject: perf: Remove the sysfs bits

Neither the overcommit nor the reservation sysfs parameter were
actually working, remove them as they'll only get in the way.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/perf_event.c   |   3 +-
 arch/arm/kernel/perf_event.c     |   9 +--
 arch/sparc/kernel/perf_event.c   |   9 +--
 arch/x86/kernel/cpu/perf_event.c |   1 -
 include/linux/perf_event.h       |   6 --
 kernel/perf_event.c              | 124 ---------------------------------------
 6 files changed, 5 insertions(+), 147 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 380ef02d557..9bb8c024080 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -808,7 +808,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
 	wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
 
 	/* la_ptr is the counter that overflowed. */
-	if (unlikely(la_ptr >= perf_max_events)) {
+	if (unlikely(la_ptr >= alpha_pmu->num_pmcs)) {
 		/* This should never occur! */
 		irq_err_count++;
 		pr_warning("PMI: silly index %ld\n", la_ptr);
@@ -879,7 +879,6 @@ void __init init_hw_perf_events(void)
 
 	/* And set up PMU specification */
 	alpha_pmu = &ev67_pmu;
-	perf_max_events = alpha_pmu->num_pmcs;
 
 	perf_pmu_register(&pmu);
 }
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 448cfa6b3ef..45d6a35217c 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -534,7 +534,7 @@ static int armpmu_event_init(struct perf_event *event)
 	event->destroy = hw_perf_event_destroy;
 
 	if (!atomic_inc_not_zero(&active_events)) {
-		if (atomic_read(&active_events) > perf_max_events) {
+		if (atomic_read(&active_events) > armpmu.num_events) {
 			atomic_dec(&active_events);
 			return -ENOSPC;
 		}
@@ -2974,14 +2974,12 @@ init_hw_perf_events(void)
 			armpmu = &armv6pmu;
 			memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
 					sizeof(armv6_perf_cache_map));
-			perf_max_events	= armv6pmu.num_events;
 			break;
 		case 0xB020:	/* ARM11mpcore */
 			armpmu = &armv6mpcore_pmu;
 			memcpy(armpmu_perf_cache_map,
 			       armv6mpcore_perf_cache_map,
 			       sizeof(armv6mpcore_perf_cache_map));
-			perf_max_events = armv6mpcore_pmu.num_events;
 			break;
 		case 0xC080:	/* Cortex-A8 */
 			armv7pmu.id = ARM_PERF_PMU_ID_CA8;
@@ -2993,7 +2991,6 @@ init_hw_perf_events(void)
 			/* Reset PMNC and read the nb of CNTx counters
 			    supported */
 			armv7pmu.num_events = armv7_reset_read_pmnc();
-			perf_max_events = armv7pmu.num_events;
 			break;
 		case 0xC090:	/* Cortex-A9 */
 			armv7pmu.id = ARM_PERF_PMU_ID_CA9;
@@ -3005,7 +3002,6 @@ init_hw_perf_events(void)
 			/* Reset PMNC and read the nb of CNTx counters
 			    supported */
 			armv7pmu.num_events = armv7_reset_read_pmnc();
-			perf_max_events = armv7pmu.num_events;
 			break;
 		}
 	/* Intel CPUs [xscale]. */
@@ -3016,13 +3012,11 @@ init_hw_perf_events(void)
 			armpmu = &xscale1pmu;
 			memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
 					sizeof(xscale_perf_cache_map));
-			perf_max_events	= xscale1pmu.num_events;
 			break;
 		case 2:
 			armpmu = &xscale2pmu;
 			memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
 					sizeof(xscale_perf_cache_map));
-			perf_max_events	= xscale2pmu.num_events;
 			break;
 		}
 	}
@@ -3032,7 +3026,6 @@ init_hw_perf_events(void)
 				arm_pmu_names[armpmu->id], armpmu->num_events);
 	} else {
 		pr_info("no hardware support available\n");
-		perf_max_events = -1;
 	}
 
 	perf_pmu_register(&pmu);
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 516be2314b5..f9a70675936 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -897,7 +897,7 @@ static int sparc_check_constraints(struct perf_event **evts,
 	if (!n_ev)
 		return 0;
 
-	if (n_ev > perf_max_events)
+	if (n_ev > MAX_HWEVENTS)
 		return -1;
 
 	msk0 = perf_event_get_msk(events[0]);
@@ -1014,7 +1014,7 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags)
 	perf_pmu_disable(event->pmu);
 
 	n0 = cpuc->n_events;
-	if (n0 >= perf_max_events)
+	if (n0 >= MAX_HWEVENTS)
 		goto out;
 
 	cpuc->event[n0] = event;
@@ -1097,7 +1097,7 @@ static int sparc_pmu_event_init(struct perf_event *event)
 	n = 0;
 	if (event->group_leader != event) {
 		n = collect_events(event->group_leader,
-				   perf_max_events - 1,
+				   MAX_HWEVENTS - 1,
 				   evts, events, current_idx_dmy);
 		if (n < 0)
 			return -EINVAL;
@@ -1309,9 +1309,6 @@ void __init init_hw_perf_events(void)
 
 	pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
 
-	/* All sparc64 PMUs currently have 2 events.  */
-	perf_max_events = 2;
-
 	perf_pmu_register(&pmu);
 	register_die_notifier(&perf_event_nmi_notifier);
 }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index dd6fec71067..0fb17050360 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1396,7 +1396,6 @@ void __init init_hw_perf_events(void)
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 	}
 	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-	perf_max_events = x86_pmu.num_counters;
 
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 402073c6166..b22176d3ebd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -860,7 +860,6 @@ struct perf_cpu_context {
 	struct perf_event_context	ctx;
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
-	int				max_pertask;
 	int				exclusive;
 	struct swevent_hlist		*swevent_hlist;
 	struct mutex			hlist_mutex;
@@ -883,11 +882,6 @@ struct perf_output_handle {
 
 #ifdef CONFIG_PERF_EVENTS
 
-/*
- * Set by architecture code:
- */
-extern int perf_max_events;
-
 extern int perf_pmu_register(struct pmu *pmu);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3bace4fd035..8462e69409a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -39,10 +39,6 @@
  */
 static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
 
-int perf_max_events __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-
 static atomic_t nr_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -66,11 +62,6 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
 
 static atomic64_t perf_event_id;
 
-/*
- * Lock for (sysadmin-configurable) event reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
-
 void __weak perf_event_print_debug(void)	{ }
 
 void perf_pmu_disable(struct pmu *pmu)
@@ -480,16 +471,6 @@ static void __perf_event_remove_from_context(void *info)
 
 	list_del_event(event, ctx);
 
-	if (!ctx->task) {
-		/*
-		 * Allow more per task events with respect to the
-		 * reservation:
-		 */
-		cpuctx->max_pertask =
-			min(perf_max_events - ctx->nr_events,
-			    perf_max_events - perf_reserved_percpu);
-	}
-
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -823,9 +804,6 @@ static void __perf_install_in_context(void *info)
 		}
 	}
 
-	if (!err && !ctx->task && cpuctx->max_pertask)
-		cpuctx->max_pertask--;
-
 unlock:
 	raw_spin_unlock(&ctx->lock);
 }
@@ -5930,10 +5908,6 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
 
-	spin_lock(&perf_resource_lock);
-	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
-	spin_unlock(&perf_resource_lock);
-
 	mutex_lock(&cpuctx->hlist_mutex);
 	if (cpuctx->hlist_refcount > 0) {
 		struct swevent_hlist *hlist;
@@ -6008,101 +5982,3 @@ void __init perf_event_init(void)
 	perf_tp_register();
 	perf_cpu_notifier(perf_cpu_notify);
 }
-
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
-					struct sysdev_class_attribute *attr,
-					char *buf)
-{
-	return sprintf(buf, "%d\n", perf_reserved_percpu);
-}
-
-static ssize_t
-perf_set_reserve_percpu(struct sysdev_class *class,
-			struct sysdev_class_attribute *attr,
-			const char *buf,
-			size_t count)
-{
-	struct perf_cpu_context *cpuctx;
-	unsigned long val;
-	int err, cpu, mpt;
-
-	err = strict_strtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val > perf_max_events)
-		return -EINVAL;
-
-	spin_lock(&perf_resource_lock);
-	perf_reserved_percpu = val;
-	for_each_online_cpu(cpu) {
-		cpuctx = &per_cpu(perf_cpu_context, cpu);
-		raw_spin_lock_irq(&cpuctx->ctx.lock);
-		mpt = min(perf_max_events - cpuctx->ctx.nr_events,
-			  perf_max_events - perf_reserved_percpu);
-		cpuctx->max_pertask = mpt;
-		raw_spin_unlock_irq(&cpuctx->ctx.lock);
-	}
-	spin_unlock(&perf_resource_lock);
-
-	return count;
-}
-
-static ssize_t perf_show_overcommit(struct sysdev_class *class,
-				    struct sysdev_class_attribute *attr,
-				    char *buf)
-{
-	return sprintf(buf, "%d\n", perf_overcommit);
-}
-
-static ssize_t
-perf_set_overcommit(struct sysdev_class *class,
-		    struct sysdev_class_attribute *attr,
-		    const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	err = strict_strtoul(buf, 10, &val);
-	if (err)
-		return err;
-	if (val > 1)
-		return -EINVAL;
-
-	spin_lock(&perf_resource_lock);
-	perf_overcommit = val;
-	spin_unlock(&perf_resource_lock);
-
-	return count;
-}
-
-static SYSDEV_CLASS_ATTR(
-				reserve_percpu,
-				0644,
-				perf_show_reserve_percpu,
-				perf_set_reserve_percpu
-			);
-
-static SYSDEV_CLASS_ATTR(
-				overcommit,
-				0644,
-				perf_show_overcommit,
-				perf_set_overcommit
-			);
-
-static struct attribute *perfclass_attrs[] = {
-	&attr_reserve_percpu.attr,
-	&attr_overcommit.attr,
-	NULL
-};
-
-static struct attribute_group perfclass_attr_group = {
-	.attrs			= perfclass_attrs,
-	.name			= "perf_events",
-};
-
-static int __init perf_event_sysfs_init(void)
-{
-	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
-				  &perfclass_attr_group);
-}
-device_initcall(perf_event_sysfs_init);
-- 
cgit v1.2.3-70-g09d2


From c3f00c70276d8ae82578c8b773e2db657f69a478 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 18 Aug 2010 14:37:15 +0200
Subject: perf: Separate find_get_context() from event initialization

Separate find_get_context() from the event allocation and
initialization so that we may make find_get_context() depend
on the event pmu in a later patch.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 73 +++++++++++++++++++++++++----------------------------
 1 file changed, 35 insertions(+), 38 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8462e69409a..a3c86a8335c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -827,6 +827,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
+	event->ctx = ctx;
+
 	if (!task) {
 		/*
 		 * Per cpu events are installed via an smp call and
@@ -5038,20 +5040,17 @@ struct pmu *perf_init_event(struct perf_event *event)
  * Allocate and initialize a event structure
  */
 static struct perf_event *
-perf_event_alloc(struct perf_event_attr *attr,
-		   int cpu,
-		   struct perf_event_context *ctx,
+perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		   struct perf_event *group_leader,
 		   struct perf_event *parent_event,
-		   perf_overflow_handler_t overflow_handler,
-		   gfp_t gfpflags)
+		   perf_overflow_handler_t overflow_handler)
 {
 	struct pmu *pmu;
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
 	long err;
 
-	event = kzalloc(sizeof(*event), gfpflags);
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
 	if (!event)
 		return ERR_PTR(-ENOMEM);
 
@@ -5076,7 +5075,6 @@ perf_event_alloc(struct perf_event_attr *attr,
 	event->attr		= *attr;
 	event->group_leader	= group_leader;
 	event->pmu		= NULL;
-	event->ctx		= ctx;
 	event->oncpu		= -1;
 
 	event->parent		= parent_event;
@@ -5321,20 +5319,26 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (event_fd < 0)
 		return event_fd;
 
+	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
+	if (IS_ERR(event)) {
+		err = PTR_ERR(event);
+		goto err_fd;
+	}
+
 	/*
 	 * Get the target context (task or percpu):
 	 */
 	ctx = find_get_context(pid, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
-		goto err_fd;
+		goto err_alloc;
 	}
 
 	if (group_fd != -1) {
 		group_leader = perf_fget_light(group_fd, &fput_needed);
 		if (IS_ERR(group_leader)) {
 			err = PTR_ERR(group_leader);
-			goto err_put_context;
+			goto err_context;
 		}
 		group_file = group_leader->filp;
 		if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5354,37 +5358,30 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * becoming part of another group-sibling):
 		 */
 		if (group_leader->group_leader != group_leader)
-			goto err_put_context;
+			goto err_context;
 		/*
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:
 		 */
 		if (group_leader->ctx != ctx)
-			goto err_put_context;
+			goto err_context;
 		/*
 		 * Only a group leader can be exclusive or pinned
 		 */
 		if (attr.exclusive || attr.pinned)
-			goto err_put_context;
-	}
-
-	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-				     NULL, NULL, GFP_KERNEL);
-	if (IS_ERR(event)) {
-		err = PTR_ERR(event);
-		goto err_put_context;
+			goto err_context;
 	}
 
 	if (output_event) {
 		err = perf_event_set_output(event, output_event);
 		if (err)
-			goto err_free_put_context;
+			goto err_context;
 	}
 
 	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
 	if (IS_ERR(event_file)) {
 		err = PTR_ERR(event_file);
-		goto err_free_put_context;
+		goto err_context;
 	}
 
 	event->filp = event_file;
@@ -5410,11 +5407,11 @@ SYSCALL_DEFINE5(perf_event_open,
 	fd_install(event_fd, event_file);
 	return event_fd;
 
-err_free_put_context:
-	free_event(event);
-err_put_context:
+err_context:
 	fput_light(group_file, fput_needed);
 	put_ctx(ctx);
+err_alloc:
+	free_event(event);
 err_fd:
 	put_unused_fd(event_fd);
 	return err;
@@ -5432,25 +5429,24 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 				 pid_t pid,
 				 perf_overflow_handler_t overflow_handler)
 {
-	struct perf_event *event;
 	struct perf_event_context *ctx;
+	struct perf_event *event;
 	int err;
 
 	/*
 	 * Get the target context (task or percpu):
 	 */
 
+	event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
+	if (IS_ERR(event)) {
+		err = PTR_ERR(event);
+		goto err;
+	}
+
 	ctx = find_get_context(pid, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
-		goto err_exit;
-	}
-
-	event = perf_event_alloc(attr, cpu, ctx, NULL,
-				 NULL, overflow_handler, GFP_KERNEL);
-	if (IS_ERR(event)) {
-		err = PTR_ERR(event);
-		goto err_put_context;
+		goto err_free;
 	}
 
 	event->filp = NULL;
@@ -5468,9 +5464,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
- err_put_context:
-	put_ctx(ctx);
- err_exit:
+err_free:
+	free_event(event);
+err:
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
@@ -5498,9 +5494,9 @@ inherit_event(struct perf_event *parent_event,
 		parent_event = parent_event->parent;
 
 	child_event = perf_event_alloc(&parent_event->attr,
-					   parent_event->cpu, child_ctx,
+					   parent_event->cpu,
 					   group_leader, parent_event,
-					   NULL, GFP_KERNEL);
+					   NULL);
 	if (IS_ERR(child_event))
 		return child_event;
 	get_ctx(child_ctx);
@@ -5525,6 +5521,7 @@ inherit_event(struct perf_event *parent_event,
 		local64_set(&hwc->period_left, sample_period);
 	}
 
+	child_event->ctx = child_ctx;
 	child_event->overflow_handler = parent_event->overflow_handler;
 
 	/*
-- 
cgit v1.2.3-70-g09d2


From b28ab83c595e767f2028276b7398d17f2253cec0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Sep 2010 14:48:15 +0200
Subject: perf: Remove the swevent hash-table from the cpu context

Separate the swevent hash-table from the cpu_context bits in
preparation for per pmu cpu contexts.

This keeps the swevent hash a global entity.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |   6 ---
 kernel/perf_event.c        | 104 +++++++++++++++++++++++++--------------------
 2 files changed, 58 insertions(+), 52 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b22176d3ebd..4ab4f0ca09a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -861,12 +861,6 @@ struct perf_cpu_context {
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
 	int				exclusive;
-	struct swevent_hlist		*swevent_hlist;
-	struct mutex			hlist_mutex;
-	int				hlist_refcount;
-
-	/* Recursion avoidance in each contexts */
-	int				recursion[PERF_NR_CONTEXTS];
 };
 
 struct perf_output_handle {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a3c86a8335c..2c47ed6c4f2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4154,6 +4154,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
  * Generic software event infrastructure
  */
 
+struct swevent_htable {
+	struct swevent_hlist		*swevent_hlist;
+	struct mutex			hlist_mutex;
+	int				hlist_refcount;
+
+	/* Recursion avoidance in each contexts */
+	int				recursion[PERF_NR_CONTEXTS];
+};
+
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
 /*
  * We directly increment event->count and keep a second value in
  * event->hw.period_left to count intervals. This period event
@@ -4286,11 +4297,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 
 /* For the read side: events when they trigger */
 static inline struct hlist_head *
-find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
 {
 	struct swevent_hlist *hlist;
 
-	hlist = rcu_dereference(ctx->swevent_hlist);
+	hlist = rcu_dereference(swhash->swevent_hlist);
 	if (!hlist)
 		return NULL;
 
@@ -4299,7 +4310,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
 
 /* For the event head insertion and removal in the hlist */
 static inline struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
 {
 	struct swevent_hlist *hlist;
 	u32 event_id = event->attr.config;
@@ -4310,7 +4321,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
 	 * and release. Which makes the protected version suitable here.
 	 * The context lock guarantees that.
 	 */
-	hlist = rcu_dereference_protected(ctx->swevent_hlist,
+	hlist = rcu_dereference_protected(swhash->swevent_hlist,
 					  lockdep_is_held(&event->ctx->lock));
 	if (!hlist)
 		return NULL;
@@ -4323,17 +4334,13 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 				    struct perf_sample_data *data,
 				    struct pt_regs *regs)
 {
-	struct perf_cpu_context *cpuctx;
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 	struct perf_event *event;
 	struct hlist_node *node;
 	struct hlist_head *head;
 
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-
 	rcu_read_lock();
-
-	head = find_swevent_head_rcu(cpuctx, type, event_id);
-
+	head = find_swevent_head_rcu(swhash, type, event_id);
 	if (!head)
 		goto end;
 
@@ -4347,17 +4354,17 @@ end:
 
 int perf_swevent_get_recursion_context(void)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 
-	return get_recursion_context(cpuctx->recursion);
+	return get_recursion_context(swhash->recursion);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
 void inline perf_swevent_put_recursion_context(int rctx)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 
-	put_recursion_context(cpuctx->recursion, rctx);
+	put_recursion_context(swhash->recursion, rctx);
 }
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4385,12 +4392,10 @@ static void perf_swevent_read(struct perf_event *event)
 
 static int perf_swevent_add(struct perf_event *event, int flags)
 {
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 	struct hw_perf_event *hwc = &event->hw;
-	struct perf_cpu_context *cpuctx;
 	struct hlist_head *head;
 
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-
 	if (hwc->sample_period) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
@@ -4398,7 +4403,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
 
 	hwc->state = !(flags & PERF_EF_START);
 
-	head = find_swevent_head(cpuctx, event);
+	head = find_swevent_head(swhash, event);
 	if (WARN_ON_ONCE(!head))
 		return -EINVAL;
 
@@ -4424,10 +4429,10 @@ static void perf_swevent_stop(struct perf_event *event, int flags)
 
 /* Deref the hlist from the update side */
 static inline struct swevent_hlist *
-swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+swevent_hlist_deref(struct swevent_htable *swhash)
 {
-	return rcu_dereference_protected(cpuctx->swevent_hlist,
-					 lockdep_is_held(&cpuctx->hlist_mutex));
+	return rcu_dereference_protected(swhash->swevent_hlist,
+					 lockdep_is_held(&swhash->hlist_mutex));
 }
 
 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4438,27 +4443,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 	kfree(hlist);
 }
 
-static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+static void swevent_hlist_release(struct swevent_htable *swhash)
 {
-	struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
+	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
 
 	if (!hlist)
 		return;
 
-	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+	rcu_assign_pointer(swhash->swevent_hlist, NULL);
 	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 }
 
 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
-	mutex_lock(&cpuctx->hlist_mutex);
+	mutex_lock(&swhash->hlist_mutex);
 
-	if (!--cpuctx->hlist_refcount)
-		swevent_hlist_release(cpuctx);
+	if (!--swhash->hlist_refcount)
+		swevent_hlist_release(swhash);
 
-	mutex_unlock(&cpuctx->hlist_mutex);
+	mutex_unlock(&swhash->hlist_mutex);
 }
 
 static void swevent_hlist_put(struct perf_event *event)
@@ -4476,12 +4481,12 @@ static void swevent_hlist_put(struct perf_event *event)
 
 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 	int err = 0;
 
-	mutex_lock(&cpuctx->hlist_mutex);
+	mutex_lock(&swhash->hlist_mutex);
 
-	if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
+	if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4489,11 +4494,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 			err = -ENOMEM;
 			goto exit;
 		}
-		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+		rcu_assign_pointer(swhash->swevent_hlist, hlist);
 	}
-	cpuctx->hlist_refcount++;
+	swhash->hlist_refcount++;
 exit:
-	mutex_unlock(&cpuctx->hlist_mutex);
+	mutex_unlock(&swhash->hlist_mutex);
 
 	return err;
 }
@@ -5889,12 +5894,15 @@ int perf_event_init_task(struct task_struct *child)
 
 static void __init perf_event_init_all_cpus(void)
 {
-	int cpu;
 	struct perf_cpu_context *cpuctx;
+	struct swevent_htable *swhash;
+	int cpu;
 
 	for_each_possible_cpu(cpu) {
+		swhash = &per_cpu(swevent_htable, cpu);
+		mutex_init(&swhash->hlist_mutex);
+
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
-		mutex_init(&cpuctx->hlist_mutex);
 		__perf_event_init_context(&cpuctx->ctx, NULL);
 	}
 }
@@ -5902,18 +5910,21 @@ static void __init perf_event_init_all_cpus(void)
 static void __cpuinit perf_event_init_cpu(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
+	struct swevent_htable *swhash;
 
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
 
-	mutex_lock(&cpuctx->hlist_mutex);
-	if (cpuctx->hlist_refcount > 0) {
+	swhash = &per_cpu(swevent_htable, cpu);
+
+	mutex_lock(&swhash->hlist_mutex);
+	if (swhash->hlist_refcount > 0) {
 		struct swevent_hlist *hlist;
 
-		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
-		WARN_ON_ONCE(!hlist);
-		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+		WARN_ON(!hlist);
+		rcu_assign_pointer(swhash->swevent_hlist, hlist);
 	}
-	mutex_unlock(&cpuctx->hlist_mutex);
+	mutex_unlock(&swhash->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5931,11 +5942,12 @@ static void __perf_event_exit_cpu(void *info)
 static void perf_event_exit_cpu(int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	mutex_lock(&cpuctx->hlist_mutex);
-	swevent_hlist_release(cpuctx);
-	mutex_unlock(&cpuctx->hlist_mutex);
+	mutex_lock(&swhash->hlist_mutex);
+	swevent_hlist_release(swhash);
+	mutex_unlock(&swhash->hlist_mutex);
 
 	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
-- 
cgit v1.2.3-70-g09d2


From b5ab4cd563e7ab49b27957704112a8ecade54e1f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Sep 2010 16:32:21 +0200
Subject: perf: Per cpu-context rotation timer

Give each cpu-context its own timer so that it is a self contained
entity, this eases the way for per-pmu-per-cpu contexts as well as
provides the basic infrastructure to allow different rotation
times per pmu.

Things to look at:
 - folding the tick and these TICK_NSEC timers
 - separate task context rotation

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  5 ++-
 kernel/perf_event.c        | 80 ++++++++++++++++++++++++++++++++++++----------
 kernel/sched.c             |  2 --
 3 files changed, 65 insertions(+), 22 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4ab4f0ca09a..fa04537df55 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -861,6 +861,8 @@ struct perf_cpu_context {
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
 	int				exclusive;
+	u64				timer_interval;
+	struct hrtimer			timer;
 };
 
 struct perf_output_handle {
@@ -881,7 +883,6 @@ extern void perf_pmu_unregister(struct pmu *pmu);
 
 extern void perf_event_task_sched_in(struct task_struct *task);
 extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
-extern void perf_event_task_tick(struct task_struct *task);
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
@@ -1067,8 +1068,6 @@ perf_event_task_sched_in(struct task_struct *task)			{ }
 static inline void
 perf_event_task_sched_out(struct task_struct *task,
 			    struct task_struct *next)			{ }
-static inline void
-perf_event_task_tick(struct task_struct *task)				{ }
 static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2c47ed6c4f2..d75e4c8727f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -78,6 +78,25 @@ void perf_pmu_enable(struct pmu *pmu)
 		pmu->pmu_enable(pmu);
 }
 
+static void perf_pmu_rotate_start(void)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	if (hrtimer_active(&cpuctx->timer))
+		return;
+
+	__hrtimer_start_range_ns(&cpuctx->timer,
+			ns_to_ktime(cpuctx->timer_interval), 0,
+			HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void perf_pmu_rotate_stop(void)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	hrtimer_cancel(&cpuctx->timer);
+}
+
 static void get_ctx(struct perf_event_context *ctx)
 {
 	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
@@ -281,6 +300,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	}
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
+	if (!ctx->nr_events)
+		perf_pmu_rotate_start();
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
@@ -1383,6 +1404,12 @@ void perf_event_task_sched_in(struct task_struct *task)
 	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 
 	cpuctx->task_ctx = ctx;
+
+	/*
+	 * Since these rotations are per-cpu, we need to ensure the
+	 * cpu-context we got scheduled on is actually rotating.
+	 */
+	perf_pmu_rotate_start();
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1487,7 +1514,7 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 	}
 }
 
-static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
@@ -1524,7 +1551,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		hwc->freq_count_stamp = now;
 
 		if (delta > 0)
-			perf_adjust_period(event, TICK_NSEC, delta);
+			perf_adjust_period(event, period, delta);
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1542,30 +1569,39 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	raw_spin_unlock(&ctx->lock);
 }
 
-void perf_event_task_tick(struct task_struct *curr)
+/*
+ * Cannot race with ->pmu_rotate_start() because this is ran from hardirq
+ * context, and ->pmu_rotate_start() is called with irqs disabled (both are
+ * cpu affine, so there are no SMP races).
+ */
+static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 {
+	enum hrtimer_restart restart = HRTIMER_NORESTART;
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
 	int rotate = 0;
 
-	if (!atomic_read(&nr_events))
-		return;
+	cpuctx = container_of(timer, struct perf_cpu_context, timer);
 
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-	if (cpuctx->ctx.nr_events &&
-	    cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-		rotate = 1;
+	if (cpuctx->ctx.nr_events) {
+		restart = HRTIMER_RESTART;
+		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+			rotate = 1;
+	}
 
-	ctx = curr->perf_event_ctxp;
-	if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
-		rotate = 1;
+	ctx = current->perf_event_ctxp;
+	if (ctx && ctx->nr_events) {
+		restart = HRTIMER_RESTART;
+		if (ctx->nr_events != ctx->nr_active)
+			rotate = 1;
+	}
 
-	perf_ctx_adjust_freq(&cpuctx->ctx);
+	perf_ctx_adjust_freq(&cpuctx->ctx, cpuctx->timer_interval);
 	if (ctx)
-		perf_ctx_adjust_freq(ctx);
+		perf_ctx_adjust_freq(ctx, cpuctx->timer_interval);
 
 	if (!rotate)
-		return;
+		goto done;
 
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
@@ -1577,7 +1613,12 @@ void perf_event_task_tick(struct task_struct *curr)
 
 	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
+		task_ctx_sched_in(current, EVENT_FLEXIBLE);
+
+done:
+	hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval));
+
+	return restart;
 }
 
 static int event_enable_on_exec(struct perf_event *event,
@@ -4786,7 +4827,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
 		}
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
+				HRTIMER_MODE_REL_PINNED, 0);
 	}
 }
 
@@ -5904,6 +5945,9 @@ static void __init perf_event_init_all_cpus(void)
 
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
 		__perf_event_init_context(&cpuctx->ctx, NULL);
+		cpuctx->timer_interval = TICK_NSEC;
+		hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		cpuctx->timer.function = perf_event_context_tick;
 	}
 }
 
@@ -5934,6 +5978,8 @@ static void __perf_event_exit_cpu(void *info)
 	struct perf_event_context *ctx = &cpuctx->ctx;
 	struct perf_event *event, *tmp;
 
+	perf_pmu_rotate_stop();
+
 	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
 		__perf_event_remove_from_context(event);
 	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
diff --git a/kernel/sched.c b/kernel/sched.c
index 09b574e7f4d..66a02ba83c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3578,8 +3578,6 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	raw_spin_unlock(&rq->lock);
 
-	perf_event_task_tick(curr);
-
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
-- 
cgit v1.2.3-70-g09d2


From 108b02cfce04ee90b0a07ee0b104baffd39f5934 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Sep 2010 14:32:03 +0200
Subject: perf: Per-pmu-per-cpu contexts

Allocate per-cpu contexts per pmu.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |   4 +-
 kernel/perf_event.c        | 178 ++++++++++++++++++++++++++++-----------------
 2 files changed, 113 insertions(+), 69 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fa04537df55..22155ef3b36 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -570,7 +570,8 @@ struct perf_event;
 struct pmu {
 	struct list_head		entry;
 
-	int				*pmu_disable_count;
+	int * __percpu			pmu_disable_count;
+	struct perf_cpu_context * __percpu pmu_cpu_context;
 
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
@@ -808,6 +809,7 @@ struct perf_event {
  * Used as a container for task events and CPU events as well:
  */
 struct perf_event_context {
+	struct pmu			*pmu;
 	/*
 	 * Protect the states of the events in the list,
 	 * nr_active, and the list:
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index d75e4c8727f..8ca6e690ffe 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -34,16 +34,15 @@
 
 #include <asm/irq_regs.h>
 
-/*
- * Each CPU has a list of per CPU events:
- */
-static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-
 static atomic_t nr_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
 /*
  * perf event paranoia level:
  *  -1 - not paranoid at all
@@ -78,9 +77,9 @@ void perf_pmu_enable(struct pmu *pmu)
 		pmu->pmu_enable(pmu);
 }
 
-static void perf_pmu_rotate_start(void)
+static void perf_pmu_rotate_start(struct pmu *pmu)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
 	if (hrtimer_active(&cpuctx->timer))
 		return;
@@ -90,9 +89,9 @@ static void perf_pmu_rotate_start(void)
 			HRTIMER_MODE_REL_PINNED, 0);
 }
 
-static void perf_pmu_rotate_stop(void)
+static void perf_pmu_rotate_stop(struct pmu *pmu)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
 	hrtimer_cancel(&cpuctx->timer);
 }
@@ -301,7 +300,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
-		perf_pmu_rotate_start();
+		perf_pmu_rotate_start(ctx->pmu);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
@@ -466,6 +465,12 @@ group_sched_out(struct perf_event *group_event,
 		cpuctx->exclusive = 0;
 }
 
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -474,9 +479,9 @@ group_sched_out(struct perf_event *group_event,
  */
 static void __perf_event_remove_from_context(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -556,8 +561,8 @@ retry:
 static void __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a per-task event, need to check whether this
@@ -765,10 +770,10 @@ static void add_event_to_ctx(struct perf_event *event,
  */
 static void __perf_install_in_context(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
 	/*
@@ -912,9 +917,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 static void __perf_event_enable(void *info)
 {
 	struct perf_event *event = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
 	/*
@@ -1188,15 +1193,19 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 void perf_event_task_sched_out(struct task_struct *task,
 				 struct task_struct *next)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
 	struct perf_event_context *next_ctx;
 	struct perf_event_context *parent;
+	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
 	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
 
-	if (likely(!ctx || !cpuctx->task_ctx))
+	if (likely(!ctx))
+		return;
+
+	cpuctx = __get_cpu_context(ctx);
+	if (!cpuctx->task_ctx)
 		return;
 
 	rcu_read_lock();
@@ -1242,7 +1251,7 @@ void perf_event_task_sched_out(struct task_struct *task,
 static void task_ctx_sched_out(struct perf_event_context *ctx,
 			       enum event_type_t event_type)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	if (!cpuctx->task_ctx)
 		return;
@@ -1360,8 +1369,8 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void task_ctx_sched_in(struct task_struct *task,
 			      enum event_type_t event_type)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	if (likely(!ctx))
 		return;
@@ -1383,12 +1392,13 @@ static void task_ctx_sched_in(struct task_struct *task,
  */
 void perf_event_task_sched_in(struct task_struct *task)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_cpu_context *cpuctx;
 
 	if (likely(!ctx))
 		return;
 
+	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
 
@@ -1409,7 +1419,7 @@ void perf_event_task_sched_in(struct task_struct *task)
 	 * Since these rotations are per-cpu, we need to ensure the
 	 * cpu-context we got scheduled on is actually rotating.
 	 */
-	perf_pmu_rotate_start();
+	perf_pmu_rotate_start(ctx->pmu);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1687,9 +1697,9 @@ out:
  */
 static void __perf_event_read(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -1962,7 +1972,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
 	ctx->task = task;
 }
 
-static struct perf_event_context *find_get_context(pid_t pid, int cpu)
+static struct perf_event_context *
+find_get_context(struct pmu *pmu, pid_t pid, int cpu)
 {
 	struct perf_event_context *ctx;
 	struct perf_cpu_context *cpuctx;
@@ -1986,7 +1997,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
 		if (!cpu_online(cpu))
 			return ERR_PTR(-ENODEV);
 
-		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
 		get_ctx(ctx);
 
@@ -2030,6 +2041,7 @@ retry:
 		if (!ctx)
 			goto errout;
 		__perf_event_init_context(ctx, task);
+		ctx->pmu = pmu;
 		get_ctx(ctx);
 		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
 			/*
@@ -3745,18 +3757,20 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 
 static void perf_event_task_event(struct perf_task_event *task_event)
 {
-	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx = task_event->task_ctx;
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
 
-	rcu_read_lock();
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_event_task_ctx(&cpuctx->ctx, task_event);
+	rcu_read_lock_sched();
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		perf_event_task_ctx(&cpuctx->ctx, task_event);
+	}
 	if (!ctx)
 		ctx = rcu_dereference(current->perf_event_ctxp);
 	if (ctx)
 		perf_event_task_ctx(ctx, task_event);
-	put_cpu_var(perf_cpu_context);
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 }
 
 static void perf_event_task(struct task_struct *task,
@@ -3861,6 +3875,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
 	unsigned int size;
+	struct pmu *pmu;
 	char comm[TASK_COMM_LEN];
 
 	memset(comm, 0, sizeof(comm));
@@ -3872,14 +3887,15 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
-	rcu_read_lock();
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+	rcu_read_lock_sched();
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+	}
 	ctx = rcu_dereference(current->perf_event_ctxp);
 	if (ctx)
 		perf_event_comm_ctx(ctx, comm_event);
-	put_cpu_var(perf_cpu_context);
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 }
 
 void perf_event_comm(struct task_struct *task)
@@ -3989,6 +4005,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	char tmp[16];
 	char *buf = NULL;
 	const char *name;
+	struct pmu *pmu;
 
 	memset(tmp, 0, sizeof(tmp));
 
@@ -4040,14 +4057,16 @@ got_name:
 
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
-	rcu_read_lock();
-	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
+	rcu_read_lock_sched();
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
+					vma->vm_flags & VM_EXEC);
+	}
 	ctx = rcu_dereference(current->perf_event_ctxp);
 	if (ctx)
 		perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
-	put_cpu_var(perf_cpu_context);
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 
 	kfree(buf);
 }
@@ -4982,10 +5001,6 @@ static struct pmu perf_task_clock = {
 	.read		= task_clock_event_read,
 };
 
-static LIST_HEAD(pmus);
-static DEFINE_MUTEX(pmus_lock);
-static struct srcu_struct pmus_srcu;
-
 static void perf_pmu_nop_void(struct pmu *pmu)
 {
 }
@@ -5013,7 +5028,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
 
 int perf_pmu_register(struct pmu *pmu)
 {
-	int ret;
+	int cpu, ret;
 
 	mutex_lock(&pmus_lock);
 	ret = -ENOMEM;
@@ -5021,6 +5036,21 @@ int perf_pmu_register(struct pmu *pmu)
 	if (!pmu->pmu_disable_count)
 		goto unlock;
 
+	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
+	if (!pmu->pmu_cpu_context)
+		goto free_pdc;
+
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		__perf_event_init_context(&cpuctx->ctx, NULL);
+		cpuctx->ctx.pmu = pmu;
+		cpuctx->timer_interval = TICK_NSEC;
+		hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		cpuctx->timer.function = perf_event_context_tick;
+	}
+
 	if (!pmu->start_txn) {
 		if (pmu->pmu_enable) {
 			/*
@@ -5049,6 +5079,10 @@ unlock:
 	mutex_unlock(&pmus_lock);
 
 	return ret;
+
+free_pdc:
+	free_percpu(pmu->pmu_disable_count);
+	goto unlock;
 }
 
 void perf_pmu_unregister(struct pmu *pmu)
@@ -5057,9 +5091,14 @@ void perf_pmu_unregister(struct pmu *pmu)
 	list_del_rcu(&pmu->entry);
 	mutex_unlock(&pmus_lock);
 
+	/*
+	 * We use the pmu list either under SRCU or preempt_disable,
+	 * synchronize_srcu() implies synchronize_sched() so we're good.
+	 */
 	synchronize_srcu(&pmus_srcu);
 
 	free_percpu(pmu->pmu_disable_count);
+	free_percpu(pmu->pmu_cpu_context);
 }
 
 struct pmu *perf_init_event(struct perf_event *event)
@@ -5374,7 +5413,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	/*
 	 * Get the target context (task or percpu):
 	 */
-	ctx = find_get_context(pid, cpu);
+	ctx = find_get_context(event->pmu, pid, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto err_alloc;
@@ -5489,7 +5528,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		goto err;
 	}
 
-	ctx = find_get_context(pid, cpu);
+	ctx = find_get_context(event->pmu, pid, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto err_free;
@@ -5833,6 +5872,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 			return -ENOMEM;
 
 		__perf_event_init_context(child_ctx, child);
+		child_ctx->pmu = event->pmu;
 		child->perf_event_ctxp = child_ctx;
 		get_task_struct(child);
 	}
@@ -5935,30 +5975,18 @@ int perf_event_init_task(struct task_struct *child)
 
 static void __init perf_event_init_all_cpus(void)
 {
-	struct perf_cpu_context *cpuctx;
 	struct swevent_htable *swhash;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
-
-		cpuctx = &per_cpu(perf_cpu_context, cpu);
-		__perf_event_init_context(&cpuctx->ctx, NULL);
-		cpuctx->timer_interval = TICK_NSEC;
-		hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		cpuctx->timer.function = perf_event_context_tick;
 	}
 }
 
 static void __cpuinit perf_event_init_cpu(int cpu)
 {
-	struct perf_cpu_context *cpuctx;
-	struct swevent_htable *swhash;
-
-	cpuctx = &per_cpu(perf_cpu_context, cpu);
-
-	swhash = &per_cpu(swevent_htable, cpu);
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
 	mutex_lock(&swhash->hlist_mutex);
 	if (swhash->hlist_refcount > 0) {
@@ -5972,32 +6000,46 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void __perf_event_exit_cpu(void *info)
+static void __perf_event_exit_context(void *__info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_event_context *ctx = &cpuctx->ctx;
+	struct perf_event_context *ctx = __info;
 	struct perf_event *event, *tmp;
 
-	perf_pmu_rotate_stop();
+	perf_pmu_rotate_stop(ctx->pmu);
 
 	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
 		__perf_event_remove_from_context(event);
 	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
 		__perf_event_remove_from_context(event);
 }
+
+static void perf_event_exit_cpu_context(int cpu)
+{
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+	int idx;
+
+	idx = srcu_read_lock(&pmus_srcu);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		ctx = &this_cpu_ptr(pmu->pmu_cpu_context)->ctx;
+
+		mutex_lock(&ctx->mutex);
+		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+		mutex_unlock(&ctx->mutex);
+	}
+	srcu_read_unlock(&pmus_srcu, idx);
+
+}
+
 static void perf_event_exit_cpu(int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-	struct perf_event_context *ctx = &cpuctx->ctx;
 
 	mutex_lock(&swhash->hlist_mutex);
 	swevent_hlist_release(swhash);
 	mutex_unlock(&swhash->hlist_mutex);
 
-	mutex_lock(&ctx->mutex);
-	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
-	mutex_unlock(&ctx->mutex);
+	perf_event_exit_cpu_context(cpu);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }
-- 
cgit v1.2.3-70-g09d2


From 97dee4f3206622f31396dede2b5ddb8670458f56 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Sep 2010 15:35:33 +0200
Subject: perf: Move some code around

Move all inherit code near each other.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 200 ++++++++++++++++++++++++++--------------------------
 1 file changed, 100 insertions(+), 100 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8ca6e690ffe..dae0e2f3029 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5556,106 +5556,6 @@ err:
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 
-/*
- * inherit a event from parent task to child task:
- */
-static struct perf_event *
-inherit_event(struct perf_event *parent_event,
-	      struct task_struct *parent,
-	      struct perf_event_context *parent_ctx,
-	      struct task_struct *child,
-	      struct perf_event *group_leader,
-	      struct perf_event_context *child_ctx)
-{
-	struct perf_event *child_event;
-
-	/*
-	 * Instead of creating recursive hierarchies of events,
-	 * we link inherited events back to the original parent,
-	 * which has a filp for sure, which we use as the reference
-	 * count:
-	 */
-	if (parent_event->parent)
-		parent_event = parent_event->parent;
-
-	child_event = perf_event_alloc(&parent_event->attr,
-					   parent_event->cpu,
-					   group_leader, parent_event,
-					   NULL);
-	if (IS_ERR(child_event))
-		return child_event;
-	get_ctx(child_ctx);
-
-	/*
-	 * Make the child state follow the state of the parent event,
-	 * not its attr.disabled bit.  We hold the parent's mutex,
-	 * so we won't race with perf_event_{en, dis}able_family.
-	 */
-	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
-		child_event->state = PERF_EVENT_STATE_INACTIVE;
-	else
-		child_event->state = PERF_EVENT_STATE_OFF;
-
-	if (parent_event->attr.freq) {
-		u64 sample_period = parent_event->hw.sample_period;
-		struct hw_perf_event *hwc = &child_event->hw;
-
-		hwc->sample_period = sample_period;
-		hwc->last_period   = sample_period;
-
-		local64_set(&hwc->period_left, sample_period);
-	}
-
-	child_event->ctx = child_ctx;
-	child_event->overflow_handler = parent_event->overflow_handler;
-
-	/*
-	 * Link it up in the child's context:
-	 */
-	add_event_to_ctx(child_event, child_ctx);
-
-	/*
-	 * Get a reference to the parent filp - we will fput it
-	 * when the child event exits. This is safe to do because
-	 * we are in the parent and we know that the filp still
-	 * exists and has a nonzero count:
-	 */
-	atomic_long_inc(&parent_event->filp->f_count);
-
-	/*
-	 * Link this into the parent event's child list
-	 */
-	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-	mutex_lock(&parent_event->child_mutex);
-	list_add_tail(&child_event->child_list, &parent_event->child_list);
-	mutex_unlock(&parent_event->child_mutex);
-
-	return child_event;
-}
-
-static int inherit_group(struct perf_event *parent_event,
-	      struct task_struct *parent,
-	      struct perf_event_context *parent_ctx,
-	      struct task_struct *child,
-	      struct perf_event_context *child_ctx)
-{
-	struct perf_event *leader;
-	struct perf_event *sub;
-	struct perf_event *child_ctr;
-
-	leader = inherit_event(parent_event, parent, parent_ctx,
-				 child, NULL, child_ctx);
-	if (IS_ERR(leader))
-		return PTR_ERR(leader);
-	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
-		child_ctr = inherit_event(sub, parent, parent_ctx,
-					    child, leader, child_ctx);
-		if (IS_ERR(child_ctr))
-			return PTR_ERR(child_ctr);
-	}
-	return 0;
-}
-
 static void sync_child_event(struct perf_event *child_event,
 			       struct task_struct *child)
 {
@@ -5844,6 +5744,106 @@ again:
 	put_ctx(ctx);
 }
 
+/*
+ * inherit a event from parent task to child task:
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+	      struct task_struct *parent,
+	      struct perf_event_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_event *group_leader,
+	      struct perf_event_context *child_ctx)
+{
+	struct perf_event *child_event;
+
+	/*
+	 * Instead of creating recursive hierarchies of events,
+	 * we link inherited events back to the original parent,
+	 * which has a filp for sure, which we use as the reference
+	 * count:
+	 */
+	if (parent_event->parent)
+		parent_event = parent_event->parent;
+
+	child_event = perf_event_alloc(&parent_event->attr,
+					   parent_event->cpu,
+					   group_leader, parent_event,
+					   NULL);
+	if (IS_ERR(child_event))
+		return child_event;
+	get_ctx(child_ctx);
+
+	/*
+	 * Make the child state follow the state of the parent event,
+	 * not its attr.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_event_{en, dis}able_family.
+	 */
+	if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+		child_event->state = PERF_EVENT_STATE_INACTIVE;
+	else
+		child_event->state = PERF_EVENT_STATE_OFF;
+
+	if (parent_event->attr.freq) {
+		u64 sample_period = parent_event->hw.sample_period;
+		struct hw_perf_event *hwc = &child_event->hw;
+
+		hwc->sample_period = sample_period;
+		hwc->last_period   = sample_period;
+
+		local64_set(&hwc->period_left, sample_period);
+	}
+
+	child_event->ctx = child_ctx;
+	child_event->overflow_handler = parent_event->overflow_handler;
+
+	/*
+	 * Link it up in the child's context:
+	 */
+	add_event_to_ctx(child_event, child_ctx);
+
+	/*
+	 * Get a reference to the parent filp - we will fput it
+	 * when the child event exits. This is safe to do because
+	 * we are in the parent and we know that the filp still
+	 * exists and has a nonzero count:
+	 */
+	atomic_long_inc(&parent_event->filp->f_count);
+
+	/*
+	 * Link this into the parent event's child list
+	 */
+	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+	mutex_lock(&parent_event->child_mutex);
+	list_add_tail(&child_event->child_list, &parent_event->child_list);
+	mutex_unlock(&parent_event->child_mutex);
+
+	return child_event;
+}
+
+static int inherit_group(struct perf_event *parent_event,
+	      struct task_struct *parent,
+	      struct perf_event_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_event_context *child_ctx)
+{
+	struct perf_event *leader;
+	struct perf_event *sub;
+	struct perf_event *child_ctr;
+
+	leader = inherit_event(parent_event, parent, parent_ctx,
+				 child, NULL, child_ctx);
+	if (IS_ERR(leader))
+		return PTR_ERR(leader);
+	list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+		child_ctr = inherit_event(sub, parent, parent_ctx,
+					    child, leader, child_ctx);
+		if (IS_ERR(child_ctr))
+			return PTR_ERR(child_ctr);
+	}
+	return 0;
+}
+
 static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		   struct perf_event_context *parent_ctx,
-- 
cgit v1.2.3-70-g09d2


From eb184479874238393ac186c4e054d24311c34aaa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Sep 2010 15:55:13 +0200
Subject: perf: Clean up perf_event_context allocation

Unify the two perf_event_context allocation sites.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index dae0e2f3029..13d98d75634 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1959,9 +1959,7 @@ exit_put:
 /*
  * Initialize the perf_event context in a task_struct:
  */
-static void
-__perf_event_init_context(struct perf_event_context *ctx,
-			    struct task_struct *task)
+static void __perf_event_init_context(struct perf_event_context *ctx)
 {
 	raw_spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
@@ -1969,7 +1967,25 @@ __perf_event_init_context(struct perf_event_context *ctx,
 	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
-	ctx->task = task;
+}
+
+static struct perf_event_context *
+alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+
+	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	__perf_event_init_context(ctx);
+	if (task) {
+		ctx->task = task;
+		get_task_struct(task);
+	}
+	ctx->pmu = pmu;
+
+	return ctx;
 }
 
 static struct perf_event_context *
@@ -2036,22 +2052,22 @@ retry:
 	}
 
 	if (!ctx) {
-		ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+		ctx = alloc_perf_context(pmu, task);
 		err = -ENOMEM;
 		if (!ctx)
 			goto errout;
-		__perf_event_init_context(ctx, task);
-		ctx->pmu = pmu;
+
 		get_ctx(ctx);
+
 		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
 			/*
 			 * We raced with some other task; use
 			 * the context they set.
 			 */
+			put_task_struct(task);
 			kfree(ctx);
 			goto retry;
 		}
-		get_task_struct(task);
 	}
 
 	put_task_struct(task);
@@ -5044,7 +5060,7 @@ int perf_pmu_register(struct pmu *pmu)
 		struct perf_cpu_context *cpuctx;
 
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-		__perf_event_init_context(&cpuctx->ctx, NULL);
+		__perf_event_init_context(&cpuctx->ctx);
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->timer_interval = TICK_NSEC;
 		hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -5866,15 +5882,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		 * child.
 		 */
 
-		child_ctx = kzalloc(sizeof(struct perf_event_context),
-				    GFP_KERNEL);
+		child_ctx = alloc_perf_context(event->pmu, child);
 		if (!child_ctx)
 			return -ENOMEM;
 
-		__perf_event_init_context(child_ctx, child);
-		child_ctx->pmu = event->pmu;
 		child->perf_event_ctxp = child_ctx;
-		get_task_struct(child);
 	}
 
 	ret = inherit_group(event, parent, parent_ctx,
@@ -5886,7 +5898,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 	return ret;
 }
 
-
 /*
  * Initialize the perf_event context in task_struct
  */
-- 
cgit v1.2.3-70-g09d2


From 8dc85d547285668e509f86c177bcd4ea055bcaaf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Sep 2010 16:50:03 +0200
Subject: perf: Multiple task contexts

Provide the infrastructure for multiple task contexts.

A more flexible approach would have resulted in more pointer chases
in the scheduling hot-paths. This approach has the limitation of a
static number of task contexts.

Since I expect most external PMUs to be system wide, or at least node
wide (as per the intel uncore unit) they won't actually need a task
context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |   1 +
 include/linux/sched.h      |   8 +-
 kernel/perf_event.c        | 336 +++++++++++++++++++++++++++++++--------------
 3 files changed, 239 insertions(+), 106 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 22155ef3b36..9ecfd856ce6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -572,6 +572,7 @@ struct pmu {
 
 	int * __percpu			pmu_disable_count;
 	struct perf_cpu_context * __percpu pmu_cpu_context;
+	int				task_ctx_nr;
 
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db2d7d..89d6023c6f8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1160,6 +1160,12 @@ struct sched_rt_entity {
 
 struct rcu_node;
 
+enum perf_event_task_context {
+	perf_invalid_context = -1,
+	perf_hw_context = 0,
+	perf_nr_task_contexts,
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1431,7 +1437,7 @@ struct task_struct {
 	struct futex_pi_state *pi_state_cache;
 #endif
 #ifdef CONFIG_PERF_EVENTS
-	struct perf_event_context *perf_event_ctxp;
+	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
 	struct mutex perf_event_mutex;
 	struct list_head perf_event_list;
 #endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 13d98d75634..7223ea87586 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -148,13 +148,13 @@ static u64 primary_event_id(struct perf_event *event)
  * the context could get moved to another task.
  */
 static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 {
 	struct perf_event_context *ctx;
 
 	rcu_read_lock();
 retry:
-	ctx = rcu_dereference(task->perf_event_ctxp);
+	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
 	if (ctx) {
 		/*
 		 * If this context is a clone of another, it might
@@ -167,7 +167,7 @@ retry:
 		 * can't get swapped on us any more.
 		 */
 		raw_spin_lock_irqsave(&ctx->lock, *flags);
-		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
 			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 			goto retry;
 		}
@@ -186,12 +186,13 @@ retry:
  * can't get swapped to another task.  This also increments its
  * reference count so that the context can't get freed.
  */
-static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
 {
 	struct perf_event_context *ctx;
 	unsigned long flags;
 
-	ctx = perf_lock_task_context(task, &flags);
+	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		++ctx->pin_count;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1179,28 +1180,15 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
-				 struct task_struct *next)
+void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+				  struct task_struct *next)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
 	struct perf_event_context *parent;
 	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-
 	if (likely(!ctx))
 		return;
 
@@ -1210,7 +1198,7 @@ void perf_event_task_sched_out(struct task_struct *task,
 
 	rcu_read_lock();
 	parent = rcu_dereference(ctx->parent_ctx);
-	next_ctx = next->perf_event_ctxp;
+	next_ctx = next->perf_event_ctxp[ctxn];
 	if (parent && next_ctx &&
 	    rcu_dereference(next_ctx->parent_ctx) == parent) {
 		/*
@@ -1229,8 +1217,8 @@ void perf_event_task_sched_out(struct task_struct *task,
 			 * XXX do we need a memory barrier of sorts
 			 * wrt to rcu_dereference() of perf_event_ctxp
 			 */
-			task->perf_event_ctxp = next_ctx;
-			next->perf_event_ctxp = ctx;
+			task->perf_event_ctxp[ctxn] = next_ctx;
+			next->perf_event_ctxp[ctxn] = ctx;
 			ctx->task = next;
 			next_ctx->task = task;
 			do_switch = 0;
@@ -1248,6 +1236,31 @@ void perf_event_task_sched_out(struct task_struct *task,
 	}
 }
 
+#define for_each_task_context_nr(ctxn)					\
+	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+			       struct task_struct *next)
+{
+	int ctxn;
+
+	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+	for_each_task_context_nr(ctxn)
+		perf_event_context_sched_out(task, ctxn, next);
+}
+
 static void task_ctx_sched_out(struct perf_event_context *ctx,
 			       enum event_type_t event_type)
 {
@@ -1366,38 +1379,23 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 	ctx_sched_in(ctx, cpuctx, event_type);
 }
 
-static void task_ctx_sched_in(struct task_struct *task,
+static void task_ctx_sched_in(struct perf_event_context *ctx,
 			      enum event_type_t event_type)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
-	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct perf_cpu_context *cpuctx;
 
-	if (likely(!ctx))
-		return;
+       	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
+
 	ctx_sched_in(ctx, cpuctx, event_type);
 	cpuctx->task_ctx = ctx;
 }
-/*
- * Called from scheduler to add the events of the current task
- * with interrupts disabled.
- *
- * We restore the event value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * keep the event running.
- */
-void perf_event_task_sched_in(struct task_struct *task)
+
+void perf_event_context_sched_in(struct perf_event_context *ctx)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
 	struct perf_cpu_context *cpuctx;
 
-	if (likely(!ctx))
-		return;
-
 	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
@@ -1422,6 +1420,31 @@ void perf_event_task_sched_in(struct task_struct *task)
 	perf_pmu_rotate_start(ctx->pmu);
 }
 
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void perf_event_task_sched_in(struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (likely(!ctx))
+			continue;
+
+		perf_event_context_sched_in(ctx);
+	}
+}
+
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
@@ -1588,7 +1611,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 {
 	enum hrtimer_restart restart = HRTIMER_NORESTART;
 	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
+	struct perf_event_context *ctx = NULL;
 	int rotate = 0;
 
 	cpuctx = container_of(timer, struct perf_cpu_context, timer);
@@ -1599,7 +1622,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 			rotate = 1;
 	}
 
-	ctx = current->perf_event_ctxp;
+	ctx = cpuctx->task_ctx;
 	if (ctx && ctx->nr_events) {
 		restart = HRTIMER_RESTART;
 		if (ctx->nr_events != ctx->nr_active)
@@ -1623,7 +1646,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 
 	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		task_ctx_sched_in(current, EVENT_FLEXIBLE);
+		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
 done:
 	hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval));
@@ -1650,20 +1673,18 @@ static int event_enable_on_exec(struct perf_event *event,
  * Enable all of a task's events that have been marked enable-on-exec.
  * This expects task == current.
  */
-static void perf_event_enable_on_exec(struct task_struct *task)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 {
-	struct perf_event_context *ctx;
 	struct perf_event *event;
 	unsigned long flags;
 	int enabled = 0;
 	int ret;
 
 	local_irq_save(flags);
-	ctx = task->perf_event_ctxp;
 	if (!ctx || !ctx->nr_events)
 		goto out;
 
-	__perf_event_task_sched_out(ctx);
+	task_ctx_sched_out(ctx, EVENT_ALL);
 
 	raw_spin_lock(&ctx->lock);
 
@@ -1687,7 +1708,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_task_sched_in(task);
+	perf_event_context_sched_in(ctx);
 out:
 	local_irq_restore(flags);
 }
@@ -1995,7 +2016,7 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu)
 	struct perf_cpu_context *cpuctx;
 	struct task_struct *task;
 	unsigned long flags;
-	int err;
+	int ctxn, err;
 
 	if (pid == -1 && cpu != -1) {
 		/* Must be root to operate on a CPU event: */
@@ -2044,8 +2065,13 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu)
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto errout;
 
+	err = -EINVAL;
+	ctxn = pmu->task_ctx_nr;
+	if (ctxn < 0)
+		goto errout;
+
 retry:
-	ctx = perf_lock_task_context(task, &flags);
+	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		unclone_ctx(ctx);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -2059,7 +2085,7 @@ retry:
 
 		get_ctx(ctx);
 
-		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
+		if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
 			/*
 			 * We raced with some other task; use
 			 * the context they set.
@@ -3773,19 +3799,26 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 
 static void perf_event_task_event(struct perf_task_event *task_event)
 {
-	struct perf_event_context *ctx = task_event->task_ctx;
 	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
 	struct pmu *pmu;
+	int ctxn;
 
 	rcu_read_lock_sched();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_task_ctx(&cpuctx->ctx, task_event);
+
+		ctx = task_event->task_ctx;
+		if (!ctx) {
+			ctxn = pmu->task_ctx_nr;
+			if (ctxn < 0)
+				continue;
+			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		}
+		if (ctx)
+			perf_event_task_ctx(ctx, task_event);
 	}
-	if (!ctx)
-		ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_task_ctx(ctx, task_event);
 	rcu_read_unlock_sched();
 }
 
@@ -3890,9 +3923,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
+	char comm[TASK_COMM_LEN];
 	unsigned int size;
 	struct pmu *pmu;
-	char comm[TASK_COMM_LEN];
+	int ctxn;
 
 	memset(comm, 0, sizeof(comm));
 	strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3907,19 +3941,31 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+
+		ctxn = pmu->task_ctx_nr;
+		if (ctxn < 0)
+			continue;
+
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx)
+			perf_event_comm_ctx(ctx, comm_event);
 	}
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_comm_ctx(ctx, comm_event);
 	rcu_read_unlock_sched();
 }
 
 void perf_event_comm(struct task_struct *task)
 {
 	struct perf_comm_event comm_event;
+	struct perf_event_context *ctx;
+	int ctxn;
 
-	if (task->perf_event_ctxp)
-		perf_event_enable_on_exec(task);
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (!ctx)
+			continue;
+
+		perf_event_enable_on_exec(ctx);
+	}
 
 	if (!atomic_read(&nr_comm_events))
 		return;
@@ -4022,6 +4068,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	char *buf = NULL;
 	const char *name;
 	struct pmu *pmu;
+	int ctxn;
 
 	memset(tmp, 0, sizeof(tmp));
 
@@ -4078,10 +4125,17 @@ got_name:
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
 					vma->vm_flags & VM_EXEC);
+
+		ctxn = pmu->task_ctx_nr;
+		if (ctxn < 0)
+			continue;
+
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx) {
+			perf_event_mmap_ctx(ctx, mmap_event,
+					vma->vm_flags & VM_EXEC);
+		}
 	}
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
 	rcu_read_unlock_sched();
 
 	kfree(buf);
@@ -5042,6 +5096,43 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
 	perf_pmu_enable(pmu);
 }
 
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static void *find_pmu_context(int ctxn)
+{
+	struct pmu *pmu;
+
+	if (ctxn < 0)
+		return NULL;
+
+	list_for_each_entry(pmu, &pmus, entry) {
+		if (pmu->task_ctx_nr == ctxn)
+			return pmu->pmu_cpu_context;
+	}
+
+	return NULL;
+}
+
+static void free_pmu_context(void * __percpu cpu_context)
+{
+	struct pmu *pmu;
+
+	mutex_lock(&pmus_lock);
+	/*
+	 * Like a real lame refcount.
+	 */
+	list_for_each_entry(pmu, &pmus, entry) {
+		if (pmu->pmu_cpu_context == cpu_context)
+			goto out;
+	}
+
+	free_percpu(cpu_context);
+out:
+	mutex_unlock(&pmus_lock);
+}
+
 int perf_pmu_register(struct pmu *pmu)
 {
 	int cpu, ret;
@@ -5052,6 +5143,10 @@ int perf_pmu_register(struct pmu *pmu)
 	if (!pmu->pmu_disable_count)
 		goto unlock;
 
+	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
+	if (pmu->pmu_cpu_context)
+		goto got_cpu_context;
+
 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 	if (!pmu->pmu_cpu_context)
 		goto free_pdc;
@@ -5067,6 +5162,7 @@ int perf_pmu_register(struct pmu *pmu)
 		cpuctx->timer.function = perf_event_context_tick;
 	}
 
+got_cpu_context:
 	if (!pmu->start_txn) {
 		if (pmu->pmu_enable) {
 			/*
@@ -5114,7 +5210,7 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_srcu(&pmus_srcu);
 
 	free_percpu(pmu->pmu_disable_count);
-	free_percpu(pmu->pmu_cpu_context);
+	free_pmu_context(pmu->pmu_cpu_context);
 }
 
 struct pmu *perf_init_event(struct perf_event *event)
@@ -5628,16 +5724,13 @@ __perf_event_exit_task(struct perf_event *child_event,
 	}
 }
 
-/*
- * When a child task exits, feed back event values to parent events.
- */
-void perf_event_exit_task(struct task_struct *child)
+static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
 	struct perf_event *child_event, *tmp;
 	struct perf_event_context *child_ctx;
 	unsigned long flags;
 
-	if (likely(!child->perf_event_ctxp)) {
+	if (likely(!child->perf_event_ctxp[ctxn])) {
 		perf_event_task(child, NULL, 0);
 		return;
 	}
@@ -5649,7 +5742,7 @@ void perf_event_exit_task(struct task_struct *child)
 	 * scheduled, so we are now safe from rescheduling changing
 	 * our context.
 	 */
-	child_ctx = child->perf_event_ctxp;
+	child_ctx = child->perf_event_ctxp[ctxn];
 	__perf_event_task_sched_out(child_ctx);
 
 	/*
@@ -5658,7 +5751,7 @@ void perf_event_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	raw_spin_lock(&child_ctx->lock);
-	child->perf_event_ctxp = NULL;
+	child->perf_event_ctxp[ctxn] = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -5711,6 +5804,17 @@ again:
 	put_ctx(child_ctx);
 }
 
+/*
+ * When a child task exits, feed back event values to parent events.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+	int ctxn;
+
+	for_each_task_context_nr(ctxn)
+		perf_event_exit_task_context(child, ctxn);
+}
+
 static void perf_free_event(struct perf_event *event,
 			    struct perf_event_context *ctx)
 {
@@ -5732,32 +5836,37 @@ static void perf_free_event(struct perf_event *event,
 
 /*
  * free an unexposed, unused context as created by inheritance by
- * init_task below, used by fork() in case of fail.
+ * perf_event_init_task below, used by fork() in case of fail.
  */
 void perf_event_free_task(struct task_struct *task)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *ctx;
 	struct perf_event *event, *tmp;
+	int ctxn;
 
-	if (!ctx)
-		return;
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (!ctx)
+			continue;
 
-	mutex_lock(&ctx->mutex);
+		mutex_lock(&ctx->mutex);
 again:
-	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-		perf_free_event(event, ctx);
+		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
+				group_entry)
+			perf_free_event(event, ctx);
 
-	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
-				 group_entry)
-		perf_free_event(event, ctx);
+		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+				group_entry)
+			perf_free_event(event, ctx);
 
-	if (!list_empty(&ctx->pinned_groups) ||
-	    !list_empty(&ctx->flexible_groups))
-		goto again;
+		if (!list_empty(&ctx->pinned_groups) ||
+				!list_empty(&ctx->flexible_groups))
+			goto again;
 
-	mutex_unlock(&ctx->mutex);
+		mutex_unlock(&ctx->mutex);
 
-	put_ctx(ctx);
+		put_ctx(ctx);
+	}
 }
 
 /*
@@ -5863,17 +5972,18 @@ static int inherit_group(struct perf_event *parent_event,
 static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		   struct perf_event_context *parent_ctx,
-		   struct task_struct *child,
+		   struct task_struct *child, int ctxn,
 		   int *inherited_all)
 {
 	int ret;
-	struct perf_event_context *child_ctx = child->perf_event_ctxp;
+	struct perf_event_context *child_ctx;
 
 	if (!event->attr.inherit) {
 		*inherited_all = 0;
 		return 0;
 	}
 
+       	child_ctx = child->perf_event_ctxp[ctxn];
 	if (!child_ctx) {
 		/*
 		 * This is executed from the parent task context, so
@@ -5886,7 +5996,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		if (!child_ctx)
 			return -ENOMEM;
 
-		child->perf_event_ctxp = child_ctx;
+		child->perf_event_ctxp[ctxn] = child_ctx;
 	}
 
 	ret = inherit_group(event, parent, parent_ctx,
@@ -5901,7 +6011,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 /*
  * Initialize the perf_event context in task_struct
  */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_context(struct task_struct *child, int ctxn)
 {
 	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
@@ -5910,19 +6020,19 @@ int perf_event_init_task(struct task_struct *child)
 	int inherited_all = 1;
 	int ret = 0;
 
-	child->perf_event_ctxp = NULL;
+	child->perf_event_ctxp[ctxn] = NULL;
 
 	mutex_init(&child->perf_event_mutex);
 	INIT_LIST_HEAD(&child->perf_event_list);
 
-	if (likely(!parent->perf_event_ctxp))
+	if (likely(!parent->perf_event_ctxp[ctxn]))
 		return 0;
 
 	/*
 	 * If the parent's context is a clone, pin it so it won't get
 	 * swapped under us.
 	 */
-	parent_ctx = perf_pin_task_context(parent);
+	parent_ctx = perf_pin_task_context(parent, ctxn);
 
 	/*
 	 * No need to check if parent_ctx != NULL here; since we saw
@@ -5942,20 +6052,20 @@ int perf_event_init_task(struct task_struct *child)
 	 * the list, not manipulating it:
 	 */
 	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx, child,
-					 &inherited_all);
+		ret = inherit_task_group(event, parent, parent_ctx,
+					 child, ctxn, &inherited_all);
 		if (ret)
 			break;
 	}
 
 	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx, child,
-					 &inherited_all);
+		ret = inherit_task_group(event, parent, parent_ctx,
+					 child, ctxn, &inherited_all);
 		if (ret)
 			break;
 	}
 
-	child_ctx = child->perf_event_ctxp;
+	child_ctx = child->perf_event_ctxp[ctxn];
 
 	if (child_ctx && inherited_all) {
 		/*
@@ -5984,6 +6094,22 @@ int perf_event_init_task(struct task_struct *child)
 	return ret;
 }
 
+/*
+ * Initialize the perf_event context in task_struct
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+	int ctxn, ret;
+
+	for_each_task_context_nr(ctxn) {
+		ret = perf_event_init_context(child, ctxn);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static void __init perf_event_init_all_cpus(void)
 {
 	struct swevent_htable *swhash;
-- 
cgit v1.2.3-70-g09d2


From 89a1e18731959e9953fae15ddc1a983eb15a4f19 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Sep 2010 17:34:50 +0200
Subject: perf: Provide a separate task context for swevents

Since software events are always schedulable, mixing them up with
hardware events (who are not) can lead to funny scheduling oddities.

Giving them their own context solves this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  9 +--------
 include/linux/sched.h      |  1 +
 kernel/hw_breakpoint.c     |  2 ++
 kernel/perf_event.c        | 40 +++++++++++++++++++++++++++++-----------
 4 files changed, 33 insertions(+), 19 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9ecfd856ce6..c1173520f14 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -952,14 +952,7 @@ extern int perf_event_overflow(struct perf_event *event, int nmi,
  */
 static inline int is_software_event(struct perf_event *event)
 {
-	switch (event->attr.type) {
-	case PERF_TYPE_SOFTWARE:
-	case PERF_TYPE_TRACEPOINT:
-	/* for now the breakpoint stuff also works as software event */
-	case PERF_TYPE_BREAKPOINT:
-		return 1;
-	}
-	return 0;
+	return event->pmu->task_ctx_nr == perf_sw_context;
 }
 
 extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 89d6023c6f8..eb3c1ceec06 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1163,6 +1163,7 @@ struct rcu_node;
 enum perf_event_task_context {
 	perf_invalid_context = -1,
 	perf_hw_context = 0,
+	perf_sw_context,
 	perf_nr_task_contexts,
 };
 
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 6f150095caf..3b2aaffb65f 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -610,6 +610,8 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
 }
 
 static struct pmu perf_breakpoint = {
+	.task_ctx_nr	= perf_sw_context, /* could eventually get its own */
+
 	.event_init	= hw_breakpoint_event_init,
 	.add		= hw_breakpoint_add,
 	.del		= hw_breakpoint_del,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7223ea87586..357ee8d5e8a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4709,6 +4709,8 @@ static int perf_swevent_init(struct perf_event *event)
 }
 
 static struct pmu perf_swevent = {
+	.task_ctx_nr	= perf_sw_context,
+
 	.event_init	= perf_swevent_init,
 	.add		= perf_swevent_add,
 	.del		= perf_swevent_del,
@@ -4800,6 +4802,8 @@ static int perf_tp_event_init(struct perf_event *event)
 }
 
 static struct pmu perf_tracepoint = {
+	.task_ctx_nr	= perf_sw_context,
+
 	.event_init	= perf_tp_event_init,
 	.add		= perf_trace_add,
 	.del		= perf_trace_del,
@@ -4988,6 +4992,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 }
 
 static struct pmu perf_cpu_clock = {
+	.task_ctx_nr	= perf_sw_context,
+
 	.event_init	= cpu_clock_event_init,
 	.add		= cpu_clock_event_add,
 	.del		= cpu_clock_event_del,
@@ -5063,6 +5069,8 @@ static int task_clock_event_init(struct perf_event *event)
 }
 
 static struct pmu perf_task_clock = {
+	.task_ctx_nr	= perf_sw_context,
+
 	.event_init	= task_clock_event_init,
 	.add		= task_clock_event_add,
 	.del		= task_clock_event_del,
@@ -5490,6 +5498,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	struct perf_event_context *ctx;
 	struct file *event_file = NULL;
 	struct file *group_file = NULL;
+	struct pmu *pmu;
 	int event_fd;
 	int fput_needed = 0;
 	int err;
@@ -5522,20 +5531,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_fd;
 	}
 
-	/*
-	 * Get the target context (task or percpu):
-	 */
-	ctx = find_get_context(event->pmu, pid, cpu);
-	if (IS_ERR(ctx)) {
-		err = PTR_ERR(ctx);
-		goto err_alloc;
-	}
-
 	if (group_fd != -1) {
 		group_leader = perf_fget_light(group_fd, &fput_needed);
 		if (IS_ERR(group_leader)) {
 			err = PTR_ERR(group_leader);
-			goto err_context;
+			goto err_alloc;
 		}
 		group_file = group_leader->filp;
 		if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5544,6 +5544,23 @@ SYSCALL_DEFINE5(perf_event_open,
 			group_leader = NULL;
 	}
 
+	/*
+	 * Special case software events and allow them to be part of
+	 * any hardware group.
+	 */
+	pmu = event->pmu;
+	if ((pmu->task_ctx_nr == perf_sw_context) && group_leader)
+		pmu = group_leader->pmu;
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
+	ctx = find_get_context(pmu, pid, cpu);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto err_group_fd;
+	}
+
 	/*
 	 * Look up the group leader (we will attach this event to it):
 	 */
@@ -5605,8 +5622,9 @@ SYSCALL_DEFINE5(perf_event_open,
 	return event_fd;
 
 err_context:
-	fput_light(group_file, fput_needed);
 	put_ctx(ctx);
+err_group_fd:
+	fput_light(group_file, fput_needed);
 err_alloc:
 	free_event(event);
 err_fd:
-- 
cgit v1.2.3-70-g09d2


From 1b9a644fece117cfa5474a2388d6b89d1baf8ddf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Sep 2010 18:32:22 +0200
Subject: perf: Optimize context ops

Assuming we don't mix events of different pmus onto a single context
(with the exeption of software events inside a hardware group) we can
now assume that all events on a particular context belong to the same
pmu, hence we can disable the pmu for the entire context operations.

This reduces the amount of hardware writes.

The exception for swevents comes from the fact that the sw pmu disable
is a nop.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 357ee8d5e8a..9819a69a61a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1065,6 +1065,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	struct perf_event *event;
 
 	raw_spin_lock(&ctx->lock);
+	perf_pmu_disable(ctx->pmu);
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_events))
 		goto out;
@@ -1083,6 +1084,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 			group_sched_out(event, cpuctx, ctx);
 	}
 out:
+	perf_pmu_enable(ctx->pmu);
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -1400,6 +1402,7 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
 	if (cpuctx->task_ctx == ctx)
 		return;
 
+	perf_pmu_disable(ctx->pmu);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -1418,6 +1421,7 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
 	 * cpu-context we got scheduled on is actually rotating.
 	 */
 	perf_pmu_rotate_start(ctx->pmu);
+	perf_pmu_enable(ctx->pmu);
 }
 
 /*
@@ -1629,6 +1633,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 			rotate = 1;
 	}
 
+	perf_pmu_disable(cpuctx->ctx.pmu);
 	perf_ctx_adjust_freq(&cpuctx->ctx, cpuctx->timer_interval);
 	if (ctx)
 		perf_ctx_adjust_freq(ctx, cpuctx->timer_interval);
@@ -1649,6 +1654,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
 done:
+	perf_pmu_enable(cpuctx->ctx.pmu);
 	hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval));
 
 	return restart;
-- 
cgit v1.2.3-70-g09d2


From 4e231c7962ce711c7d8c2a4dc23ecd1e8fc28363 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Sep 2010 21:01:59 +0200
Subject: perf: Fix up delayed_put_task_struct()

I missed a perf_event_ctxp user when converting it to an array. Pull this
last user into perf_event.c as well and fix it up.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 2 ++
 kernel/exit.c              | 4 +---
 kernel/perf_event.c        | 8 ++++++++
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c1173520f14..93bf53aa50e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -889,6 +889,7 @@ extern void perf_event_task_sched_out(struct task_struct *task, struct task_stru
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
+extern void perf_event_delayed_put(struct task_struct *task);
 extern void set_perf_event_pending(void);
 extern void perf_event_do_pending(void);
 extern void perf_event_print_debug(void);
@@ -1067,6 +1068,7 @@ perf_event_task_sched_out(struct task_struct *task,
 static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
+static inline void perf_event_delayed_put(struct task_struct *task)	{ }
 static inline void perf_event_do_pending(void)				{ }
 static inline void perf_event_print_debug(void)				{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
diff --git a/kernel/exit.c b/kernel/exit.c
index 03120229db2..e2bdf37f9fd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
-#ifdef CONFIG_PERF_EVENTS
-	WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+	perf_event_delayed_put(tsk);
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9819a69a61a..eaf1c5de6dc 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5893,6 +5893,14 @@ again:
 	}
 }
 
+void perf_event_delayed_put(struct task_struct *task)
+{
+	int ctxn;
+
+	for_each_task_context_nr(ctxn)
+		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+}
+
 /*
  * inherit a event from parent task to child task:
  */
-- 
cgit v1.2.3-70-g09d2


From cee010ec5211b96f33c5c2208f5c14ebb04b634a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 10 Sep 2010 12:51:54 +0200
Subject: perf: Ensure we call add_event_to_ctx() with the right locks held

Even though we call it from the inherit path, where the child is
not yet accessible, we need to hold ctx->lock, add_event_to_ctx()
assumes IRQs are disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eaf1c5de6dc..f395fb4d9b7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5913,6 +5913,7 @@ inherit_event(struct perf_event *parent_event,
 	      struct perf_event_context *child_ctx)
 {
 	struct perf_event *child_event;
+	unsigned long flags;
 
 	/*
 	 * Instead of creating recursive hierarchies of events,
@@ -5957,7 +5958,9 @@ inherit_event(struct perf_event *parent_event,
 	/*
 	 * Link it up in the child's context:
 	 */
+	raw_spin_lock_irqsave(&child_ctx->lock, flags);
 	add_event_to_ctx(child_event, child_ctx);
+	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	/*
 	 * Get a reference to the parent filp - we will fput it
-- 
cgit v1.2.3-70-g09d2


From e5f4d3394a52ac351f1a479fe136d92fa5228eff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 10 Sep 2010 17:38:06 +0200
Subject: perf: Fix perf_init_event()

We ought to return -ENOENT when non of the registered PMUs
recognise the requested event.

This fixes a boot crash that occurs if no PMU is available
but the NMI watchdog tries to register an event.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f395fb4d9b7..f29b52576ec 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5236,12 +5236,15 @@ struct pmu *perf_init_event(struct perf_event *event)
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		int ret = pmu->event_init(event);
 		if (!ret)
-			break;
+			goto unlock;
+
 		if (ret != -ENOENT) {
 			pmu = ERR_PTR(ret);
-			break;
+			goto unlock;
 		}
 	}
+	pmu = ERR_PTR(-ENOENT);
+unlock:
 	srcu_read_unlock(&pmus_srcu, idx);
 
 	return pmu;
-- 
cgit v1.2.3-70-g09d2


From cde8e88498c8de69271fcb6d4dd974979368fa67 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 13 Sep 2010 11:06:55 +0200
Subject: perf: Sanitize the RCU logic

Simplify things and simply synchronize against two RCU variants for
PMU unregister -- we don't care about performance, its module unload
if anything.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f29b52576ec..bc46bff6962 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3810,7 +3810,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 	struct pmu *pmu;
 	int ctxn;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_task_ctx(&cpuctx->ctx, task_event);
@@ -3825,7 +3825,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 		if (ctx)
 			perf_event_task_ctx(ctx, task_event);
 	}
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 }
 
 static void perf_event_task(struct task_struct *task,
@@ -3943,7 +3943,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
@@ -3956,7 +3956,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 		if (ctx)
 			perf_event_comm_ctx(ctx, comm_event);
 	}
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 }
 
 void perf_event_comm(struct task_struct *task)
@@ -4126,7 +4126,7 @@ got_name:
 
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
-	rcu_read_lock_sched();
+	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
@@ -4142,7 +4142,7 @@ got_name:
 					vma->vm_flags & VM_EXEC);
 		}
 	}
-	rcu_read_unlock_sched();
+	rcu_read_unlock();
 
 	kfree(buf);
 }
@@ -5218,10 +5218,11 @@ void perf_pmu_unregister(struct pmu *pmu)
 	mutex_unlock(&pmus_lock);
 
 	/*
-	 * We use the pmu list either under SRCU or preempt_disable,
-	 * synchronize_srcu() implies synchronize_sched() so we're good.
+	 * We dereference the pmu list under both SRCU and regular RCU, so
+	 * synchronize against both of those.
 	 */
 	synchronize_srcu(&pmus_srcu);
+	synchronize_rcu();
 
 	free_percpu(pmu->pmu_disable_count);
 	free_pmu_context(pmu->pmu_cpu_context);
-- 
cgit v1.2.3-70-g09d2


From 0c67b40872326a5340cab51d79a192a5fbaeb484 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 13 Sep 2010 11:15:58 +0200
Subject: perf: Fix free_event()

With the context rework stuff we can actually end up freeing an event
before it gets attached to a context.

Reported-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index bc46bff6962..440f9ca067b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2150,7 +2150,9 @@ static void free_event(struct perf_event *event)
 	if (event->destroy)
 		event->destroy(event);
 
-	put_ctx(event->ctx);
+	if (event->ctx)
+		put_ctx(event->ctx);
+
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 2ebd4ffb6d0cb877787b1e42be8485820158857e Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Mon, 13 Sep 2010 13:01:19 -0700
Subject: perf events: Split out task search into helper

Split out the code which searches for non-exiting tasks into its own
helper. Creating this helper not only makes the code slightly more
readable it prepares to move the search out of find_get_context() in
a subsequent commit.

Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robin Green <greenrd@greenrd.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
LKML-Reference: <561205417b450b8a4bf7488374541d64b4690431.1284407762.git.matthltc@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 63 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 23 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 440f9ca067b..3f5309db72f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2015,6 +2015,43 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
 	return ctx;
 }
 
+static struct task_struct *
+find_lively_task_by_vpid(pid_t vpid)
+{
+	struct task_struct *task;
+	int err;
+
+	rcu_read_lock();
+	if (!vpid)
+		task = current;
+	else
+		task = find_task_by_vpid(vpid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	/*
+	 * Can't attach events to a dying task.
+	 */
+	err = -ESRCH;
+	if (task->flags & PF_EXITING)
+		goto errout;
+
+	/* Reuse ptrace permission checks for now. */
+	err = -EACCES;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto errout;
+
+	return task;
+errout:
+	put_task_struct(task);
+	return ERR_PTR(err);
+
+}
+
 static struct perf_event_context *
 find_get_context(struct pmu *pmu, pid_t pid, int cpu)
 {
@@ -2047,29 +2084,9 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu)
 		return ctx;
 	}
 
-	rcu_read_lock();
-	if (!pid)
-		task = current;
-	else
-		task = find_task_by_vpid(pid);
-	if (task)
-		get_task_struct(task);
-	rcu_read_unlock();
-
-	if (!task)
-		return ERR_PTR(-ESRCH);
-
-	/*
-	 * Can't attach events to a dying task.
-	 */
-	err = -ESRCH;
-	if (task->flags & PF_EXITING)
-		goto errout;
-
-	/* Reuse ptrace permission checks for now. */
-	err = -EACCES;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto errout;
+	task = find_lively_task_by_vpid(pid);
+	if (IS_ERR(task))
+		return (void*)task;
 
 	err = -EINVAL;
 	ctxn = pmu->task_ctx_nr;
-- 
cgit v1.2.3-70-g09d2


From 38a81da2205f94e8a2a834b51a6b99c91fc7c2e8 Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Mon, 13 Sep 2010 13:01:20 -0700
Subject: perf events: Clean up pid passing

The kernel perf event creation path shouldn't use find_task_by_vpid()
because a vpid exists in a specific namespace. find_task_by_vpid() uses
current's pid namespace which isn't always the correct namespace to use
for the vpid in all the places perf_event_create_kernel_counter() (and
thus find_get_context()) is called.

The goal is to clean up pid namespace handling and prevent bugs like:

	https://bugzilla.kernel.org/show_bug.cgi?id=17281

Instead of using pids switch find_get_context() to use task struct
pointers directly. The syscall is responsible for resolving the pid to
a task struct. This moves the pid namespace resolution into the syscall
much like every other syscall that takes pid parameters.

Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robin Green <greenrd@greenrd.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
LKML-Reference: <a134e5e392ab0204961fd1a62c84a222bf5874a9.1284407763.git.matthltc@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/oprofile/common.c |  2 +-
 include/linux/perf_event.h |  2 +-
 kernel/hw_breakpoint.c     |  5 ++---
 kernel/perf_event.c        | 21 ++++++++++-----------
 kernel/watchdog.c          |  2 +-
 5 files changed, 15 insertions(+), 17 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c
index 0691176899f..aad63e611b3 100644
--- a/arch/arm/oprofile/common.c
+++ b/arch/arm/oprofile/common.c
@@ -96,7 +96,7 @@ static int op_create_counter(int cpu, int event)
 		return ret;
 
 	pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
-						  cpu, -1,
+						  cpu, NULL,
 						  op_overflow_handler);
 
 	if (IS_ERR(pevent)) {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 93bf53aa50e..39d8860b268 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -902,7 +902,7 @@ extern int perf_event_release_kernel(struct perf_event *event);
 extern struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
-				pid_t pid,
+				struct task_struct *task,
 				perf_overflow_handler_t callback);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 6122f02cfed..3b714e839c1 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_overflow_handler_t triggered,
 			    struct task_struct *tsk)
 {
-	return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
-						triggered);
+	return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		pevent = per_cpu_ptr(cpu_events, cpu);
-		bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+		bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
 
 		*pevent = bp;
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3f5309db72f..86f394e15d5 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2053,15 +2053,14 @@ errout:
 }
 
 static struct perf_event_context *
-find_get_context(struct pmu *pmu, pid_t pid, int cpu)
+find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 {
 	struct perf_event_context *ctx;
 	struct perf_cpu_context *cpuctx;
-	struct task_struct *task;
 	unsigned long flags;
 	int ctxn, err;
 
-	if (pid == -1 && cpu != -1) {
+	if (!task && cpu != -1) {
 		/* Must be root to operate on a CPU event: */
 		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
@@ -2084,10 +2083,6 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu)
 		return ctx;
 	}
 
-	task = find_lively_task_by_vpid(pid);
-	if (IS_ERR(task))
-		return (void*)task;
-
 	err = -EINVAL;
 	ctxn = pmu->task_ctx_nr;
 	if (ctxn < 0)
@@ -5527,6 +5522,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	struct perf_event_context *ctx;
 	struct file *event_file = NULL;
 	struct file *group_file = NULL;
+	struct task_struct *task = NULL;
 	struct pmu *pmu;
 	int event_fd;
 	int fput_needed = 0;
@@ -5581,10 +5577,13 @@ SYSCALL_DEFINE5(perf_event_open,
 	if ((pmu->task_ctx_nr == perf_sw_context) && group_leader)
 		pmu = group_leader->pmu;
 
+	if (pid != -1)
+		task = find_lively_task_by_vpid(pid);
+
 	/*
 	 * Get the target context (task or percpu):
 	 */
-	ctx = find_get_context(pmu, pid, cpu);
+	ctx = find_get_context(pmu, task, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto err_group_fd;
@@ -5666,11 +5665,11 @@ err_fd:
  *
  * @attr: attributes of the counter to create
  * @cpu: cpu in which the counter is bound
- * @pid: task to profile
+ * @task: task to profile (NULL for percpu)
  */
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-				 pid_t pid,
+				 struct task_struct *task,
 				 perf_overflow_handler_t overflow_handler)
 {
 	struct perf_event_context *ctx;
@@ -5687,7 +5686,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		goto err;
 	}
 
-	ctx = find_get_context(event->pmu, pid, cpu);
+	ctx = find_get_context(event->pmu, task, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto err_free;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 89eadbb9cef..dc8e16824b5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -358,7 +358,7 @@ static int watchdog_nmi_enable(int cpu)
 	/* Try to register using hardware perf events */
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period();
-	event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
+	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
 	if (!IS_ERR(event)) {
 		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
 		goto out_save;
-- 
cgit v1.2.3-70-g09d2


From d14b12d7adbf214f33eb59f800b5c3d5ed9268e8 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Fri, 17 Sep 2010 11:28:47 +0200
Subject: perf_events: Fix broken event grouping

Events were not grouped anymore. The reason was that in
perf_event_open(), the field event->group_leader was
initialized before the function looked up the group_fd
to find the event leader. This patch fixes this by
reordering the code correctly.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <20100917093009.360420946@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 86f394e15d5..ce95617f5d2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5550,17 +5550,11 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (event_fd < 0)
 		return event_fd;
 
-	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
-	if (IS_ERR(event)) {
-		err = PTR_ERR(event);
-		goto err_fd;
-	}
-
 	if (group_fd != -1) {
 		group_leader = perf_fget_light(group_fd, &fput_needed);
 		if (IS_ERR(group_leader)) {
 			err = PTR_ERR(group_leader);
-			goto err_alloc;
+			goto err_fd;
 		}
 		group_file = group_leader->filp;
 		if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5569,6 +5563,12 @@ SYSCALL_DEFINE5(perf_event_open,
 			group_leader = NULL;
 	}
 
+	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
+	if (IS_ERR(event)) {
+		err = PTR_ERR(event);
+		goto err_fd;
+	}
+
 	/*
 	 * Special case software events and allow them to be part of
 	 * any hardware group.
@@ -5653,7 +5653,6 @@ err_context:
 	put_ctx(ctx);
 err_group_fd:
 	fput_light(group_file, fput_needed);
-err_alloc:
 	free_event(event);
 err_fd:
 	put_unused_fd(event_fd);
-- 
cgit v1.2.3-70-g09d2


From b04243ef7006cda301819f54ee7ce0a3632489e3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 17 Sep 2010 11:28:48 +0200
Subject: perf: Complete software pmu grouping

Aside from allowing software events into a !software group,
allow adding !software events to pure software groups.

Once we've moved the software group and attached the first
!software event, the group will no longer be a pure software
group and hence no longer be eligible for movement, at which
point the straight ctx comparison is correct again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20100917093009.410784731@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  6 +++++
 kernel/perf_event.c        | 65 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 66 insertions(+), 5 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 39d8860b268..165287fd2cc 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -804,12 +804,18 @@ struct perf_event {
 #endif /* CONFIG_PERF_EVENTS */
 };
 
+enum perf_event_context_type {
+	task_context,
+	cpu_context,
+};
+
 /**
  * struct perf_event_context - event context structure
  *
  * Used as a container for task events and CPU events as well:
  */
 struct perf_event_context {
+	enum perf_event_context_type	type;
 	struct pmu			*pmu;
 	/*
 	 * Protect the states of the events in the list,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ce95617f5d2..6d7eef5f3c4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5184,6 +5184,7 @@ int perf_pmu_register(struct pmu *pmu)
 
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		__perf_event_init_context(&cpuctx->ctx);
+		cpuctx->ctx.type = cpu_context;
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->timer_interval = TICK_NSEC;
 		hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -5517,7 +5518,8 @@ SYSCALL_DEFINE5(perf_event_open,
 		struct perf_event_attr __user *, attr_uptr,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
-	struct perf_event *event, *group_leader = NULL, *output_event = NULL;
+	struct perf_event *group_leader = NULL, *output_event = NULL;
+	struct perf_event *event, *sibling;
 	struct perf_event_attr attr;
 	struct perf_event_context *ctx;
 	struct file *event_file = NULL;
@@ -5525,6 +5527,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	struct task_struct *task = NULL;
 	struct pmu *pmu;
 	int event_fd;
+	int move_group = 0;
 	int fput_needed = 0;
 	int err;
 
@@ -5574,8 +5577,29 @@ SYSCALL_DEFINE5(perf_event_open,
 	 * any hardware group.
 	 */
 	pmu = event->pmu;
-	if ((pmu->task_ctx_nr == perf_sw_context) && group_leader)
-		pmu = group_leader->pmu;
+
+	if (group_leader &&
+	    (is_software_event(event) != is_software_event(group_leader))) {
+		if (is_software_event(event)) {
+			/*
+			 * If event and group_leader are not both a software
+			 * event, and event is, then group leader is not.
+			 *
+			 * Allow the addition of software events to !software
+			 * groups, this is safe because software events never
+			 * fail to schedule.
+			 */
+			pmu = group_leader->pmu;
+		} else if (is_software_event(group_leader) &&
+			   (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+			/*
+			 * In case the group is a pure software group, and we
+			 * try to add a hardware event, move the whole group to
+			 * the hardware context.
+			 */
+			move_group = 1;
+		}
+	}
 
 	if (pid != -1)
 		task = find_lively_task_by_vpid(pid);
@@ -5605,8 +5629,14 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:
 		 */
-		if (group_leader->ctx != ctx)
-			goto err_context;
+		if (move_group) {
+			if (group_leader->ctx->type != ctx->type)
+				goto err_context;
+		} else {
+			if (group_leader->ctx != ctx)
+				goto err_context;
+		}
+
 		/*
 		 * Only a group leader can be exclusive or pinned
 		 */
@@ -5626,9 +5656,34 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_context;
 	}
 
+	if (move_group) {
+		struct perf_event_context *gctx = group_leader->ctx;
+
+		mutex_lock(&gctx->mutex);
+		perf_event_remove_from_context(group_leader);
+		list_for_each_entry(sibling, &group_leader->sibling_list,
+				    group_entry) {
+			perf_event_remove_from_context(sibling);
+			put_ctx(gctx);
+		}
+		mutex_unlock(&gctx->mutex);
+		put_ctx(gctx);
+	}
+
 	event->filp = event_file;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
+
+	if (move_group) {
+		perf_install_in_context(ctx, group_leader, cpu);
+		get_ctx(ctx);
+		list_for_each_entry(sibling, &group_leader->sibling_list,
+				    group_entry) {
+			perf_install_in_context(ctx, sibling, cpu);
+			get_ctx(ctx);
+		}
+	}
+
 	perf_install_in_context(ctx, event, cpu);
 	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
-- 
cgit v1.2.3-70-g09d2


From 917bdd1c9b7b0f4c22f2504c2f0c1074c8ab9df7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 17 Sep 2010 11:28:49 +0200
Subject: perf: Fix perf_event_exit_cpu_context()

Use the right cpu-context.. spotted by preempt warning on
hot-unplug

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <20100917093009.461794357@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6d7eef5f3c4..27332e5f51a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -6269,14 +6269,13 @@ static void perf_event_exit_cpu_context(int cpu)
 
 	idx = srcu_read_lock(&pmus_srcu);
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		ctx = &this_cpu_ptr(pmu->pmu_cpu_context)->ctx;
+		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
 
 		mutex_lock(&ctx->mutex);
 		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
 		mutex_unlock(&ctx->mutex);
 	}
 	srcu_read_unlock(&pmus_srcu, idx);
-
 }
 
 static void perf_event_exit_cpu(int cpu)
-- 
cgit v1.2.3-70-g09d2


From e9d2b064149ff7ef4acbc65a1b9374ac8b218d3e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 17 Sep 2010 11:28:50 +0200
Subject: perf: Undo the per cpu-context timer stuff

Revert the timer per cpu-context timers because of unfortunate
nohz interaction. Fixing that would have been somewhat ugly, so
go back to driving things from the regular tick. Provide a
jiffies interval feature for people who want slower rotations.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <20100917093009.519845633@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  6 ++--
 kernel/perf_event.c        | 79 ++++++++++++++++++++++++++++------------------
 kernel/sched.c             |  2 ++
 3 files changed, 55 insertions(+), 32 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 165287fd2cc..61b1e2d760f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -870,8 +870,8 @@ struct perf_cpu_context {
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
 	int				exclusive;
-	u64				timer_interval;
-	struct hrtimer			timer;
+	struct list_head		rotation_list;
+	int				jiffies_interval;
 };
 
 struct perf_output_handle {
@@ -1065,6 +1065,7 @@ extern int perf_swevent_get_recursion_context(void);
 extern void perf_swevent_put_recursion_context(int rctx);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
+extern void perf_event_task_tick(void);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task)			{ }
@@ -1099,6 +1100,7 @@ static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
 static inline void perf_swevent_put_recursion_context(int rctx)		{ }
 static inline void perf_event_enable(struct perf_event *event)		{ }
 static inline void perf_event_disable(struct perf_event *event)		{ }
+static inline void perf_event_task_tick(void)				{ }
 #endif
 
 #define perf_output_put(handle, x) \
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 27332e5f51a..baae1367e94 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -77,23 +77,22 @@ void perf_pmu_enable(struct pmu *pmu)
 		pmu->pmu_enable(pmu);
 }
 
+static DEFINE_PER_CPU(struct list_head, rotation_list);
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
 static void perf_pmu_rotate_start(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+	struct list_head *head = &__get_cpu_var(rotation_list);
 
-	if (hrtimer_active(&cpuctx->timer))
-		return;
+	WARN_ON(!irqs_disabled());
 
-	__hrtimer_start_range_ns(&cpuctx->timer,
-			ns_to_ktime(cpuctx->timer_interval), 0,
-			HRTIMER_MODE_REL_PINNED, 0);
-}
-
-static void perf_pmu_rotate_stop(struct pmu *pmu)
-{
-	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-	hrtimer_cancel(&cpuctx->timer);
+	if (list_empty(&cpuctx->rotation_list))
+		list_add(&cpuctx->rotation_list, head);
 }
 
 static void get_ctx(struct perf_event_context *ctx)
@@ -1607,36 +1606,33 @@ static void rotate_ctx(struct perf_event_context *ctx)
 }
 
 /*
- * Cannot race with ->pmu_rotate_start() because this is ran from hardirq
- * context, and ->pmu_rotate_start() is called with irqs disabled (both are
- * cpu affine, so there are no SMP races).
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
  */
-static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
+static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
-	enum hrtimer_restart restart = HRTIMER_NORESTART;
-	struct perf_cpu_context *cpuctx;
+	u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
 	struct perf_event_context *ctx = NULL;
-	int rotate = 0;
-
-	cpuctx = container_of(timer, struct perf_cpu_context, timer);
+	int rotate = 0, remove = 1;
 
 	if (cpuctx->ctx.nr_events) {
-		restart = HRTIMER_RESTART;
+		remove = 0;
 		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
 			rotate = 1;
 	}
 
 	ctx = cpuctx->task_ctx;
 	if (ctx && ctx->nr_events) {
-		restart = HRTIMER_RESTART;
+		remove = 0;
 		if (ctx->nr_events != ctx->nr_active)
 			rotate = 1;
 	}
 
 	perf_pmu_disable(cpuctx->ctx.pmu);
-	perf_ctx_adjust_freq(&cpuctx->ctx, cpuctx->timer_interval);
+	perf_ctx_adjust_freq(&cpuctx->ctx, interval);
 	if (ctx)
-		perf_ctx_adjust_freq(ctx, cpuctx->timer_interval);
+		perf_ctx_adjust_freq(ctx, interval);
 
 	if (!rotate)
 		goto done;
@@ -1654,10 +1650,24 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
 done:
+	if (remove)
+		list_del_init(&cpuctx->rotation_list);
+
 	perf_pmu_enable(cpuctx->ctx.pmu);
-	hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval));
+}
+
+void perf_event_task_tick(void)
+{
+	struct list_head *head = &__get_cpu_var(rotation_list);
+	struct perf_cpu_context *cpuctx, *tmp;
 
-	return restart;
+	WARN_ON(!irqs_disabled());
+
+	list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+		if (cpuctx->jiffies_interval == 1 ||
+				!(jiffies % cpuctx->jiffies_interval))
+			perf_rotate_context(cpuctx);
+	}
 }
 
 static int event_enable_on_exec(struct perf_event *event,
@@ -5186,9 +5196,8 @@ int perf_pmu_register(struct pmu *pmu)
 		__perf_event_init_context(&cpuctx->ctx);
 		cpuctx->ctx.type = cpu_context;
 		cpuctx->ctx.pmu = pmu;
-		cpuctx->timer_interval = TICK_NSEC;
-		hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		cpuctx->timer.function = perf_event_context_tick;
+		cpuctx->jiffies_interval = 1;
+		INIT_LIST_HEAD(&cpuctx->rotation_list);
 	}
 
 got_cpu_context:
@@ -6229,6 +6238,7 @@ static void __init perf_event_init_all_cpus(void)
 	for_each_possible_cpu(cpu) {
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
+		INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
 	}
 }
 
@@ -6248,6 +6258,15 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+static void perf_pmu_rotate_stop(struct pmu *pmu)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+	WARN_ON(!irqs_disabled());
+
+	list_del_init(&cpuctx->rotation_list);
+}
+
 static void __perf_event_exit_context(void *__info)
 {
 	struct perf_event_context *ctx = __info;
diff --git a/kernel/sched.c b/kernel/sched.c
index 1c3ea7a55b7..794819eab9c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3584,6 +3584,8 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	raw_spin_unlock(&rq->lock);
 
+	perf_event_task_tick();
+
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
-- 
cgit v1.2.3-70-g09d2


From 41945f6ccf1e86f87fddf6b32db9cf431c05fb54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 16 Sep 2010 19:17:24 +0200
Subject: perf: Avoid RCU vs preemption assumptions

The per-pmu per-cpu context patch converted things from
get_cpu_var() to this_cpu_ptr(), but that only works if
rcu_read_lock() actually disables preemption, and since
there is no such guarantee, we need to fix that.

Use the newly introduced {get,put}_cpu_ptr().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tejun Heo <tj@kernel.org>
LKML-Reference: <20100917093009.308453028@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index baae1367e94..c16158c77df 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3836,18 +3836,20 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_task_ctx(&cpuctx->ctx, task_event);
 
 		ctx = task_event->task_ctx;
 		if (!ctx) {
 			ctxn = pmu->task_ctx_nr;
 			if (ctxn < 0)
-				continue;
+				goto next;
 			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		}
 		if (ctx)
 			perf_event_task_ctx(ctx, task_event);
+next:
+		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
 	rcu_read_unlock();
 }
@@ -3969,16 +3971,18 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 
 		ctxn = pmu->task_ctx_nr;
 		if (ctxn < 0)
-			continue;
+			goto next;
 
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
 			perf_event_comm_ctx(ctx, comm_event);
+next:
+		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
 	rcu_read_unlock();
 }
@@ -4152,19 +4156,21 @@ got_name:
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
 					vma->vm_flags & VM_EXEC);
 
 		ctxn = pmu->task_ctx_nr;
 		if (ctxn < 0)
-			continue;
+			goto next;
 
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx) {
 			perf_event_mmap_ctx(ctx, mmap_event,
 					vma->vm_flags & VM_EXEC);
 		}
+next:
+		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
 	rcu_read_unlock();
 
-- 
cgit v1.2.3-70-g09d2


From 540804b5c52065a87d826f7714b18a3ec0b269f9 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 4 Oct 2010 12:00:02 +0200
Subject: perf_events: Fix invalid pointer when pid is invalid

This patch fixes an error in perf_event_open() when the pid
provided by the user is invalid. find_lively_task_by_vpid()
does not return NULL on error but an error code. Without the
fix the error code was silently passed to find_get_context()
which would eventually cause a invalid pointer dereference.

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: peterz@infradead.org
Cc: paulus@samba.org
Cc: davem@davemloft.net
Cc: fweisbec@gmail.com
Cc: perfmon2-devel@lists.sf.net
Cc: eranian@gmail.com
Cc: robert.richter@amd.com
LKML-Reference: <4ca9a5d1.e8e9d80a.3dbb.ffff8f2e@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index c16158c77df..64507eaa2d9 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5616,8 +5616,13 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
-	if (pid != -1)
+	if (pid != -1) {
 		task = find_lively_task_by_vpid(pid);
+		if (IS_ERR(task)) {
+			err = PTR_ERR(task);
+			goto err_group_fd;
+		}
+	}
 
 	/*
 	 * Get the target context (task or percpu):
-- 
cgit v1.2.3-70-g09d2


From 84c7991059c9c4530cc911137c5bf508a41ed129 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@console-pimps.org>
Date: Sun, 3 Oct 2010 21:41:13 +0100
Subject: perf: New helper function for pmu name

Introduce perf_pmu_name() helper function that returns the name of the
pmu. This gives us a generic way to get the name of a pmu regardless of
how an architecture identifies it internally.

Signed-off-by: Matt Fleming <matt@console-pimps.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/sh/kernel/perf_event.c | 9 +++++++++
 include/linux/perf_event.h  | 1 +
 kernel/perf_event.c         | 5 +++++
 3 files changed, 15 insertions(+)

(limited to 'kernel/perf_event.c')

diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 2cb9ad59d4b..55fe89bbdfe 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -59,6 +59,15 @@ static inline int sh_pmu_initialized(void)
 	return !!sh_pmu;
 }
 
+const char *perf_pmu_name(void)
+{
+	if (!sh_pmu)
+		return NULL;
+
+	return sh_pmu->name;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_name);
+
 int perf_num_counters(void)
 {
 	if (!sh_pmu)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a021924718..33f08dafda2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -850,6 +850,7 @@ extern int perf_max_events;
 extern const struct pmu *hw_perf_event_init(struct perf_event *event);
 
 extern int perf_num_counters(void);
+extern const char *perf_pmu_name(void);
 extern void perf_event_task_sched_in(struct task_struct *task);
 extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
 extern void perf_event_task_tick(struct task_struct *task);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 403d1804b19..e2534691db0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -85,6 +85,11 @@ void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak perf_event_print_debug(void)	{ }
 
+extern __weak const char *perf_pmu_name(void)
+{
+	return "pmu";
+}
+
 static DEFINE_PER_CPU(int, perf_disable_count);
 
 void perf_disable(void)
-- 
cgit v1.2.3-70-g09d2


From ad0cf3478de8677f720ee06393b3147819568d6a Mon Sep 17 00:00:00 2001
From: John Blackwood <john.blackwood@ccur.com>
Date: Tue, 28 Sep 2010 18:03:11 -0400
Subject: perf: Fix incorrect copy_from_user() usage

perf events: repair incorrect use of copy_from_user

This makes the perf_event_period() return 0 instead of
-EFAULT on success.

Signed-off-by: John Blackwood<john.blackwood@ccur.com>
Signed-off-by: Joe Korty <joe.korty@ccur.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100928220311.GA18145@tsunami.ccur.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index db5b5606468..b98bed3d818 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2202,15 +2202,13 @@ static void perf_event_for_each(struct perf_event *event,
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
 	struct perf_event_context *ctx = event->ctx;
-	unsigned long size;
 	int ret = 0;
 	u64 value;
 
 	if (!event->attr.sample_period)
 		return -EINVAL;
 
-	size = copy_from_user(&value, arg, sizeof(value));
-	if (size != sizeof(value))
+	if (copy_from_user(&value, arg, sizeof(value)))
 		return -EFAULT;
 
 	if (!value)
-- 
cgit v1.2.3-70-g09d2


From c530ccd9a1864a44a7ff35826681229ce9f2357a Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Fri, 15 Oct 2010 15:26:01 +0200
Subject: perf_events: Fix bogus context time tracking

You can only call update_context_time() when the context
is active, i.e., the thread it is attached to is still running.

However, perf_event_read() can be called even when the context
is inactive, e.g., user read() the counters. The call to
update_context_time() must be conditioned on the status of
the context, otherwise, bogus time_enabled, time_running may
be returned. Here is an example on AMD64. The task program
is an example from libpfm4. The -p prints deltas every 1s.

$ task -p -e cpu_clk_unhalted sleep 5
    2,266,610 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982)
	    0 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982)
	    0 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982)
	    0 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982)
	    0 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982)
5,242,358,071 cpu_clk_unhalted (99.95% scaling, ena=5,000,359,984, run=2,319,270)

Whereas if you don't read deltas, e.g., no call to perf_event_read() until
the process terminates:

$ task -e cpu_clk_unhalted sleep 5
    2,497,783 cpu_clk_unhalted (0.00% scaling, ena=2,376,899, run=2,376,899)

Notice that time_enable, time_running are bogus in the first example
causing bogus scaling.

This patch fixes the problem, by conditionally calling update_context_time()
in perf_event_read().

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org
LKML-Reference: <4cb856dc.51edd80a.5ae0.38fb@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 1ec3916ffef..e7eeba1794f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1780,7 +1780,13 @@ static u64 perf_event_read(struct perf_event *event)
 		unsigned long flags;
 
 		raw_spin_lock_irqsave(&ctx->lock, flags);
-		update_context_time(ctx);
+		/*
+		 * may read while context is not active
+		 * (e.g., thread is blocked), in that case
+		 * we cannot update context time
+		 */
+		if (ctx->is_active)
+			update_context_time(ctx);
 		update_event_times(event);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
-- 
cgit v1.2.3-70-g09d2


From 8e5fc1a7320baf6076391607515dceb61319b36a Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Fri, 15 Oct 2010 16:54:01 +0200
Subject: perf_events: Fix transaction recovery in group_sched_in()

The group_sched_in() function uses a transactional approach to schedule
a group of events. In a group, either all events can be scheduled or
none are. To schedule each event in, the function calls event_sched_in().
In case of error, event_sched_out() is called on each event in the group.

The problem is that event_sched_out() does not completely cancel the
effects of event_sched_in(). Furthermore event_sched_out() changes the
state of the event as if it had run which is not true is this particular
case.

Those inconsistencies impact time tracking fields and may lead to events
in a group not all reporting the same time_enabled and time_running values.
This is demonstrated with the example below:

$ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5
1946101 unhalted_core_cycles (32.85% scaling, ena=829181, run=556827)
  11423 baclears (32.85% scaling, ena=829181, run=556827)
   7671 baclears (0.00% scaling, ena=556827, run=556827)

2250443 unhalted_core_cycles (57.83% scaling, ena=962822, run=405995)
  11705 baclears (57.83% scaling, ena=962822, run=405995)
  11705 baclears (57.83% scaling, ena=962822, run=405995)

Notice that in the first group, the last baclears event does not
report the same timings as its siblings.

This issue comes from the fact that tstamp_stopped is updated
by event_sched_out() as if the event had actually run.

To solve the issue, we must ensure that, in case of error, there is
no change in the event state whatsoever. That means timings must
remain as they were when entering group_sched_in().

To do this we defer updating tstamp_running until we know the
transaction succeeded. Therefore, we have split event_sched_in()
in two parts separating the update to tstamp_running.

Similarly, in case of error, we do not want to update tstamp_stopped.
Therefore, we have split event_sched_out() in two parts separating
the update to tstamp_stopped.

With this patch, we now get the following output:

$ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5
2492050 unhalted_core_cycles (71.75% scaling, ena=1093330, run=308841)
  11243 baclears (71.75% scaling, ena=1093330, run=308841)
  11243 baclears (71.75% scaling, ena=1093330, run=308841)

1852746 unhalted_core_cycles (0.00% scaling, ena=784489, run=784489)
   9253 baclears (0.00% scaling, ena=784489, run=784489)
   9253 baclears (0.00% scaling, ena=784489, run=784489)

Note that the uneven timing between groups is a side effect of
the process spending most of its time sleeping, i.e., not enough
event rotations (but that's a separate issue).

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4cb86b4c.41e9d80a.44e9.3e19@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 76 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 13 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e7eeba1794f..634f86a4b2f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -412,8 +412,8 @@ event_filter_match(struct perf_event *event)
 	return event->cpu == -1 || event->cpu == smp_processor_id();
 }
 
-static void
-event_sched_out(struct perf_event *event,
+static int
+__event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
@@ -432,14 +432,13 @@ event_sched_out(struct perf_event *event,
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
-		return;
+		return 0;
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = ctx->time;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
@@ -448,6 +447,19 @@ event_sched_out(struct perf_event *event,
 	ctx->nr_active--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
+	return 1;
+}
+
+static void
+event_sched_out(struct perf_event *event,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_event_context *ctx)
+{
+	int ret;
+
+	ret = __event_sched_out(event, cpuctx, ctx);
+	if (ret)
+		event->tstamp_stopped = ctx->time;
 }
 
 static void
@@ -647,7 +659,7 @@ retry:
 }
 
 static int
-event_sched_in(struct perf_event *event,
+__event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
@@ -667,8 +679,6 @@ event_sched_in(struct perf_event *event,
 		return -EAGAIN;
 	}
 
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -679,6 +689,35 @@ event_sched_in(struct perf_event *event,
 	return 0;
 }
 
+static inline int
+event_sched_in(struct perf_event *event,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_event_context *ctx)
+{
+	int ret = __event_sched_in(event, cpuctx, ctx);
+	if (ret)
+		return ret;
+	event->tstamp_running += ctx->time - event->tstamp_stopped;
+	return 0;
+}
+
+static void
+group_commit_event_sched_in(struct perf_event *group_event,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+	u64 now = ctx->time;
+
+	group_event->tstamp_running += now - group_event->tstamp_stopped;
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+		event->tstamp_running += now - event->tstamp_stopped;
+	}
+}
+
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
@@ -692,7 +731,13 @@ group_sched_in(struct perf_event *group_event,
 
 	pmu->start_txn(pmu);
 
-	if (event_sched_in(group_event, cpuctx, ctx)) {
+	/*
+	 * use __event_sched_in() to delay updating tstamp_running
+	 * until the transaction is committed. In case of failure
+	 * we will keep an unmodified tstamp_running which is a
+	 * requirement to get correct timing information
+	 */
+	if (__event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
@@ -701,26 +746,31 @@ group_sched_in(struct perf_event *group_event,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (event_sched_in(event, cpuctx, ctx)) {
+		if (__event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
 	}
 
-	if (!pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu)) {
+		/* commit tstamp_running */
+		group_commit_event_sched_in(group_event, cpuctx, ctx);
 		return 0;
-
+	}
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
+	 *
+	 * use __event_sched_out() to avoid updating tstamp_stopped
+	 * because the event never actually ran
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
 			break;
-		event_sched_out(event, cpuctx, ctx);
+		__event_sched_out(event, cpuctx, ctx);
 	}
-	event_sched_out(group_event, cpuctx, ctx);
+	__event_sched_out(group_event, cpuctx, ctx);
 
 	pmu->cancel_txn(pmu);
 
-- 
cgit v1.2.3-70-g09d2


From e360adbe29241a0194e10e20595360dd7b98a2b3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Oct 2010 14:01:34 +0800
Subject: irq_work: Add generic hardirq context callbacks

Provide a mechanism that allows running code in IRQ context. It is
most useful for NMI code that needs to interact with the rest of the
system -- like wakeup a task to drain buffers.

Perf currently has such a mechanism, so extract that and provide it as
a generic feature, independent of perf so that others may also
benefit.

The IRQ context callback is generated through self-IPIs where
possible, or on architectures like powerpc the decrementer (the
built-in timer facility) is set to generate an interrupt immediately.

Architectures that don't have anything like this get to do with a
callback from the timer tick. These architectures can call
irq_work_run() at the tail of any IRQ handlers that might enqueue such
work (like the perf IRQ handler) to avoid undue latencies in
processing the work.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Kyle McMartin <kyle@mcmartin.ca>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
[ various fixes ]
Signed-off-by: Huang Ying <ying.huang@intel.com>
LKML-Reference: <1287036094.7768.291.camel@yhuang-dev>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/Kconfig                   |   1 +
 arch/alpha/include/asm/perf_event.h  |   5 --
 arch/alpha/kernel/time.c             |  30 +++----
 arch/arm/Kconfig                     |   1 +
 arch/arm/include/asm/perf_event.h    |  12 ---
 arch/arm/kernel/perf_event.c         |   8 +-
 arch/frv/Kconfig                     |   1 +
 arch/frv/lib/Makefile                |   2 +-
 arch/frv/lib/perf_event.c            |  19 ----
 arch/parisc/Kconfig                  |   1 +
 arch/parisc/include/asm/perf_event.h |   3 +-
 arch/powerpc/Kconfig                 |   1 +
 arch/powerpc/include/asm/paca.h      |   2 +-
 arch/powerpc/kernel/time.c           |  42 ++++-----
 arch/s390/Kconfig                    |   1 +
 arch/s390/include/asm/perf_event.h   |   3 +-
 arch/sh/Kconfig                      |   1 +
 arch/sh/include/asm/perf_event.h     |   7 --
 arch/sparc/Kconfig                   |   2 +
 arch/sparc/include/asm/perf_event.h  |   4 -
 arch/sparc/kernel/pcr.c              |   8 +-
 arch/x86/Kconfig                     |   1 +
 arch/x86/include/asm/entry_arch.h    |   4 +-
 arch/x86/include/asm/hardirq.h       |   2 +-
 arch/x86/include/asm/hw_irq.h        |   2 +-
 arch/x86/include/asm/irq_vectors.h   |   4 +-
 arch/x86/kernel/Makefile             |   1 +
 arch/x86/kernel/cpu/perf_event.c     |  19 ----
 arch/x86/kernel/entry_64.S           |   6 +-
 arch/x86/kernel/irq.c                |   8 +-
 arch/x86/kernel/irq_work.c           |  30 +++++++
 arch/x86/kernel/irqinit.c            |   6 +-
 include/linux/irq_work.h             |  20 +++++
 include/linux/perf_event.h           |  11 +--
 init/Kconfig                         |   8 ++
 kernel/Makefile                      |   2 +
 kernel/irq_work.c                    | 164 +++++++++++++++++++++++++++++++++++
 kernel/perf_event.c                  | 104 ++--------------------
 kernel/timer.c                       |   7 +-
 39 files changed, 311 insertions(+), 242 deletions(-)
 delete mode 100644 arch/frv/lib/perf_event.c
 create mode 100644 arch/x86/kernel/irq_work.c
 create mode 100644 include/linux/irq_work.h
 create mode 100644 kernel/irq_work.c

(limited to 'kernel/perf_event.c')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b9647bb66d1..d04ccd73af4 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -9,6 +9,7 @@ config ALPHA
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_SYSCALL_WRAPPERS
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_DMA_ATTRS
 	help
diff --git a/arch/alpha/include/asm/perf_event.h b/arch/alpha/include/asm/perf_event.h
index 4157cd3c44a..fe792ca818f 100644
--- a/arch/alpha/include/asm/perf_event.h
+++ b/arch/alpha/include/asm/perf_event.h
@@ -1,11 +1,6 @@
 #ifndef __ASM_ALPHA_PERF_EVENT_H
 #define __ASM_ALPHA_PERF_EVENT_H
 
-/* Alpha only supports software events through this interface. */
-extern void set_perf_event_pending(void);
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
 #ifdef CONFIG_PERF_EVENTS
 extern void init_hw_perf_events(void);
 #else
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 396af1799ea..0f1d8493cfc 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -41,7 +41,7 @@
 #include <linux/init.h>
 #include <linux/bcd.h>
 #include <linux/profile.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -83,25 +83,25 @@ static struct {
 
 unsigned long est_cycle_freq;
 
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
 
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
 
-#define set_perf_event_pending_flag()  __get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending()      __get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending()     __get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag()  __get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending()      __get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending()     __get_cpu_var(irq_work_pending) = 0
 
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
 {
-	set_perf_event_pending_flag();
+	set_irq_work_pending_flag();
 }
 
-#else  /* CONFIG_PERF_EVENTS */
+#else  /* CONFIG_IRQ_WORK */
 
-#define test_perf_event_pending()      0
-#define clear_perf_event_pending()
+#define test_irq_work_pending()      0
+#define clear_irq_work_pending()
 
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
 
 
 static inline __u32 rpcc(void)
@@ -191,9 +191,9 @@ irqreturn_t timer_interrupt(int irq, void *dev)
 
 	write_sequnlock(&xtime_lock);
 
-	if (test_perf_event_pending()) {
-		clear_perf_event_pending();
-		perf_event_do_pending();
+	if (test_irq_work_pending()) {
+		clear_irq_work_pending();
+		irq_work_run();
 	}
 
 #ifndef CONFIG_SMP
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 88c97bc7a6f..7c0dfccd05b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -23,6 +23,7 @@ config ARM
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_LZMA
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index b5799a3b711..c4aa4e8c6af 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -12,18 +12,6 @@
 #ifndef __ARM_PERF_EVENT_H__
 #define __ARM_PERF_EVENT_H__
 
-/*
- * NOP: on *most* (read: all supported) ARM platforms, the performance
- * counter interrupts are regular interrupts and not an NMI. This
- * means that when we receive the interrupt we can call
- * perf_event_do_pending() that handles all of the work with
- * interrupts disabled.
- */
-static inline void
-set_perf_event_pending(void)
-{
-}
-
 /* ARM performance counters start from 1 (in the cp15 accesses) so use the
  * same indexes here for consistency. */
 #define PERF_EVENT_INDEX_OFFSET 1
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 6cc6521881a..49643b1467e 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -1092,7 +1092,7 @@ armv6pmu_handle_irq(int irq_num,
 	 * platforms that can have the PMU interrupts raised as an NMI, this
 	 * will not work.
 	 */
-	perf_event_do_pending();
+	irq_work_run();
 
 	return IRQ_HANDLED;
 }
@@ -2068,7 +2068,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 	 * platforms that can have the PMU interrupts raised as an NMI, this
 	 * will not work.
 	 */
-	perf_event_do_pending();
+	irq_work_run();
 
 	return IRQ_HANDLED;
 }
@@ -2436,7 +2436,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
 			armpmu->disable(hwc, idx);
 	}
 
-	perf_event_do_pending();
+	irq_work_run();
 
 	/*
 	 * Re-enable the PMU.
@@ -2763,7 +2763,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
 			armpmu->disable(hwc, idx);
 	}
 
-	perf_event_do_pending();
+	irq_work_run();
 
 	/*
 	 * Re-enable the PMU.
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 16399bd2499..0f2417df632 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -7,6 +7,7 @@ config FRV
 	default y
 	select HAVE_IDE
 	select HAVE_ARCH_TRACEHOOK
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 
 config ZONE_DMA
diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile
index f4709756d0d..4ff2fb1e6b1 100644
--- a/arch/frv/lib/Makefile
+++ b/arch/frv/lib/Makefile
@@ -5,4 +5,4 @@
 lib-y := \
 	__ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \
 	checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \
-	outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o
+	outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o
diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c
deleted file mode 100644
index 9ac5acfd2e9..00000000000
--- a/arch/frv/lib/perf_event.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Performance event handling
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/perf_event.h>
-
-/*
- * mark the performance event as pending
- */
-void set_perf_event_pending(void)
-{
-}
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 907417d187e..79a04a9394d 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -16,6 +16,7 @@ config PARISC
 	select RTC_DRV_GENERIC
 	select INIT_ALL_POSSIBLE
 	select BUG
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select GENERIC_ATOMIC64 if !64BIT
 	help
diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h
index cc146427d8f..1e0fd8ba6c0 100644
--- a/arch/parisc/include/asm/perf_event.h
+++ b/arch/parisc/include/asm/perf_event.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_PARISC_PERF_EVENT_H
 #define __ASM_PARISC_PERF_EVENT_H
 
-/* parisc only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
+/* Empty, just to avoid compiling error */
 
 #endif /* __ASM_PARISC_PERF_EVENT_H */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 631e5a0fb6a..4b1e521d966 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
 	select HAVE_OPROFILE
 	select HAVE_SYSCALL_WRAPPERS if PPC64
 	select GENERIC_ATOMIC64 if PPC32
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 1ff6662f7fa..9b287fdd8ea 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -129,7 +129,7 @@ struct paca_struct {
 	u8 soft_enabled;		/* irq soft-enable flag */
 	u8 hard_enabled;		/* set if irqs are enabled in MSR */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
-	u8 perf_event_pending;		/* PM interrupt while soft-disabled */
+	u8 irq_work_pending;		/* IRQ_WORK interrupt while soft-disable */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 8533b3b83f5..54888eb10c3 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -53,7 +53,7 @@
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
 #include <linux/delay.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <asm/trace.h>
 
 #include <asm/io.h>
@@ -493,60 +493,60 @@ void __init iSeries_time_init_early(void)
 }
 #endif /* CONFIG_PPC_ISERIES */
 
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
 
 /*
  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  */
 #ifdef CONFIG_PPC64
-static inline unsigned long test_perf_event_pending(void)
+static inline unsigned long test_irq_work_pending(void)
 {
 	unsigned long x;
 
 	asm volatile("lbz %0,%1(13)"
 		: "=r" (x)
-		: "i" (offsetof(struct paca_struct, perf_event_pending)));
+		: "i" (offsetof(struct paca_struct, irq_work_pending)));
 	return x;
 }
 
-static inline void set_perf_event_pending_flag(void)
+static inline void set_irq_work_pending_flag(void)
 {
 	asm volatile("stb %0,%1(13)" : :
 		"r" (1),
-		"i" (offsetof(struct paca_struct, perf_event_pending)));
+		"i" (offsetof(struct paca_struct, irq_work_pending)));
 }
 
-static inline void clear_perf_event_pending(void)
+static inline void clear_irq_work_pending(void)
 {
 	asm volatile("stb %0,%1(13)" : :
 		"r" (0),
-		"i" (offsetof(struct paca_struct, perf_event_pending)));
+		"i" (offsetof(struct paca_struct, irq_work_pending)));
 }
 
 #else /* 32-bit */
 
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
 
-#define set_perf_event_pending_flag()	__get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending()	__get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending()	__get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag()	__get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending()		__get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending()	__get_cpu_var(irq_work_pending) = 0
 
 #endif /* 32 vs 64 bit */
 
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
 {
 	preempt_disable();
-	set_perf_event_pending_flag();
+	set_irq_work_pending_flag();
 	set_dec(1);
 	preempt_enable();
 }
 
-#else  /* CONFIG_PERF_EVENTS */
+#else  /* CONFIG_IRQ_WORK */
 
-#define test_perf_event_pending()	0
-#define clear_perf_event_pending()
+#define test_irq_work_pending()	0
+#define clear_irq_work_pending()
 
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
 
 /*
  * For iSeries shared processors, we have to let the hypervisor
@@ -587,9 +587,9 @@ void timer_interrupt(struct pt_regs * regs)
 
 	calculate_steal_time();
 
-	if (test_perf_event_pending()) {
-		clear_perf_event_pending();
-		perf_event_do_pending();
+	if (test_irq_work_pending()) {
+		clear_irq_work_pending();
+		irq_work_run();
 	}
 
 #ifdef CONFIG_PPC_ISERIES
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f0777a47e3a..958f0dadead 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -95,6 +95,7 @@ config S390
 	select HAVE_KVM if 64BIT
 	select HAVE_ARCH_TRACEHOOK
 	select INIT_ALL_POSSIBLE
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 3840cbe7763..a75f168d271 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -4,7 +4,6 @@
  * Copyright 2009 Martin Schwidefsky, IBM Corporation.
  */
 
-static inline void set_perf_event_pending(void) {}
-static inline void clear_perf_event_pending(void) {}
+/* Empty, just to avoid compiling error */
 
 #define PERF_EVENT_INDEX_OFFSET 0
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 35b6c3f8517..35b6879628a 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -16,6 +16,7 @@ config SUPERH
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_KERNEL_GZIP
diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h
index 3d0c9f36d15..14308bed7ea 100644
--- a/arch/sh/include/asm/perf_event.h
+++ b/arch/sh/include/asm/perf_event.h
@@ -26,11 +26,4 @@ extern int register_sh_pmu(struct sh_pmu *);
 extern int reserve_pmc_hardware(void);
 extern void release_pmc_hardware(void);
 
-static inline void set_perf_event_pending(void)
-{
-	/* Nothing to see here, move along. */
-}
-
-#define PERF_EVENT_INDEX_OFFSET	0
-
 #endif /* __ASM_SH_PERF_EVENT_H */
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 9212cd42a83..3e9d31401fb 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -26,6 +26,7 @@ config SPARC
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select RTC_CLASS
 	select RTC_DRV_M48T59
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_DMA_ATTRS
@@ -54,6 +55,7 @@ config SPARC64
 	select RTC_DRV_BQ4802
 	select RTC_DRV_SUN4V
 	select RTC_DRV_STARFIRE
+	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h
index 727af70646c..6e8bfa1786d 100644
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -1,10 +1,6 @@
 #ifndef __ASM_SPARC_PERF_EVENT_H
 #define __ASM_SPARC_PERF_EVENT_H
 
-extern void set_perf_event_pending(void);
-
-#define	PERF_EVENT_INDEX_OFFSET	0
-
 #ifdef CONFIG_PERF_EVENTS
 #include <asm/ptrace.h>
 
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c
index c4a6a50b484..b87873c0e8e 100644
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -7,7 +7,7 @@
 #include <linux/init.h>
 #include <linux/irq.h>
 
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <linux/ftrace.h>
 
 #include <asm/pil.h>
@@ -43,14 +43,14 @@ void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs)
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
-#ifdef CONFIG_PERF_EVENTS
-	perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+	irq_work_run();
 #endif
 	irq_exit();
 	set_irq_regs(old_regs);
 }
 
-void set_perf_event_pending(void)
+void arch_irq_work_raise(void)
 {
 	set_softint(1 << PIL_DEFERRED_PCR_WORK);
 }
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9815221976a..fd227d6b8d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -25,6 +25,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_PERF_EVENTS if (!M386 && !M486)
+	select HAVE_IRQ_WORK
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 8e8ec663a98..b8e96a18676 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
 #endif
 
 #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index aeab29aee61..55e4de613f0 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,7 +14,7 @@ typedef struct {
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
-	unsigned int apic_pending_irqs;
+	unsigned int apic_irq_work_irqs;
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 46c0fe05f23..3a54a1ca1a0 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
 extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
 extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
+extern void irq_work_interrupt(void);
 
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index e2ca3009255..6af0894dafb 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -114,9 +114,9 @@
 #define X86_PLATFORM_IPI_VECTOR		0xed
 
 /*
- * Performance monitoring pending work vector:
+ * IRQ work vector:
  */
-#define LOCAL_PENDING_VECTOR		0xec
+#define IRQ_WORK_VECTOR			0xec
 
 #define UV_BAU_MESSAGE			0xea
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9d3f485e5dd..7490bf8d145 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -35,6 +35,7 @@ obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e2513f26ba8..fe73c1844a9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1196,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 	return handled;
 }
 
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
-	irq_enter();
-	ack_APIC_irq();
-	inc_irq_stat(apic_pending_irqs);
-	perf_event_do_pending();
-	irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (!x86_pmu.apic || !x86_pmu_initialized())
-		return;
-
-	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
 void perf_events_lapic_init(void)
 {
 	if (!x86_pmu.apic || !x86_pmu_initialized())
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 17be5ec7cbb..c375c79065f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1023,9 +1023,9 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
-	perf_pending_interrupt smp_perf_pending_interrupt
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+	irq_work_interrupt smp_irq_work_interrupt
 #endif
 
 /*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 91fd0c70a18..44edb03fc9e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance monitoring interrupts\n");
-	seq_printf(p, "%*s: ", prec, "PND");
+	seq_printf(p, "%*s: ", prec, "IWI");
 	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
-	seq_printf(p, "  Performance pending work\n");
+		seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+	seq_printf(p, "  IRQ work interrupts\n");
 #endif
 	if (x86_platform_ipi_callback) {
 		seq_printf(p, "%*s: ", prec, "PLT");
@@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
 	sum += irq_stats(cpu)->apic_perf_irqs;
-	sum += irq_stats(cpu)->apic_pending_irqs;
+	sum += irq_stats(cpu)->apic_irq_work_irqs;
 #endif
 	if (x86_platform_ipi_callback)
 		sum += irq_stats(cpu)->x86_platform_ipis;
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 00000000000..ca8f703a1e7
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_irq_work_irqs);
+	irq_work_run();
+	irq_exit();
+}
+
+void arch_irq_work_raise(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (!cpu_has_apic)
+		return;
+
+	apic->send_IPI_self(IRQ_WORK_VECTOR);
+	apic_wait_icr_idle();
+#endif
+}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 990ae7cfc57..713969b9266 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -224,9 +224,9 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
-	/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
-	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+	/* IRQ work interrupts: */
+# ifdef CONFIG_IRQ_WORK
+	alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
 # endif
 
 #endif
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
new file mode 100644
index 00000000000..4fa09d4d0b7
--- /dev/null
+++ b/include/linux/irq_work.h
@@ -0,0 +1,20 @@
+#ifndef _LINUX_IRQ_WORK_H
+#define _LINUX_IRQ_WORK_H
+
+struct irq_work {
+	struct irq_work *next;
+	void (*func)(struct irq_work *);
+};
+
+static inline
+void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+{
+	entry->next = NULL;
+	entry->func = func;
+}
+
+bool irq_work_queue(struct irq_work *entry);
+void irq_work_run(void);
+void irq_work_sync(struct irq_work *entry);
+
+#endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a9227e98520..2ebfc9ae475 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -486,6 +486,7 @@ struct perf_guest_info_callbacks {
 #include <linux/workqueue.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
+#include <linux/irq_work.h>
 #include <asm/atomic.h>
 #include <asm/local.h>
 
@@ -672,11 +673,6 @@ struct perf_buffer {
 	void				*data_pages[0];
 };
 
-struct perf_pending_entry {
-	struct perf_pending_entry *next;
-	void (*func)(struct perf_pending_entry *);
-};
-
 struct perf_sample_data;
 
 typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -784,7 +780,7 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
-	struct perf_pending_entry	pending;
+	struct irq_work			pending;
 
 	atomic_t			event_limit;
 
@@ -898,8 +894,6 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
-extern void set_perf_event_pending(void);
-extern void perf_event_do_pending(void);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -1078,7 +1072,6 @@ static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
-static inline void perf_event_do_pending(void)				{ }
 static inline void perf_event_print_debug(void)				{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
diff --git a/init/Kconfig b/init/Kconfig
index 2de5b1cbadd..1ef0b439908 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -21,6 +21,13 @@ config CONSTRUCTORS
 	depends on !UML
 	default y
 
+config HAVE_IRQ_WORK
+	bool
+
+config IRQ_WORK
+	bool
+	depends on HAVE_IRQ_WORK
+
 menu "General setup"
 
 config EXPERIMENTAL
@@ -987,6 +994,7 @@ config PERF_EVENTS
 	default y if (PROFILING || PERF_COUNTERS)
 	depends on HAVE_PERF_EVENTS
 	select ANON_INODES
+	select IRQ_WORK
 	help
 	  Enable kernel support for various performance events provided
 	  by software and hardware.
diff --git a/kernel/Makefile b/kernel/Makefile
index d52b473c99a..4d9bf5f8531 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -100,6 +101,7 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 00000000000..f16763ff848
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Provides a framework for enqueueing and running callbacks from hardirq
+ * context. The enqueueing is NMI-safe.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+
+/*
+ * An entry can be in one of four states:
+ *
+ * free	     NULL, 0 -> {claimed}       : free to be used
+ * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
+ * pending   next, 3 -> {busy}          : queued, pending callback
+ * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ *
+ * We use the lower two bits of the next pointer to keep PENDING and BUSY
+ * flags.
+ */
+
+#define IRQ_WORK_PENDING	1UL
+#define IRQ_WORK_BUSY		2UL
+#define IRQ_WORK_FLAGS		3UL
+
+static inline bool irq_work_is_set(struct irq_work *entry, int flags)
+{
+	return (unsigned long)entry->next & flags;
+}
+
+static inline struct irq_work *irq_work_next(struct irq_work *entry)
+{
+	unsigned long next = (unsigned long)entry->next;
+	next &= ~IRQ_WORK_FLAGS;
+	return (struct irq_work *)next;
+}
+
+static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
+{
+	unsigned long next = (unsigned long)entry;
+	next |= flags;
+	return (struct irq_work *)next;
+}
+
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+
+/*
+ * Claim the entry so that no one else will poke at it.
+ */
+static bool irq_work_claim(struct irq_work *entry)
+{
+	struct irq_work *next, *nflags;
+
+	do {
+		next = entry->next;
+		if ((unsigned long)next & IRQ_WORK_PENDING)
+			return false;
+		nflags = next_flags(next, IRQ_WORK_FLAGS);
+	} while (cmpxchg(&entry->next, next, nflags) != next);
+
+	return true;
+}
+
+
+void __weak arch_irq_work_raise(void)
+{
+	/*
+	 * Lame architectures will get the timer tick callback
+	 */
+}
+
+/*
+ * Queue the entry and raise the IPI if needed.
+ */
+static void __irq_work_queue(struct irq_work *entry)
+{
+	struct irq_work **head, *next;
+
+	head = &get_cpu_var(irq_work_list);
+
+	do {
+		next = *head;
+		/* Can assign non-atomic because we keep the flags set. */
+		entry->next = next_flags(next, IRQ_WORK_FLAGS);
+	} while (cmpxchg(head, next, entry) != next);
+
+	/* The list was empty, raise self-interrupt to start processing. */
+	if (!irq_work_next(entry))
+		arch_irq_work_raise();
+
+	put_cpu_var(irq_work_list);
+}
+
+/*
+ * Enqueue the irq_work @entry, returns true on success, failure when the
+ * @entry was already enqueued by someone else.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue(struct irq_work *entry)
+{
+	if (!irq_work_claim(entry)) {
+		/*
+		 * Already enqueued, can't do!
+		 */
+		return false;
+	}
+
+	__irq_work_queue(entry);
+	return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue);
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+	struct irq_work *list, **head;
+
+	head = &__get_cpu_var(irq_work_list);
+	if (*head == NULL)
+		return;
+
+	BUG_ON(!in_irq());
+	BUG_ON(!irqs_disabled());
+
+	list = xchg(head, NULL);
+	while (list != NULL) {
+		struct irq_work *entry = list;
+
+		list = irq_work_next(list);
+
+		/*
+		 * Clear the PENDING bit, after this point the @entry
+		 * can be re-used.
+		 */
+		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
+		entry->func(entry);
+		/*
+		 * Clear the BUSY bit and return to the free state if
+		 * no-one else claimed it meanwhile.
+		 */
+		cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
+	}
+}
+EXPORT_SYMBOL_GPL(irq_work_run);
+
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ */
+void irq_work_sync(struct irq_work *entry)
+{
+	WARN_ON_ONCE(irqs_disabled());
+
+	while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+		cpu_relax();
+}
+EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 634f86a4b2f..99b9700e74d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2206,12 +2206,11 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static void perf_pending_sync(struct perf_event *event);
 static void perf_buffer_put(struct perf_buffer *buffer);
 
 static void free_event(struct perf_event *event)
 {
-	perf_pending_sync(event);
+	irq_work_sync(&event->pending);
 
 	if (!event->parent) {
 		atomic_dec(&nr_events);
@@ -3162,16 +3161,7 @@ void perf_event_wakeup(struct perf_event *event)
 	}
 }
 
-/*
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-
-static void perf_pending_event(struct perf_pending_entry *entry)
+static void perf_pending_event(struct irq_work *entry)
 {
 	struct perf_event *event = container_of(entry,
 			struct perf_event, pending);
@@ -3187,89 +3177,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
 	}
 }
 
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
-	PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
-			       void (*func)(struct perf_pending_entry *))
-{
-	struct perf_pending_entry **head;
-
-	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
-		return;
-
-	entry->func = func;
-
-	head = &get_cpu_var(perf_pending_head);
-
-	do {
-		entry->next = *head;
-	} while (cmpxchg(head, entry->next, entry) != entry->next);
-
-	set_perf_event_pending();
-
-	put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
-	struct perf_pending_entry *list;
-	int nr = 0;
-
-	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
-	while (list != PENDING_TAIL) {
-		void (*func)(struct perf_pending_entry *);
-		struct perf_pending_entry *entry = list;
-
-		list = list->next;
-
-		func = entry->func;
-		entry->next = NULL;
-		/*
-		 * Ensure we observe the unqueue before we issue the wakeup,
-		 * so that we won't be waiting forever.
-		 * -- see perf_not_pending().
-		 */
-		smp_wmb();
-
-		func(entry);
-		nr++;
-	}
-
-	return nr;
-}
-
-static inline int perf_not_pending(struct perf_event *event)
-{
-	/*
-	 * If we flush on whatever cpu we run, there is a chance we don't
-	 * need to wait.
-	 */
-	get_cpu();
-	__perf_pending_run();
-	put_cpu();
-
-	/*
-	 * Ensure we see the proper queue state before going to sleep
-	 * so that we do not miss the wakeup. -- see perf_pending_handle()
-	 */
-	smp_rmb();
-	return event->pending.next == NULL;
-}
-
-static void perf_pending_sync(struct perf_event *event)
-{
-	wait_event(event->waitq, perf_not_pending(event));
-}
-
-void perf_event_do_pending(void)
-{
-	__perf_pending_run();
-}
-
 /*
  * We assume there is only KVM supporting the callbacks.
  * Later on, we might change it to a list if there is
@@ -3319,8 +3226,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 
 	if (handle->nmi) {
 		handle->event->pending_wakeup = 1;
-		perf_pending_queue(&handle->event->pending,
-				   perf_pending_event);
+		irq_work_queue(&handle->event->pending);
 	} else
 		perf_event_wakeup(handle->event);
 }
@@ -4356,8 +4262,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 		event->pending_kill = POLL_HUP;
 		if (nmi) {
 			event->pending_disable = 1;
-			perf_pending_queue(&event->pending,
-					   perf_pending_event);
+			irq_work_queue(&event->pending);
 		} else
 			perf_event_disable(event);
 	}
@@ -5374,6 +5279,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
 	init_waitqueue_head(&event->waitq);
+	init_irq_work(&event->pending, perf_pending_event);
 
 	mutex_init(&event->mmap_mutex);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade..68a9ae7679b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
 	run_local_timers();
 	rcu_check_callbacks(cpu, user_tick);
 	printk_tick();
-	perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+	if (in_irq())
+		irq_work_run();
+#endif
 	scheduler_tick();
 	run_posix_cpu_timers(p);
 }
-- 
cgit v1.2.3-70-g09d2


From 74c3337c2fc6389d3a57a622a936036b6db6b2e8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 Oct 2010 11:40:29 +0200
Subject: perf: Fix group moving

Matt found we trigger the WARN_ON_ONCE() in perf_group_attach() when we take
the move_group path in perf_event_open().

Since we cannot de-construct the group (we rely on it to move the events), we
have to simply ignore the double attach. The group state is context invariant
and doesn't need changing.

Reported-by: Matt Fleming <matt@console-pimps.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1287135757.29097.1368.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 99b9700e74d..346dc0e35a0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -315,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
 {
 	struct perf_event *group_leader = event->group_leader;
 
-	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+	/*
+	 * We can have double attach due to group movement in perf_event_open.
+	 */
+	if (event->attach_state & PERF_ATTACH_GROUP)
+		return;
+
 	event->attach_state |= PERF_ATTACH_GROUP;
 
 	if (group_leader == event)
-- 
cgit v1.2.3-70-g09d2


From e7d0bc047548d76feee6b23f7d3d9da927189a50 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Oct 2010 16:54:51 +0200
Subject: perf: Fix task refcount bugs

Currently it looks like find_lively_task_by_vpid() takes a task ref
and relies on find_get_context() to drop it.

The problem is that perf_event_create_kernel_counter() shouldn't be
dropping task refs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Matt Helsley <matthltc@us.ibm.com>
LKML-Reference: <20101014203625.278436085@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 346dc0e35a0..f928878a1c1 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2190,11 +2190,9 @@ retry:
 		}
 	}
 
-	put_task_struct(task);
 	return ctx;
 
 errout:
-	put_task_struct(task);
 	return ERR_PTR(err);
 }
 
@@ -5602,7 +5600,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	ctx = find_get_context(pmu, task, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
-		goto err_group_fd;
+		goto err_task;
 	}
 
 	/*
@@ -5698,6 +5696,9 @@ SYSCALL_DEFINE5(perf_event_open,
 
 err_context:
 	put_ctx(ctx);
+err_task:
+	if (task)
+		put_task_struct(task);
 err_group_fd:
 	fput_light(group_file, fput_needed);
 	free_event(event);
-- 
cgit v1.2.3-70-g09d2


From c6be5a5cb62592d9d661899a2aa78236eb00ffa5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Oct 2010 16:59:46 +0200
Subject: perf: Find task before event alloc

So that we can pass the task pointer to the event allocation, so that
we can use task associated data during event initialization.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101014203625.340789919@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f928878a1c1..b21d06aaef6 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5551,10 +5551,18 @@ SYSCALL_DEFINE5(perf_event_open,
 			group_leader = NULL;
 	}
 
+	if (pid != -1) {
+		task = find_lively_task_by_vpid(pid);
+		if (IS_ERR(task)) {
+			err = PTR_ERR(task);
+			goto err_group_fd;
+		}
+	}
+
 	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
-		goto err_fd;
+		goto err_task;
 	}
 
 	/*
@@ -5586,21 +5594,13 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
-	if (pid != -1) {
-		task = find_lively_task_by_vpid(pid);
-		if (IS_ERR(task)) {
-			err = PTR_ERR(task);
-			goto err_group_fd;
-		}
-	}
-
 	/*
 	 * Get the target context (task or percpu):
 	 */
 	ctx = find_get_context(pmu, task, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
-		goto err_task;
+		goto err_alloc;
 	}
 
 	/*
@@ -5696,12 +5696,13 @@ SYSCALL_DEFINE5(perf_event_open,
 
 err_context:
 	put_ctx(ctx);
+err_alloc:
+	free_event(event);
 err_task:
 	if (task)
 		put_task_struct(task);
 err_group_fd:
 	fput_light(group_file, fput_needed);
-	free_event(event);
 err_fd:
 	put_unused_fd(event_fd);
 	return err;
-- 
cgit v1.2.3-70-g09d2


From d580ff8699e8811a9af37e9de4dea375401bdeec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Oct 2010 17:43:23 +0200
Subject: perf, hw_breakpoint: Fix crash in hw_breakpoint creation

hw_breakpoint creation needs to account stuff per-task to ensure there
is always sufficient hardware resources to back these things due to
ptrace.

With the perf per pmu context changes the event initialization no
longer has access to the event context, for the simple reason that we
need to first find the pmu (result of initialization) before we can
find the context.

This makes hw_breakpoints unhappy, because it can no longer do per
task accounting, cure this by frobbing a task pointer in the event::hw
bits for now...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20101014203625.391543667@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  7 +++++++
 kernel/hw_breakpoint.c     |  8 ++++----
 kernel/perf_event.c        | 23 ++++++++++++++++++-----
 3 files changed, 29 insertions(+), 9 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2ebfc9ae475..97965fac55f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -536,6 +536,12 @@ struct hw_perf_event {
 		struct { /* breakpoint */
 			struct arch_hw_breakpoint	info;
 			struct list_head		bp_list;
+			/*
+			 * Crufty hack to avoid the chicken and egg
+			 * problem hw_breakpoint has with context
+			 * creation and event initalization.
+			 */
+			struct task_struct		*bp_target;
 		};
 #endif
 	};
@@ -693,6 +699,7 @@ struct swevent_hlist {
 
 #define PERF_ATTACH_CONTEXT	0x01
 #define PERF_ATTACH_GROUP	0x02
+#define PERF_ATTACH_TASK	0x04
 
 /**
  * struct perf_event - performance event kernel representation:
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 3b714e839c1..2c9120f0afc 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
 {
-	struct perf_event_context *ctx = bp->ctx;
+	struct task_struct *tsk = bp->hw.bp_target;
 	struct perf_event *iter;
 	int count = 0;
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->ctx == ctx && find_slot_idx(iter) == type)
+		if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
 			count += hw_breakpoint_weight(iter);
 	}
 
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		    enum bp_type_idx type)
 {
 	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->ctx->task;
+	struct task_struct *tsk = bp->hw.bp_target;
 
 	if (cpu >= 0) {
 		slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	       int weight)
 {
 	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->ctx->task;
+	struct task_struct *tsk = bp->hw.bp_target;
 
 	/* Pinned counter cpu profiling */
 	if (!tsk) {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b21d06aaef6..856e20baf13 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5255,9 +5255,10 @@ unlock:
  */
 static struct perf_event *
 perf_event_alloc(struct perf_event_attr *attr, int cpu,
-		   struct perf_event *group_leader,
-		   struct perf_event *parent_event,
-		   perf_overflow_handler_t overflow_handler)
+		 struct task_struct *task,
+		 struct perf_event *group_leader,
+		 struct perf_event *parent_event,
+		 perf_overflow_handler_t overflow_handler)
 {
 	struct pmu *pmu;
 	struct perf_event *event;
@@ -5299,6 +5300,17 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (task) {
+		event->attach_state = PERF_ATTACH_TASK;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+		/*
+		 * hw_breakpoint is a bit difficult here..
+		 */
+		if (attr->type == PERF_TYPE_BREAKPOINT)
+			event->hw.bp_target = task;
+#endif
+	}
+
 	if (!overflow_handler && parent_event)
 		overflow_handler = parent_event->overflow_handler;
 	
@@ -5559,7 +5571,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
-	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
+	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err_task;
@@ -5728,7 +5740,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 * Get the target context (task or percpu):
 	 */
 
-	event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
+	event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err;
@@ -5996,6 +6008,7 @@ inherit_event(struct perf_event *parent_event,
 
 	child_event = perf_event_alloc(&parent_event->attr,
 					   parent_event->cpu,
+					   child,
 					   group_leader, parent_event,
 					   NULL);
 	if (IS_ERR(child_event))
-- 
cgit v1.2.3-70-g09d2


From 82cd6def9806dcb6a325fb6abbc1d61388a15f6a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Oct 2010 17:57:23 +0200
Subject: perf: Use jump_labels to optimize the scheduler hooks

Trades a call + conditional + ret for an unconditional jmp.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101014203625.501657727@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 27 +++++++++++++++++++++++++--
 kernel/perf_event.c        | 24 +++++++++---------------
 2 files changed, 34 insertions(+), 17 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 97965fac55f..7f0e7f52af8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -487,6 +487,7 @@ struct perf_guest_info_callbacks {
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
 #include <linux/irq_work.h>
+#include <linux/jump_label_ref.h>
 #include <asm/atomic.h>
 #include <asm/local.h>
 
@@ -895,8 +896,30 @@ extern void perf_pmu_unregister(struct pmu *pmu);
 
 extern int perf_num_counters(void);
 extern const char *perf_pmu_name(void);
-extern void perf_event_task_sched_in(struct task_struct *task);
-extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
+extern void __perf_event_task_sched_in(struct task_struct *task);
+extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
+
+extern atomic_t perf_task_events;
+
+static inline void perf_event_task_sched_in(struct task_struct *task)
+{
+	JUMP_LABEL(&perf_task_events, have_events);
+	return;
+
+have_events:
+	__perf_event_task_sched_in(task);
+}
+
+static inline
+void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
+{
+	JUMP_LABEL(&perf_task_events, have_events);
+	return;
+
+have_events:
+	__perf_event_task_sched_out(task, next);
+}
+
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 856e20baf13..f7febb02ab9 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -34,7 +34,7 @@
 
 #include <asm/irq_regs.h>
 
-static atomic_t nr_events __read_mostly;
+atomic_t perf_task_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
@@ -1311,8 +1311,8 @@ void perf_event_context_sched_out(struct task_struct *task, int ctxn,
  * accessing the event control register. If a NMI hits, then it will
  * not restart the event.
  */
-void perf_event_task_sched_out(struct task_struct *task,
-			       struct task_struct *next)
+void __perf_event_task_sched_out(struct task_struct *task,
+				 struct task_struct *next)
 {
 	int ctxn;
 
@@ -1337,14 +1337,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
 	cpuctx->task_ctx = NULL;
 }
 
-/*
- * Called with IRQs disabled
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
-{
-	task_ctx_sched_out(ctx, EVENT_ALL);
-}
-
 /*
  * Called with IRQs disabled
  */
@@ -1494,7 +1486,7 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
  * accessing the event control register. If a NMI hits, then it will
  * keep the event running.
  */
-void perf_event_task_sched_in(struct task_struct *task)
+void __perf_event_task_sched_in(struct task_struct *task)
 {
 	struct perf_event_context *ctx;
 	int ctxn;
@@ -2216,7 +2208,8 @@ static void free_event(struct perf_event *event)
 	irq_work_sync(&event->pending);
 
 	if (!event->parent) {
-		atomic_dec(&nr_events);
+		if (event->attach_state & PERF_ATTACH_TASK)
+			jump_label_dec(&perf_task_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5354,7 +5347,8 @@ done:
 	event->pmu = pmu;
 
 	if (!event->parent) {
-		atomic_inc(&nr_events);
+		if (event->attach_state & PERF_ATTACH_TASK)
+			jump_label_inc(&perf_task_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5849,7 +5843,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 * our context.
 	 */
 	child_ctx = child->perf_event_ctxp[ctxn];
-	__perf_event_task_sched_out(child_ctx);
+	task_ctx_sched_out(child_ctx, EVENT_ALL);
 
 	/*
 	 * Take the context lock here so that if find_get_context is
-- 
cgit v1.2.3-70-g09d2


From 7e54a5a0b655734326dc78c2b5efc1eb35497bb6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Oct 2010 22:32:45 +0200
Subject: perf: Optimize sw events

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 20 +++++++++++---------
 kernel/perf_event.c        |  4 ++--
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7f0e7f52af8..3b80cbf509e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1012,18 +1012,20 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
-static inline void
+static __always_inline void
 perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 {
-	if (atomic_read(&perf_swevent_enabled[event_id])) {
-		struct pt_regs hot_regs;
-
-		if (!regs) {
-			perf_fetch_caller_regs(&hot_regs);
-			regs = &hot_regs;
-		}
-		__perf_sw_event(event_id, nr, nmi, regs, addr);
+	struct pt_regs hot_regs;
+
+	JUMP_LABEL(&perf_swevent_enabled[event_id], have_event);
+	return;
+
+have_event:
+	if (!regs) {
+		perf_fetch_caller_regs(&hot_regs);
+		regs = &hot_regs;
 	}
+	__perf_sw_event(event_id, nr, nmi, regs, addr);
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f7febb02ab9..05ecf6f7c67 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4669,7 +4669,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
 	WARN_ON(event->parent);
 
-	atomic_dec(&perf_swevent_enabled[event_id]);
+	jump_label_dec(&perf_swevent_enabled[event_id]);
 	swevent_hlist_put(event);
 }
 
@@ -4699,7 +4699,7 @@ static int perf_swevent_init(struct perf_event *event)
 		if (err)
 			return err;
 
-		atomic_inc(&perf_swevent_enabled[event_id]);
+		jump_label_inc(&perf_swevent_enabled[event_id]);
 		event->destroy = sw_perf_event_destroy;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 9ffcfa6f1f63eeac15555b745c292eb9f59130f6 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 20 Oct 2010 15:25:01 +0200
Subject: perf_events: Revert: Fix transaction recovery in group_sched_in()

This patch reverts commit 8e5fc1a (perf_events: Fix transaction
recovery in group_sched_in()) because it had one flaw in case the
group could never be scheduled. It would cause time_enabled to get
negative.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4cbeeeb7.0aefd80a.6e40.0e2f@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 76 +++++++++--------------------------------------------
 1 file changed, 13 insertions(+), 63 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f309e8014c7..39afdb07d75 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event)
 	return event->cpu == -1 || event->cpu == smp_processor_id();
 }
 
-static int
-__event_sched_out(struct perf_event *event,
+static void
+event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
@@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event,
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
-		return 0;
+		return;
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
+	event->tstamp_stopped = ctx->time;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
@@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event,
 	ctx->nr_active--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
-	return 1;
-}
-
-static void
-event_sched_out(struct perf_event *event,
-		  struct perf_cpu_context *cpuctx,
-		  struct perf_event_context *ctx)
-{
-	int ret;
-
-	ret = __event_sched_out(event, cpuctx, ctx);
-	if (ret)
-		event->tstamp_stopped = ctx->time;
 }
 
 static void
@@ -664,7 +652,7 @@ retry:
 }
 
 static int
-__event_sched_in(struct perf_event *event,
+event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
@@ -684,6 +672,8 @@ __event_sched_in(struct perf_event *event,
 		return -EAGAIN;
 	}
 
+	event->tstamp_running += ctx->time - event->tstamp_stopped;
+
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -694,35 +684,6 @@ __event_sched_in(struct perf_event *event,
 	return 0;
 }
 
-static inline int
-event_sched_in(struct perf_event *event,
-		 struct perf_cpu_context *cpuctx,
-		 struct perf_event_context *ctx)
-{
-	int ret = __event_sched_in(event, cpuctx, ctx);
-	if (ret)
-		return ret;
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
-	return 0;
-}
-
-static void
-group_commit_event_sched_in(struct perf_event *group_event,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx)
-{
-	struct perf_event *event;
-	u64 now = ctx->time;
-
-	group_event->tstamp_running += now - group_event->tstamp_stopped;
-	/*
-	 * Schedule in siblings as one group (if any):
-	 */
-	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		event->tstamp_running += now - event->tstamp_stopped;
-	}
-}
-
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
@@ -736,13 +697,7 @@ group_sched_in(struct perf_event *group_event,
 
 	pmu->start_txn(pmu);
 
-	/*
-	 * use __event_sched_in() to delay updating tstamp_running
-	 * until the transaction is committed. In case of failure
-	 * we will keep an unmodified tstamp_running which is a
-	 * requirement to get correct timing information
-	 */
-	if (__event_sched_in(group_event, cpuctx, ctx)) {
+	if (event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
@@ -751,31 +706,26 @@ group_sched_in(struct perf_event *group_event,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (__event_sched_in(event, cpuctx, ctx)) {
+		if (event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
 	}
 
-	if (!pmu->commit_txn(pmu)) {
-		/* commit tstamp_running */
-		group_commit_event_sched_in(group_event, cpuctx, ctx);
+	if (!pmu->commit_txn(pmu))
 		return 0;
-	}
+
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
-	 *
-	 * use __event_sched_out() to avoid updating tstamp_stopped
-	 * because the event never actually ran
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
 			break;
-		__event_sched_out(event, cpuctx, ctx);
+		event_sched_out(event, cpuctx, ctx);
 	}
-	__event_sched_out(group_event, cpuctx, ctx);
+	event_sched_out(group_event, cpuctx, ctx);
 
 	pmu->cancel_txn(pmu);
 
-- 
cgit v1.2.3-70-g09d2


From d7842da470f244d258f21c5f72cd8388b3541d04 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 20 Oct 2010 15:25:01 +0200
Subject: perf_events: Fix for transaction recovery in group_sched_in()

This new version (see commit 8e5fc1a) is much simpler and ensures that
in case of error in group_sched_in() during event_sched_in(), the
events up to the failed event go through regular event_sched_out().
But the failed event and the remaining events in the group have their
timings adjusted as if they had also gone through event_sched_in() and
event_sched_out(). This ensures timing uniformity across all events in
a group. This also takes care of the tstamp_stopped problem in case
the group could never be scheduled. The tstamp_stopped is updated as
if the event had actually run.

With this patch, the following now reports correct time_enabled,
in case the NMI watchdog is active:

$ task -e unhalted_core_cycles,instructions_retired,baclears,baclears
noploop 1
noploop for 1 seconds

0 unhalted_core_cycles (100.00% scaling, ena=997,552,872, run=0)
0 instructions_retired (100.00% scaling, ena=997,552,872, run=0)
0 baclears (100.00% scaling, ena=997,552,872, run=0)
0 baclears (100.00% scaling, ena=997,552,872, run=0)

And the older test case also works:

$ task -einstructions_retired,baclears,baclears -e
unhalted_core_cycles,baclears,baclears sleep 5

1680885 instructions_retired (69.39% scaling, ena=950756, run=291006)
  10735 baclears (69.39% scaling, ena=950756, run=291006)
  10735 baclears (69.39% scaling, ena=950756, run=291006)

      0 unhalted_core_cycles (100.00% scaling, ena=817932, run=0)
      0 baclears (100.00% scaling, ena=817932, run=0)
      0 baclears (100.00% scaling, ena=817932, run=0)

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4cbeeebc.8ee7d80a.5a28.0d5f@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 39afdb07d75..517d827f498 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -691,6 +691,8 @@ group_sched_in(struct perf_event *group_event,
 {
 	struct perf_event *event, *partial_group = NULL;
 	struct pmu *pmu = group_event->pmu;
+	u64 now = ctx->time;
+	bool simulate = false;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
@@ -719,11 +721,27 @@ group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
+	 * The events up to the failed event are scheduled out normally,
+	 * tstamp_stopped will be updated.
+	 *
+	 * The failed events and the remaining siblings need to have
+	 * their timings updated as if they had gone thru event_sched_in()
+	 * and event_sched_out(). This is required to get consistent timings
+	 * across the group. This also takes care of the case where the group
+	 * could never be scheduled by ensuring tstamp_stopped is set to mark
+	 * the time the event was actually stopped, such that time delta
+	 * calculation in update_event_times() is correct.
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
-			break;
-		event_sched_out(event, cpuctx, ctx);
+			simulate = true;
+
+		if (simulate) {
+			event->tstamp_running += now - event->tstamp_stopped;
+			event->tstamp_stopped = now;
+		} else {
+			event_sched_out(event, cpuctx, ctx);
+		}
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
-- 
cgit v1.2.3-70-g09d2


From eed01528a45dc4138e9a08064b4b6cc1a9426899 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 26 Oct 2010 16:08:01 +0200
Subject: perf_events: Fix time tracking in samples

This patch corrects time tracking in samples. Without this patch
both time_enabled and time_running are bogus when user asks for
PERF_SAMPLE_READ.

One uses PERF_SAMPLE_READ to sample the values of other counters
in each sample. Because of multiplexing, it is necessary to know
both time_enabled, time_running to be able to scale counts correctly.

In this second version of the patch, we maintain a shadow
copy of ctx->time which allows us to compute ctx->time without
calling update_context_time() from NMI context. We avoid the
issue that update_context_time() must always be called with
ctx->lock held.

We do not keep shadow copies of the other event timings
because if the lead event is overflowing then it is active
and thus it's been scheduled in via event_sched_in() in
which case neither tstamp_stopped, tstamp_running can be modified.

This timing logic only applies to samples when PERF_SAMPLE_READ
is used.

Note that this patch does not address timing issues related
to sampling inheritance between tasks. This will be addressed
in a future patch.

With this patch, the libpfm4 example task_smpl now reports
correct counts (shown on 2.4GHz Core 2):

$ task_smpl -p 2400000000 -e unhalted_core_cycles:u,instructions_retired:u,baclears  noploop 5
noploop for 5 seconds
IIP:0x000000004006d6 PID:5596 TID:5596 TIME:466,210,211,430 STREAM_ID:33 PERIOD:2,400,000,000 ENA=1,010,157,814 RUN=1,010,157,814 NR=3
	2,400,000,254 unhalted_core_cycles:u (33)
	2,399,273,744 instructions_retired:u (34)
	53,340 baclears (35)

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4cc6e14b.1e07e30a.256e.5190@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 10 ++++++++++
 kernel/perf_event.c        | 42 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 44 insertions(+), 8 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 057bf22a832..40150f34598 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -747,6 +747,16 @@ struct perf_event {
 	u64				tstamp_running;
 	u64				tstamp_stopped;
 
+	/*
+	 * timestamp shadows the actual context timing but it can
+	 * be safely used in NMI interrupt context. It reflects the
+	 * context time as it was when the event was last scheduled in.
+	 *
+	 * ctx_time already accounts for ctx->timestamp. Therefore to
+	 * compute ctx_time for a sample, simply add perf_clock().
+	 */
+	u64				shadow_ctx_time;
+
 	struct perf_event_attr		attr;
 	struct hw_perf_event		hw;
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827f498..cb6c0d2af68 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -674,6 +674,8 @@ event_sched_in(struct perf_event *event,
 
 	event->tstamp_running += ctx->time - event->tstamp_stopped;
 
+	event->shadow_ctx_time = ctx->time - ctx->timestamp;
+
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -3396,7 +3398,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 }
 
 static void perf_output_read_one(struct perf_output_handle *handle,
-				 struct perf_event *event)
+				 struct perf_event *event,
+				 u64 enabled, u64 running)
 {
 	u64 read_format = event->attr.read_format;
 	u64 values[4];
@@ -3404,11 +3407,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 
 	values[n++] = perf_event_count(event);
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = event->total_time_enabled +
+		values[n++] = enabled +
 			atomic64_read(&event->child_total_time_enabled);
 	}
 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = event->total_time_running +
+		values[n++] = running +
 			atomic64_read(&event->child_total_time_running);
 	}
 	if (read_format & PERF_FORMAT_ID)
@@ -3421,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
  */
 static void perf_output_read_group(struct perf_output_handle *handle,
-			    struct perf_event *event)
+			    struct perf_event *event,
+			    u64 enabled, u64 running)
 {
 	struct perf_event *leader = event->group_leader, *sub;
 	u64 read_format = event->attr.read_format;
@@ -3431,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	values[n++] = 1 + leader->nr_siblings;
 
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		values[n++] = leader->total_time_enabled;
+		values[n++] = enabled;
 
 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		values[n++] = leader->total_time_running;
+		values[n++] = running;
 
 	if (leader != event)
 		leader->pmu->read(leader);
@@ -3459,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	}
 }
 
+#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
+				 PERF_FORMAT_TOTAL_TIME_RUNNING)
+
 static void perf_output_read(struct perf_output_handle *handle,
 			     struct perf_event *event)
 {
+	u64 enabled = 0, running = 0, now, ctx_time;
+	u64 read_format = event->attr.read_format;
+
+	/*
+	 * compute total_time_enabled, total_time_running
+	 * based on snapshot values taken when the event
+	 * was last scheduled in.
+	 *
+	 * we cannot simply called update_context_time()
+	 * because of locking issue as we are called in
+	 * NMI context
+	 */
+	if (read_format & PERF_FORMAT_TOTAL_TIMES) {
+		now = perf_clock();
+		ctx_time = event->shadow_ctx_time + now;
+		enabled = ctx_time - event->tstamp_enabled;
+		running = ctx_time - event->tstamp_running;
+	}
+
 	if (event->attr.read_format & PERF_FORMAT_GROUP)
-		perf_output_read_group(handle, event);
+		perf_output_read_group(handle, event, enabled, running);
 	else
-		perf_output_read_one(handle, event);
+		perf_output_read_one(handle, event, enabled, running);
 }
 
 void perf_output_sample(struct perf_output_handle *handle,
-- 
cgit v1.2.3-70-g09d2


From 3c502e7a0255d82621ff25d60cc816624830497e Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Thu, 4 Nov 2010 17:33:01 -0500
Subject: perf,hw_breakpoint: Initialize hardware api earlier

When using early debugging, the kernel does not initialize the
hw_breakpoint API early enough and causes the late initialization of
the kernel debugger to fail. The boot arguments are:

    earlyprintk=vga ekgdboc=kbd kgdbwait

Then simply type "go" at the kdb prompt and boot. The kernel will
later emit the message:

    kgdb: Could not allocate hwbreakpoints

And at that point the kernel debugger will cease to work correctly.

The solution is to initialize the hw_breakpoint at the same time that
all the other perf call backs are initialized instead of using a
core_initcall() initialization which happens well after the kernel
debugger can make use of hardware breakpoints.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4CD3396D.1090308@windriver.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/hw_breakpoint.h | 4 ++++
 kernel/hw_breakpoint.c        | 3 +--
 kernel/perf_event.c           | 6 ++++++
 3 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index a2d6ea49ec5..d1e55fed2c7 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -33,6 +33,8 @@ enum bp_type_idx {
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
+extern int __init init_hw_breakpoint(void);
+
 static inline void hw_breakpoint_init(struct perf_event_attr *attr)
 {
 	memset(attr, 0, sizeof(*attr));
@@ -108,6 +110,8 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
 
 #else /* !CONFIG_HAVE_HW_BREAKPOINT */
 
+static inline int __init init_hw_breakpoint(void) { return 0; }
+
 static inline struct perf_event *
 register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_overflow_handler_t triggered,
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2c9120f0afc..e5325825aeb 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = {
 	.read		= hw_breakpoint_pmu_read,
 };
 
-static int __init init_hw_breakpoint(void)
+int __init init_hw_breakpoint(void)
 {
 	unsigned int **task_bp_pinned;
 	int cpu, err_cpu;
@@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void)
 
 	return -ENOMEM;
 }
-core_initcall(init_hw_breakpoint);
 
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827f498..05b7d8c72c6 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,6 +31,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
@@ -6295,6 +6296,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 
 void __init perf_event_init(void)
 {
+	int ret;
+
 	perf_event_init_all_cpus();
 	init_srcu_struct(&pmus_srcu);
 	perf_pmu_register(&perf_swevent);
@@ -6302,4 +6305,7 @@ void __init perf_event_init(void)
 	perf_pmu_register(&perf_task_clock);
 	perf_tp_register();
 	perf_cpu_notifier(perf_cpu_notify);
+
+	ret = init_hw_breakpoint();
+	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 }
-- 
cgit v1.2.3-70-g09d2


From 8882135bcd332f294df5455747ea43ba9e6f77ad Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 9 Nov 2010 19:01:43 +0100
Subject: perf: Fix owner-list vs exit

Oleg noticed that a perf-fd keeping a reference on the creating task
leads to a few funny side effects.

There's two different aspects to this:

  - kernel based perf-events, these should not take out
    a reference on the creating task and appear on the task's
    event list since they're not bound to fds nor visible
    to userspace.

  - fork() and pthread_create(), these can lead to the creating
    task dying (and thus the task's event-list becomming useless)
    but keeping the list and ref alive until the event is closed.

Combined they lead to malfunction of the ptrace hw_tracepoints.

Cure this by not considering kernel based perf_events for the
owner-list and destroying the owner-list when the owner dies.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <1289576883.2084.286.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 63 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 12 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f818d9d2dc9..671f6c8c8a3 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2235,11 +2235,6 @@ int perf_event_release_kernel(struct perf_event *event)
 	raw_spin_unlock_irq(&ctx->lock);
 	mutex_unlock(&ctx->mutex);
 
-	mutex_lock(&event->owner->perf_event_mutex);
-	list_del_init(&event->owner_entry);
-	mutex_unlock(&event->owner->perf_event_mutex);
-	put_task_struct(event->owner);
-
 	free_event(event);
 
 	return 0;
@@ -2252,9 +2247,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 static int perf_release(struct inode *inode, struct file *file)
 {
 	struct perf_event *event = file->private_data;
+	struct task_struct *owner;
 
 	file->private_data = NULL;
 
+	rcu_read_lock();
+	owner = ACCESS_ONCE(event->owner);
+	/*
+	 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
+	 * !owner it means the list deletion is complete and we can indeed
+	 * free this event, otherwise we need to serialize on
+	 * owner->perf_event_mutex.
+	 */
+	smp_read_barrier_depends();
+	if (owner) {
+		/*
+		 * Since delayed_put_task_struct() also drops the last
+		 * task reference we can safely take a new reference
+		 * while holding the rcu_read_lock().
+		 */
+		get_task_struct(owner);
+	}
+	rcu_read_unlock();
+
+	if (owner) {
+		mutex_lock(&owner->perf_event_mutex);
+		/*
+		 * We have to re-check the event->owner field, if it is cleared
+		 * we raced with perf_event_exit_task(), acquiring the mutex
+		 * ensured they're done, and we can proceed with freeing the
+		 * event.
+		 */
+		if (event->owner)
+			list_del_init(&event->owner_entry);
+		mutex_unlock(&owner->perf_event_mutex);
+		put_task_struct(owner);
+	}
+
 	return perf_event_release_kernel(event);
 }
 
@@ -5678,7 +5707,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	mutex_unlock(&ctx->mutex);
 
 	event->owner = current;
-	get_task_struct(current);
+
 	mutex_lock(&current->perf_event_mutex);
 	list_add_tail(&event->owner_entry, &current->perf_event_list);
 	mutex_unlock(&current->perf_event_mutex);
@@ -5746,12 +5775,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
 
-	event->owner = current;
-	get_task_struct(current);
-	mutex_lock(&current->perf_event_mutex);
-	list_add_tail(&event->owner_entry, &current->perf_event_list);
-	mutex_unlock(&current->perf_event_mutex);
-
 	return event;
 
 err_free:
@@ -5902,8 +5925,24 @@ again:
  */
 void perf_event_exit_task(struct task_struct *child)
 {
+	struct perf_event *event, *tmp;
 	int ctxn;
 
+	mutex_lock(&child->perf_event_mutex);
+	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+				 owner_entry) {
+		list_del_init(&event->owner_entry);
+
+		/*
+		 * Ensure the list deletion is visible before we clear
+		 * the owner, closes a race against perf_release() where
+		 * we need to serialize on the owner->perf_event_mutex.
+		 */
+		smp_wmb();
+		event->owner = NULL;
+	}
+	mutex_unlock(&child->perf_event_mutex);
+
 	for_each_task_context_nr(ctxn)
 		perf_event_exit_task_context(child, ctxn);
 }
-- 
cgit v1.2.3-70-g09d2


From dddd3379a619a4cb8247bfd3c94ca9ae3797aa2e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 Nov 2010 10:05:55 +0100
Subject: perf: Fix inherit vs. context rotation bug

It was found that sometimes children of tasks with inherited events had
one extra event. Eventually it turned out to be due to the list rotation
no being exclusive with the list iteration in the inheritance code.

Cure this by temporarily disabling the rotation while we inherit the events.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  1 +
 kernel/perf_event.c        | 22 ++++++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 40150f34598..142e3d6042c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -850,6 +850,7 @@ struct perf_event_context {
 	int				nr_active;
 	int				is_active;
 	int				nr_stat;
+	int				rotate_disable;
 	atomic_t			refcount;
 	struct task_struct		*task;
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 671f6c8c8a3..f365dd8ef8b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1622,8 +1622,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
 {
 	raw_spin_lock(&ctx->lock);
 
-	/* Rotate the first entry last of non-pinned groups */
-	list_rotate_left(&ctx->flexible_groups);
+	/*
+	 * Rotate the first entry last of non-pinned groups. Rotation might be
+	 * disabled by the inheritance code.
+	 */
+	if (!ctx->rotate_disable)
+		list_rotate_left(&ctx->flexible_groups);
 
 	raw_spin_unlock(&ctx->lock);
 }
@@ -6162,6 +6166,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 	struct perf_event *event;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
+	unsigned long flags;
 	int ret = 0;
 
 	child->perf_event_ctxp[ctxn] = NULL;
@@ -6202,6 +6207,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 			break;
 	}
 
+	/*
+	 * We can't hold ctx->lock when iterating the ->flexible_group list due
+	 * to allocations, but we need to prevent rotation because
+	 * rotate_ctx() will change the list from interrupt context.
+	 */
+	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+	parent_ctx->rotate_disable = 1;
+	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
 	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
 		ret = inherit_task_group(event, parent, parent_ctx,
 					 child, ctxn, &inherited_all);
@@ -6209,6 +6223,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 			break;
 	}
 
+	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+	parent_ctx->rotate_disable = 0;
+	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
 	child_ctx = child->perf_event_ctxp[ctxn];
 
 	if (child_ctx && inherited_all) {
-- 
cgit v1.2.3-70-g09d2


From ee6dcfa40a50fe12a3ae0fb4d2653c66c3ed6556 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Nov 2010 13:49:04 +0100
Subject: perf: Fix the software context switch counter

Stephane noticed that because the perf_sw_event() call is inside the
perf_event_task_sched_out() call it won't get called unless we
have a per-task counter.

Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 29 +++++++++++++++--------------
 kernel/perf_event.c        |  2 --
 2 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 142e3d6042c..de2c41758e2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -909,20 +909,6 @@ extern int perf_num_counters(void);
 extern const char *perf_pmu_name(void);
 extern void __perf_event_task_sched_in(struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
-
-extern atomic_t perf_task_events;
-
-static inline void perf_event_task_sched_in(struct task_struct *task)
-{
-	COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
-}
-
-static inline
-void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
-{
-	COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
-}
-
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
@@ -1031,6 +1017,21 @@ have_event:
 	__perf_sw_event(event_id, nr, nmi, regs, addr);
 }
 
+extern atomic_t perf_task_events;
+
+static inline void perf_event_task_sched_in(struct task_struct *task)
+{
+	COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
+}
+
+static inline
+void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
+{
+	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+	COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
+}
+
 extern void perf_event_mmap(struct vm_area_struct *vma);
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f365dd8ef8b..eac7e336433 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1287,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
 }
-- 
cgit v1.2.3-70-g09d2


From 5167695753c63444a9e6cbbef136200a16c7a225 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Dec 2010 14:18:20 +0100
Subject: perf: Fix duplicate events with multiple-pmu vs software events

Because the multi-pmu bits can share contexts between struct pmu
instances we could get duplicate events by iterating the pmu list.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  1 +
 kernel/perf_event.c        | 35 +++++++++++++++++++++++++++++------
 2 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'kernel/perf_event.c')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index de2c41758e2..4f1279e105e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -887,6 +887,7 @@ struct perf_cpu_context {
 	int				exclusive;
 	struct list_head		rotation_list;
 	int				jiffies_interval;
+	struct pmu			*active_pmu;
 };
 
 struct perf_output_handle {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eac7e336433..7b870174c56 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3824,6 +3824,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_task_ctx(&cpuctx->ctx, task_event);
 
 		ctx = task_event->task_ctx;
@@ -3959,6 +3961,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 
 		ctxn = pmu->task_ctx_nr;
@@ -4144,6 +4148,8 @@ got_name:
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
 					vma->vm_flags & VM_EXEC);
 
@@ -5145,20 +5151,36 @@ static void *find_pmu_context(int ctxn)
 	return NULL;
 }
 
-static void free_pmu_context(void * __percpu cpu_context)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
 {
-	struct pmu *pmu;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+
+		if (cpuctx->active_pmu == old_pmu)
+			cpuctx->active_pmu = pmu;
+	}
+}
+
+static void free_pmu_context(struct pmu *pmu)
+{
+	struct pmu *i;
 
 	mutex_lock(&pmus_lock);
 	/*
 	 * Like a real lame refcount.
 	 */
-	list_for_each_entry(pmu, &pmus, entry) {
-		if (pmu->pmu_cpu_context == cpu_context)
+	list_for_each_entry(i, &pmus, entry) {
+		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+			update_pmu_context(i, pmu);
 			goto out;
+		}
 	}
 
-	free_percpu(cpu_context);
+	free_percpu(pmu->pmu_cpu_context);
 out:
 	mutex_unlock(&pmus_lock);
 }
@@ -5190,6 +5212,7 @@ int perf_pmu_register(struct pmu *pmu)
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->jiffies_interval = 1;
 		INIT_LIST_HEAD(&cpuctx->rotation_list);
+		cpuctx->active_pmu = pmu;
 	}
 
 got_cpu_context:
@@ -5241,7 +5264,7 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_rcu();
 
 	free_percpu(pmu->pmu_disable_count);
-	free_pmu_context(pmu->pmu_cpu_context);
+	free_pmu_context(pmu);
 }
 
 struct pmu *perf_init_event(struct perf_event *event)
-- 
cgit v1.2.3-70-g09d2


From ce677831a4abd0f9f957c90ac6f6a0d0472bafb4 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sun, 24 Oct 2010 21:50:42 +0200
Subject: perf: Fix off by one in perf_swevent_init()

The perf_swevent_enabled[] array has PERF_COUNT_SW_MAX elements.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101024195041.GT5985@bicker>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/perf_event.c')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7b870174c56..2870feee81d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4719,7 +4719,7 @@ static int perf_swevent_init(struct perf_event *event)
 		break;
 	}
 
-	if (event_id > PERF_COUNT_SW_MAX)
+	if (event_id >= PERF_COUNT_SW_MAX)
 		return -ENOENT;
 
 	if (!event->parent) {
-- 
cgit v1.2.3-70-g09d2