perf events: Fix slow and broken cgroup context switch code

The current cgroup context switch code was incorrect leading to bogus counts. Furthermore, as soon as there was an active cgroup event on a CPU, the context switch cost on that CPU would increase by a significant amount as demonstrated by a simple ping/pong example: $ ./pong Both processes pinned to CPU1, running for 10s 10684.51 ctxsw/s Now start a cgroup perf stat: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 $ ./pong Both processes pinned to CPU1, running for 10s 6674.61 ctxsw/s That's a 37% penalty. Note that pong is not even in the monitored cgroup. The results shown by perf stat are bogus: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 Performance counter stats for 'sleep 100': CPU1 <not counted> cycles test CPU1 16,984,189,138 cycles # 0.000 GHz The second 'cycles' event should report a count @ CPU clock (here 2.4GHz) as it is counting across all cgroups. The patch below fixes the bogus accounting and bypasses any cgroup switches in case the outgoing and incoming tasks are in the same cgroup. With this patch the same test now yields: $ ./pong Both processes pinned to CPU1, running for 10s 10775.30 ctxsw/s Start perf stat with cgroup: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Run pong outside the cgroup: $ /pong Both processes pinned to CPU1, running for 10s 10687.80 ctxsw/s The penalty is now less than 2%. And the results for perf stat are correct: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 <not counted> cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz Now perf stat reports the correct counts for for the non cgroup event. If we run pong inside the cgroup, then we also get the correct counts: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 22,297,726,205 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz 10.001457237 seconds time elapsed Signed-off-by: Stephane Eranian <eranian@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Stephane Eranian <eranian@google.com> 2011-08-25 15:58:03 +0200
committer: Ingo Molnar <mingo@elte.hu> 2011-08-29 12:28:33 +0200
commit: a8d757ef076f0f95f13a918808824058de25b3eb (patch)
tree: 3c1151ef886d9b72d0a7b7b267d9f37c72d5f475 /include
parent: c6a389f123b9f68d605bb7e0f9b32ec1e3e14132 (diff)
1 files changed, 15 insertions, 9 deletions
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 245bafdafd5..c816075c01c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -944,8 +944,10 @@ extern void perf_pmu_unregister(struct pmu *pmu);
 
 extern int perf_num_counters(void);
 extern const char *perf_pmu_name(void);
-extern void __perf_event_task_sched_in(struct task_struct *task);
-extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
+extern void __perf_event_task_sched_in(struct task_struct *prev,
+				       struct task_struct *task);
+extern void __perf_event_task_sched_out(struct task_struct *prev,
+					struct task_struct *next);
 extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
@@ -1059,17 +1061,20 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 
 extern struct jump_label_key perf_sched_events;
 
-static inline void perf_event_task_sched_in(struct task_struct *task)
+static inline void perf_event_task_sched_in(struct task_struct *prev,
+					    struct task_struct *task)
 {
 	if (static_branch(&perf_sched_events))
-		__perf_event_task_sched_in(task);
+		__perf_event_task_sched_in(prev, task);
 }
 
-static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
+static inline void perf_event_task_sched_out(struct task_struct *prev,
+					     struct task_struct *next)
 {
 	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
 
-	__perf_event_task_sched_out(task, next);
+	if (static_branch(&perf_sched_events))
+		__perf_event_task_sched_out(prev, next);
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -1139,10 +1144,11 @@ extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_task_tick(void);
 #else
 static inline void
-perf_event_task_sched_in(struct task_struct *task)			{ }
+perf_event_task_sched_in(struct task_struct *prev,
+			 struct task_struct *task)			{ }
 static inline void
-perf_event_task_sched_out(struct task_struct *task,
-			    struct task_struct *next)			{ }
+perf_event_task_sched_out(struct task_struct *prev,
+			  struct task_struct *next)			{ }
 static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
author	Stephane Eranian <eranian@google.com>	2011-08-25 15:58:03 +0200
committer	Ingo Molnar <mingo@elte.hu>	2011-08-29 12:28:33 +0200
commit	a8d757ef076f0f95f13a918808824058de25b3eb (patch)
tree	3c1151ef886d9b72d0a7b7b267d9f37c72d5f475 /include
parent	c6a389f123b9f68d605bb7e0f9b32ec1e3e14132 (diff)