16 files changed, 2206 insertions, 1103 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be..d52b473c99a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o range.o
+	    async.o range.o jump_label.o
 obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 03120229db2..e2bdf37f9fd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
-#ifdef CONFIG_PERF_EVENTS
-	WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+	perf_event_delayed_put(tsk);
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c7c2aed9e2d..3b714e839c1 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_overflow_handler_t triggered,
 			    struct task_struct *tsk)
 {
-	return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
-						triggered);
+	return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		pevent = per_cpu_ptr(cpu_events, cpu);
-		bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+		bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
 
 		*pevent = bp;
 
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
 	.priority = 0x7fffffff
 };
 
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+	release_bp_slot(event);
+}
+
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+	int err;
+
+	if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+		return -ENOENT;
+
+	err = register_perf_hw_breakpoint(bp);
+	if (err)
+		return err;
+
+	bp->destroy = bp_perf_event_destroy;
+
+	return 0;
+}
+
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+	if (!(flags & PERF_EF_START))
+		bp->hw.state = PERF_HES_STOPPED;
+
+	return arch_install_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+	arch_uninstall_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+	bp->hw.state = 0;
+}
+
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+	bp->hw.state = PERF_HES_STOPPED;
+}
+
+static struct pmu perf_breakpoint = {
+	.task_ctx_nr	= perf_sw_context, /* could eventually get its own */
+
+	.event_init	= hw_breakpoint_event_init,
+	.add		= hw_breakpoint_add,
+	.del		= hw_breakpoint_del,
+	.start		= hw_breakpoint_start,
+	.stop		= hw_breakpoint_stop,
+	.read		= hw_breakpoint_pmu_read,
+};
+
 static int __init init_hw_breakpoint(void)
 {
 	unsigned int **task_bp_pinned;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
 
 	constraints_initialized = 1;
 
+	perf_pmu_register(&perf_breakpoint);
+
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
 
  err_alloc:
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void)
 core_initcall(init_hw_breakpoint);
 
 
-struct pmu perf_ops_bp = {
-	.enable		= arch_install_hw_breakpoint,
-	.disable	= arch_uninstall_hw_breakpoint,
-	.read		= hw_breakpoint_pmu_read,
-};
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 00000000000..7be868bf25c
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
+/*
+ * jump label support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/err.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+#define JUMP_LABEL_HASH_BITS 6
+#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
+static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
+
+/* mutex to protect coming/going of the the jump_label table */
+static DEFINE_MUTEX(jump_label_mutex);
+
+struct jump_label_entry {
+	struct hlist_node hlist;
+	struct jump_entry *table;
+	int nr_entries;
+	/* hang modules off here */
+	struct hlist_head modules;
+	unsigned long key;
+};
+
+struct jump_label_module_entry {
+	struct hlist_node hlist;
+	struct jump_entry *table;
+	int nr_entries;
+	struct module *mod;
+};
+
+static int jump_label_cmp(const void *a, const void *b)
+{
+	const struct jump_entry *jea = a;
+	const struct jump_entry *jeb = b;
+
+	if (jea->key < jeb->key)
+		return -1;
+
+	if (jea->key > jeb->key)
+		return 1;
+
+	return 0;
+}
+
+static void
+sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
+{
+	unsigned long size;
+
+	size = (((unsigned long)stop - (unsigned long)start)
+					/ sizeof(struct jump_entry));
+	sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
+}
+
+static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct jump_label_entry *e;
+	u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+
+	head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (key == e->key)
+			return e;
+	}
+	return NULL;
+}
+
+static struct jump_label_entry *
+add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
+{
+	struct hlist_head *head;
+	struct jump_label_entry *e;
+	u32 hash;
+
+	e = get_jump_label_entry(key);
+	if (e)
+		return ERR_PTR(-EEXIST);
+
+	e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+
+	hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+	head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+	e->key = key;
+	e->table = table;
+	e->nr_entries = nr_entries;
+	INIT_HLIST_HEAD(&(e->modules));
+	hlist_add_head(&e->hlist, head);
+	return e;
+}
+
+static int
+build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
+{
+	struct jump_entry *iter, *iter_begin;
+	struct jump_label_entry *entry;
+	int count;
+
+	sort_jump_label_entries(start, stop);
+	iter = start;
+	while (iter < stop) {
+		entry = get_jump_label_entry(iter->key);
+		if (!entry) {
+			iter_begin = iter;
+			count = 0;
+			while ((iter < stop) &&
+				(iter->key == iter_begin->key)) {
+				iter++;
+				count++;
+			}
+			entry = add_jump_label_entry(iter_begin->key,
+							count, iter_begin);
+			if (IS_ERR(entry))
+				return PTR_ERR(entry);
+		 } else {
+			WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/***
+ * jump_label_update - update jump label text
+ * @key -  key value associated with a a jump label
+ * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
+ *
+ * Will enable/disable the jump for jump label @key, depending on the
+ * value of @type.
+ *
+ */
+
+void jump_label_update(unsigned long key, enum jump_label_type type)
+{
+	struct jump_entry *iter;
+	struct jump_label_entry *entry;
+	struct hlist_node *module_node;
+	struct jump_label_module_entry *e_module;
+	int count;
+
+	mutex_lock(&jump_label_mutex);
+	entry = get_jump_label_entry((jump_label_t)key);
+	if (entry) {
+		count = entry->nr_entries;
+		iter = entry->table;
+		while (count--) {
+			if (kernel_text_address(iter->code))
+				arch_jump_label_transform(iter, type);
+			iter++;
+		}
+		/* eanble/disable jump labels in modules */
+		hlist_for_each_entry(e_module, module_node, &(entry->modules),
+							hlist) {
+			count = e_module->nr_entries;
+			iter = e_module->table;
+			while (count--) {
+				if (kernel_text_address(iter->code))
+					arch_jump_label_transform(iter, type);
+				iter++;
+			}
+		}
+	}
+	mutex_unlock(&jump_label_mutex);
+}
+
+static int addr_conflict(struct jump_entry *entry, void *start, void *end)
+{
+	if (entry->code <= (unsigned long)end &&
+		entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+		return 1;
+
+	return 0;
+}
+
+#ifdef CONFIG_MODULES
+
+static int module_conflict(void *start, void *end)
+{
+	struct hlist_head *head;
+	struct hlist_node *node, *node_next, *module_node, *module_node_next;
+	struct jump_label_entry *e;
+	struct jump_label_module_entry *e_module;
+	struct jump_entry *iter;
+	int i, count;
+	int conflict = 0;
+
+	for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+		head = &jump_label_table[i];
+		hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+			hlist_for_each_entry_safe(e_module, module_node,
+							module_node_next,
+							&(e->modules), hlist) {
+				count = e_module->nr_entries;
+				iter = e_module->table;
+				while (count--) {
+					if (addr_conflict(iter, start, end)) {
+						conflict = 1;
+						goto out;
+					}
+					iter++;
+				}
+			}
+		}
+	}
+out:
+	return conflict;
+}
+
+#endif
+
+/***
+ * jump_label_text_reserved - check if addr range is reserved
+ * @start: start text addr
+ * @end: end text addr
+ *
+ * checks if the text addr located between @start and @end
+ * overlaps with any of the jump label patch addresses. Code
+ * that wants to modify kernel text should first verify that
+ * it does not overlap with any of the jump label addresses.
+ *
+ * returns 1 if there is an overlap, 0 otherwise
+ */
+int jump_label_text_reserved(void *start, void *end)
+{
+	struct jump_entry *iter;
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __start___jump_table;
+	int conflict = 0;
+
+	mutex_lock(&jump_label_mutex);
+	iter = iter_start;
+	while (iter < iter_stop) {
+		if (addr_conflict(iter, start, end)) {
+			conflict = 1;
+			goto out;
+		}
+		iter++;
+	}
+
+	/* now check modules */
+#ifdef CONFIG_MODULES
+	conflict = module_conflict(start, end);
+#endif
+out:
+	mutex_unlock(&jump_label_mutex);
+	return conflict;
+}
+
+static __init int init_jump_label(void)
+{
+	int ret;
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __stop___jump_table;
+	struct jump_entry *iter;
+
+	mutex_lock(&jump_label_mutex);
+	ret = build_jump_label_hashtable(__start___jump_table,
+					 __stop___jump_table);
+	iter = iter_start;
+	while (iter < iter_stop) {
+		arch_jump_label_text_poke_early(iter->code);
+		iter++;
+	}
+	mutex_unlock(&jump_label_mutex);
+	return ret;
+}
+early_initcall(init_jump_label);
+
+#ifdef CONFIG_MODULES
+
+static struct jump_label_module_entry *
+add_jump_label_module_entry(struct jump_label_entry *entry,
+			    struct jump_entry *iter_begin,
+			    int count, struct module *mod)
+{
+	struct jump_label_module_entry *e;
+
+	e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+	e->mod = mod;
+	e->nr_entries = count;
+	e->table = iter_begin;
+	hlist_add_head(&e->hlist, &entry->modules);
+	return e;
+}
+
+static int add_jump_label_module(struct module *mod)
+{
+	struct jump_entry *iter, *iter_begin;
+	struct jump_label_entry *entry;
+	struct jump_label_module_entry *module_entry;
+	int count;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return 0;
+
+	sort_jump_label_entries(mod->jump_entries,
+				mod->jump_entries + mod->num_jump_entries);
+	iter = mod->jump_entries;
+	while (iter < mod->jump_entries + mod->num_jump_entries) {
+		entry = get_jump_label_entry(iter->key);
+		iter_begin = iter;
+		count = 0;
+		while ((iter < mod->jump_entries + mod->num_jump_entries) &&
+			(iter->key == iter_begin->key)) {
+				iter++;
+				count++;
+		}
+		if (!entry) {
+			entry = add_jump_label_entry(iter_begin->key, 0, NULL);
+			if (IS_ERR(entry))
+				return PTR_ERR(entry);
+		}
+		module_entry = add_jump_label_module_entry(entry, iter_begin,
+							   count, mod);
+		if (IS_ERR(module_entry))
+			return PTR_ERR(module_entry);
+	}
+	return 0;
+}
+
+static void remove_jump_label_module(struct module *mod)
+{
+	struct hlist_head *head;
+	struct hlist_node *node, *node_next, *module_node, *module_node_next;
+	struct jump_label_entry *e;
+	struct jump_label_module_entry *e_module;
+	int i;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return;
+
+	for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+		head = &jump_label_table[i];
+		hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+			hlist_for_each_entry_safe(e_module, module_node,
+						  module_node_next,
+						  &(e->modules), hlist) {
+				if (e_module->mod == mod) {
+					hlist_del(&e_module->hlist);
+					kfree(e_module);
+				}
+			}
+			if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
+				hlist_del(&e->hlist);
+				kfree(e);
+			}
+		}
+	}
+}
+
+static int
+jump_label_module_notify(struct notifier_block *self, unsigned long val,
+			 void *data)
+{
+	struct module *mod = data;
+	int ret = 0;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		mutex_lock(&jump_label_mutex);
+		ret = add_jump_label_module(mod);
+		if (ret)
+			remove_jump_label_module(mod);
+		mutex_unlock(&jump_label_mutex);
+		break;
+	case MODULE_STATE_GOING:
+		mutex_lock(&jump_label_mutex);
+		remove_jump_label_module(mod);
+		mutex_unlock(&jump_label_mutex);
+		break;
+	}
+	return ret;
+}
+
+/***
+ * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
+ * @mod: module to patch
+ *
+ * Allow for run-time selection of the optimal nops. Before the module
+ * loads patch these with arch_get_jump_label_nop(), which is specified by
+ * the arch specific jump label code.
+ */
+void jump_label_apply_nops(struct module *mod)
+{
+	struct jump_entry *iter;
+
+	/* if the module doesn't have jump label entries, just return */
+	if (!mod->num_jump_entries)
+		return;
+
+	iter = mod->jump_entries;
+	while (iter < mod->jump_entries + mod->num_jump_entries) {
+		arch_jump_label_text_poke_early(iter->code);
+		iter++;
+	}
+}
+
+struct notifier_block jump_label_module_nb = {
+	.notifier_call = jump_label_module_notify,
+	.priority = 0,
+};
+
+static __init int init_jump_label_module(void)
+{
+	return register_module_notifier(&jump_label_module_nb);
+}
+early_initcall(init_jump_label_module);
+
+#endif /* CONFIG_MODULES */
+
+#endif
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae9..ec4210c6501 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
 #include <linux/memory.h>
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
+#include <linux/jump_label.h>
 
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p)
  * Return an optimized kprobe whose optimizing code replaces
  * instructions including addr (exclude breakpoint).
  */
-struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
 {
 	int i;
 	struct kprobe *p = NULL;
@@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
 
 void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 			 struct hlist_head **head, unsigned long *flags)
+__acquires(hlist_lock)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
 	spinlock_t *hlist_lock;
@@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 
 static void __kprobes kretprobe_table_lock(unsigned long hash,
 	unsigned long *flags)
+__acquires(hlist_lock)
 {
 	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
 	spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
 
 void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
 	unsigned long *flags)
+__releases(hlist_lock)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
 	spinlock_t *hlist_lock;
@@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
 	spin_unlock_irqrestore(hlist_lock, *flags);
 }
 
-void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+static void __kprobes kretprobe_table_unlock(unsigned long hash,
+       unsigned long *flags)
+__releases(hlist_lock)
 {
 	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
 	spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p)
 	preempt_disable();
 	if (!kernel_text_address((unsigned long) p->addr) ||
 	    in_kprobes_functions((unsigned long) p->addr) ||
-	    ftrace_text_reserved(p->addr, p->addr)) {
+	    ftrace_text_reserved(p->addr, p->addr) ||
+	    jump_label_text_reserved(p->addr, p->addr)) {
 		preempt_enable();
 		return -EINVAL;
 	}
@@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
 	if (num <= 0)
 		return -EINVAL;
 	for (i = 0; i < num; i++) {
-		unsigned long addr;
+		unsigned long addr, offset;
 		jp = jps[i];
 		addr = arch_deref_entry_point(jp->entry);
 
-		if (!kernel_text_address(addr))
-			ret = -EINVAL;
-		else {
-			/* Todo: Verify probepoint is a function entry point */
+		/* Verify probepoint is a function entry point */
+		if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
+		    offset == 0) {
 			jp->kp.pre_handler = setjmp_pre_handler;
 			jp->kp.break_handler = longjmp_break_handler;
 			ret = register_kprobe(&jp->kp);
-		}
+		} else
+			ret = -EINVAL;
+
 		if (ret < 0) {
 			if (i > 0)
 				unregister_jprobes(jps, i);
diff --git a/kernel/module.c b/kernel/module.c
index d0b5f8db11b..eba134157ef 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
 #include <linux/async.h>
 #include <linux/percpu.h>
 #include <linux/kmemleak.h>
+#include <linux/jump_label.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
@@ -2308,6 +2309,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
 					sizeof(*mod->tracepoints),
 					&mod->num_tracepoints);
 #endif
+#ifdef HAVE_JUMP_LABEL
+	mod->jump_entries = section_objs(info, "__jump_table",
+					sizeof(*mod->jump_entries),
+					&mod->num_jump_entries);
+#endif
 #ifdef CONFIG_EVENT_TRACING
 	mod->trace_events = section_objs(info, "_ftrace_events",
 					 sizeof(*mod->trace_events),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index db5b5606468..64507eaa2d9 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
-/*
- * Each CPU has a list of per CPU events:
- */
-static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-
-int perf_max_events __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-
 static atomic_t nr_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
 /*
  * perf event paranoia level:
  *  -1 - not paranoid at all
@@ -67,36 +61,38 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
 
 static atomic64_t perf_event_id;
 
-/*
- * Lock for (sysadmin-configurable) event reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
+void __weak perf_event_print_debug(void)	{ }
 
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+void perf_pmu_disable(struct pmu *pmu)
 {
-	return NULL;
+	int *count = this_cpu_ptr(pmu->pmu_disable_count);
+	if (!(*count)++)
+		pmu->pmu_disable(pmu);
 }
 
-void __weak hw_perf_disable(void)		{ barrier(); }
-void __weak hw_perf_enable(void)		{ barrier(); }
-
-void __weak perf_event_print_debug(void)	{ }
-
-static DEFINE_PER_CPU(int, perf_disable_count);
-
-void perf_disable(void)
+void perf_pmu_enable(struct pmu *pmu)
 {
-	if (!__get_cpu_var(perf_disable_count)++)
-		hw_perf_disable();
+	int *count = this_cpu_ptr(pmu->pmu_disable_count);
+	if (!--(*count))
+		pmu->pmu_enable(pmu);
 }
 
-void perf_enable(void)
+static DEFINE_PER_CPU(struct list_head, rotation_list);
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_pmu_rotate_start(struct pmu *pmu)
 {
-	if (!--__get_cpu_var(perf_disable_count))
-		hw_perf_enable();
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+	struct list_head *head = &__get_cpu_var(rotation_list);
+
+	WARN_ON(!irqs_disabled());
+
+	if (list_empty(&cpuctx->rotation_list))
+		list_add(&cpuctx->rotation_list, head);
 }
 
 static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +147,13 @@ static u64 primary_event_id(struct perf_event *event)
  * the context could get moved to another task.
  */
 static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 {
 	struct perf_event_context *ctx;
 
 	rcu_read_lock();
- retry:
-	ctx = rcu_dereference(task->perf_event_ctxp);
+retry:
+	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
 	if (ctx) {
 		/*
 		 * If this context is a clone of another, it might
@@ -170,7 +166,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 		 * can't get swapped on us any more.
 		 */
 		raw_spin_lock_irqsave(&ctx->lock, *flags);
-		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
 			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 			goto retry;
 		}
@@ -189,12 +185,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
  * can't get swapped to another task.  This also increments its
  * reference count so that the context can't get freed.
  */
-static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
 {
 	struct perf_event_context *ctx;
 	unsigned long flags;
 
-	ctx = perf_lock_task_context(task, &flags);
+	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		++ctx->pin_count;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +299,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	}
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
+	if (!ctx->nr_events)
+		perf_pmu_rotate_start(ctx->pmu);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
@@ -436,7 +435,7 @@ event_sched_out(struct perf_event *event,
 		event->state = PERF_EVENT_STATE_OFF;
 	}
 	event->tstamp_stopped = ctx->time;
-	event->pmu->disable(event);
+	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
 	if (!is_software_event(event))
@@ -466,6 +465,12 @@ group_sched_out(struct perf_event *group_event,
 		cpuctx->exclusive = 0;
 }
 
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -474,9 +479,9 @@ group_sched_out(struct perf_event *group_event,
  */
 static void __perf_event_remove_from_context(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -487,27 +492,11 @@ static void __perf_event_remove_from_context(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * events on a global level.
-	 */
-	perf_disable();
 
 	event_sched_out(event, cpuctx, ctx);
 
 	list_del_event(event, ctx);
 
-	if (!ctx->task) {
-		/*
-		 * Allow more per task events with respect to the
-		 * reservation:
-		 */
-		cpuctx->max_pertask =
-			min(perf_max_events - ctx->nr_events,
-			    perf_max_events - perf_reserved_percpu);
-	}
-
-	perf_enable();
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -572,8 +561,8 @@ retry:
 static void __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
 	/*
 	 * If this is a per-task event, need to check whether this
@@ -628,7 +617,7 @@ void perf_event_disable(struct perf_event *event)
 		return;
 	}
 
- retry:
+retry:
 	task_oncpu_function_call(task, __perf_event_disable, event);
 
 	raw_spin_lock_irq(&ctx->lock);
@@ -667,7 +656,7 @@ event_sched_in(struct perf_event *event,
 	 */
 	smp_wmb();
 
-	if (event->pmu->enable(event)) {
+	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
 		return -EAGAIN;
@@ -691,22 +680,15 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group = NULL;
-	const struct pmu *pmu = group_event->pmu;
-	bool txn = false;
+	struct pmu *pmu = group_event->pmu;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	/* Check if group transaction availabe */
-	if (pmu->start_txn)
-		txn = true;
-
-	if (txn)
-		pmu->start_txn(pmu);
+	pmu->start_txn(pmu);
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
-		if (txn)
-			pmu->cancel_txn(pmu);
+		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
 
@@ -720,7 +702,7 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!txn || !pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu))
 		return 0;
 
 group_error:
@@ -735,8 +717,7 @@ group_error:
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
-	if (txn)
-		pmu->cancel_txn(pmu);
+	pmu->cancel_txn(pmu);
 
 	return -EAGAIN;
 }
@@ -789,10 +770,10 @@ static void add_event_to_ctx(struct perf_event *event,
  */
 static void __perf_install_in_context(void *info)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
 	/*
@@ -812,12 +793,6 @@ static void __perf_install_in_context(void *info)
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * events on a global level. NOP for non NMI based events.
-	 */
-	perf_disable();
-
 	add_event_to_ctx(event, ctx);
 
 	if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -855,12 +830,7 @@ static void __perf_install_in_context(void *info)
 		}
 	}
 
-	if (!err && !ctx->task && cpuctx->max_pertask)
-		cpuctx->max_pertask--;
-
- unlock:
-	perf_enable();
-
+unlock:
 	raw_spin_unlock(&ctx->lock);
 }
 
@@ -883,6 +853,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
+	event->ctx = ctx;
+
 	if (!task) {
 		/*
 		 * Per cpu events are installed via an smp call and
@@ -931,10 +903,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	event->tstamp_enabled = ctx->time - event->total_time_enabled;
-	list_for_each_entry(sub, &event->sibling_list, group_entry)
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 			sub->tstamp_enabled =
 				ctx->time - sub->total_time_enabled;
+		}
+	}
 }
 
 /*
@@ -943,9 +917,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 static void __perf_event_enable(void *info)
 {
 	struct perf_event *event = info;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);